001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.data.validation.tests; 003 004import static java.util.regex.Pattern.CASE_INSENSITIVE; 005import static java.util.regex.Pattern.UNICODE_CASE; 006import static org.openstreetmap.josm.tools.I18n.tr; 007 008import java.awt.geom.Point2D; 009import java.util.ArrayList; 010import java.util.Arrays; 011import java.util.HashMap; 012import java.util.List; 013import java.util.Locale; 014import java.util.Map; 015import java.util.regex.Matcher; 016import java.util.regex.Pattern; 017 018import org.openstreetmap.josm.data.osm.OsmPrimitive; 019import org.openstreetmap.josm.data.osm.Way; 020import org.openstreetmap.josm.data.validation.Severity; 021import org.openstreetmap.josm.data.validation.Test; 022import org.openstreetmap.josm.data.validation.TestError; 023import org.openstreetmap.josm.data.validation.util.ValUtil; 024import org.openstreetmap.josm.gui.progress.ProgressMonitor; 025import org.openstreetmap.josm.tools.MultiMap; 026 027/** 028 * Checks for similar named ways, symptom of a possible typo. It uses the 029 * Levenshtein distance to check for similarity 030 * 031 * @author frsantos 032 */ 033public class SimilarNamedWays extends Test { 034 035 protected static final int SIMILAR_NAMED = 701; 036 037 /** All ways, grouped by cells */ 038 private Map<Point2D, List<Way>> cellWays; 039 /** The already detected errors */ 040 private MultiMap<Way, Way> errorWays; 041 042 private final List<NormalizeRule> rules = new ArrayList<>(); 043 044 /** 045 * Constructor 046 */ 047 public SimilarNamedWays() { 048 super(tr("Similarly named ways"), 049 tr("This test checks for ways with similar names that may have been misspelled.")); 050 051 // FIXME: hardcode these rules for now. Replace them with preferences later 052 // See https://josm.openstreetmap.de/ticket/3733#comment:19 053 addRegExprRule("\\d+", "0"); // Highway 66 054 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave 055 addRegExprRule("^[A-Z] ", "X"); // E Street 056 addSynonyms("east", "west", "north", "south"); 057 addSynonyms("first", "second", "third"); 058 } 059 060 @Override 061 public void startTest(ProgressMonitor monitor) { 062 super.startTest(monitor); 063 cellWays = new HashMap<>(1000); 064 errorWays = new MultiMap<>(); 065 } 066 067 @Override 068 public void endTest() { 069 cellWays = null; 070 errorWays = null; 071 super.endTest(); 072 } 073 074 @Override 075 public void visit(Way w) { 076 if (!w.isUsable()) 077 return; 078 079 String name = w.get("name"); 080 if (name == null || name.length() < 6) 081 return; 082 083 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays); 084 for (List<Way> ways : theCellWays) { 085 for (Way w2 : ways) { 086 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) { 087 continue; 088 } 089 090 String name2 = w2.get("name"); 091 if (name2 == null || name2.length() < 6) { 092 continue; 093 } 094 095 if (similaryName(name, name2)) { 096 List<OsmPrimitive> primitives = new ArrayList<>(2); 097 primitives.add(w); 098 primitives.add(w2); 099 errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED) 100 .message(tr("Similarly named ways")) 101 .primitives(primitives) 102 .build()); 103 errorWays.put(w, w2); 104 } 105 } 106 ways.add(w); 107 } 108 } 109 110 /** 111 * Compute Levenshtein distance 112 * 113 * @param s First word 114 * @param t Second word 115 * @return The distance between words 116 */ 117 public static int getLevenshteinDistance(String s, String t) { 118 int[][] d; // matrix 119 int n; // length of s 120 int m; // length of t 121 int i; // iterates through s 122 int j; // iterates through t 123 char si; // ith character of s 124 char tj; // jth character of t 125 int cost; // cost 126 127 // Step 1 128 n = s.length(); 129 m = t.length(); 130 if (n == 0) 131 return m; 132 if (m == 0) 133 return n; 134 d = new int[n+1][m+1]; 135 136 // Step 2 137 for (i = 0; i <= n; i++) { 138 d[i][0] = i; 139 } 140 for (j = 0; j <= m; j++) { 141 d[0][j] = j; 142 } 143 144 // Step 3 145 for (i = 1; i <= n; i++) { 146 147 si = s.charAt(i - 1); 148 149 // Step 4 150 for (j = 1; j <= m; j++) { 151 152 tj = t.charAt(j - 1); 153 154 // Step 5 155 if (si == tj) { 156 cost = 0; 157 } else { 158 cost = 1; 159 } 160 161 // Step 6 162 d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost); 163 } 164 } 165 166 // Step 7 167 return d[n][m]; 168 } 169 170 /** 171 * Add a regular expression rule. 172 * @param regExpr the regular expression to search for 173 * @param replacement a string to replace with, which should match the expression. 174 */ 175 public void addRegExprRule(String regExpr, String replacement) { 176 rules.add(new RegExprRule(regExpr, replacement)); 177 } 178 179 /** 180 * Add a rule with synonym words. 181 * @param words words which are synonyms 182 */ 183 public void addSynonyms(String... words) { 184 for (String word : words) { 185 rules.add(new SynonymRule(word, words)); 186 } 187 } 188 189 /** 190 * Check if two names are similar, but not identical. First both names will be "normalized". 191 * Afterwards the Levenshtein distance will be calculated.<br> 192 * Examples for normalization rules:<br> 193 * <code>replaceAll("\\d+", "0")</code><br> 194 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true 195 * @param name first name to compare 196 * @param name2 second name to compare 197 * @return true if the normalized names are different but only a "little bit" 198 */ 199 public boolean similaryName(String name, String name2) { 200 // check plain strings 201 int distance = getLevenshteinDistance(name, name2); 202 boolean similar = distance > 0 && distance <= 2; 203 204 // try all rules 205 for (NormalizeRule rule : rules) { 206 int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); 207 if (levenshteinDistance == 0) 208 // one rule results in identical names: identical 209 return false; 210 else if (levenshteinDistance <= 2) { 211 // 0 < distance <= 2 212 similar = true; 213 } 214 } 215 return similar; 216 } 217 218 @FunctionalInterface 219 public interface NormalizeRule { 220 221 /** 222 * Normalize the string by replacing parts. 223 * @param name name to normalize 224 * @return normalized string 225 */ 226 String normalize(String name); 227 } 228 229 public static class RegExprRule implements NormalizeRule { 230 private final Pattern regExpr; 231 private final String replacement; 232 233 public RegExprRule(String expression, String replacement) { 234 this.regExpr = Pattern.compile(expression); 235 this.replacement = replacement; 236 } 237 238 @Override 239 public String normalize(String name) { 240 return regExpr.matcher(name).replaceAll(replacement); 241 } 242 243 @Override 244 public String toString() { 245 return "replaceAll(" + regExpr + ", " + replacement + ')'; 246 } 247 } 248 249 public static class SynonymRule implements NormalizeRule { 250 251 private final String[] words; 252 private final Pattern regExpr; 253 private final String replacement; 254 255 public SynonymRule(String replacement, String ... words) { 256 this.replacement = replacement.toLowerCase(Locale.ENGLISH); 257 this.words = words; 258 259 // build regular expression for other words (for fast match) 260 StringBuilder expression = new StringBuilder(); 261 int maxLength = 0; 262 for (int i = 0; i < words.length; i++) { 263 if (words[i].length() > maxLength) { 264 maxLength = words[i].length(); 265 } 266 if (expression.length() > 0) { 267 expression.append('|'); 268 } 269 expression.append(Pattern.quote(words[i])); 270 } 271 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); 272 } 273 274 @Override 275 public String normalize(String name) { 276 // find first match 277 Matcher matcher = regExpr.matcher(name); 278 if (!matcher.find()) 279 return name; 280 281 int start = matcher.start(); 282 283 // which word matches? 284 String part = ""; 285 for (int i = 0; i < words.length; i++) { 286 String word = words[i]; 287 part = name.substring(start, start + word.length()); 288 if (word.equalsIgnoreCase(part)) { 289 break; 290 } 291 } 292 293 // replace the word 294 char[] newName = matcher.replaceFirst(replacement).toCharArray(); 295 296 // adjust case (replacement is not shorter than matching word!) 297 int minLength = Math.min(replacement.length(), part.length()); 298 for (int i = 0; i < minLength; i++) { 299 if (Character.isUpperCase(part.charAt(i))) { 300 newName[start + i] = Character.toUpperCase(newName[start + i]); 301 } 302 } 303 304 return new String(newName); 305 } 306 307 @Override 308 public String toString() { 309 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')'; 310 } 311 } 312}