001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.data.validation.tests; 003 004import static java.util.regex.Pattern.CASE_INSENSITIVE; 005import static java.util.regex.Pattern.UNICODE_CASE; 006import static org.openstreetmap.josm.tools.I18n.tr; 007 008import java.awt.geom.Point2D; 009import java.text.Normalizer; 010import java.util.ArrayList; 011import java.util.Arrays; 012import java.util.HashMap; 013import java.util.List; 014import java.util.Locale; 015import java.util.Map; 016import java.util.regex.Matcher; 017import java.util.regex.Pattern; 018 019import org.openstreetmap.josm.data.osm.OsmPrimitive; 020import org.openstreetmap.josm.data.osm.Way; 021import org.openstreetmap.josm.data.validation.Severity; 022import org.openstreetmap.josm.data.validation.Test; 023import org.openstreetmap.josm.data.validation.TestError; 024import org.openstreetmap.josm.data.validation.util.ValUtil; 025import org.openstreetmap.josm.gui.progress.ProgressMonitor; 026import org.openstreetmap.josm.tools.MultiMap; 027 028/** 029 * Checks for similar named ways, symptom of a possible typo. It uses the 030 * Levenshtein distance to check for similarity 031 * 032 * @author frsantos 033 */ 034public class SimilarNamedWays extends Test { 035 036 protected static final int SIMILAR_NAMED = 701; 037 038 private static final Pattern REMOVE_DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); 039 040 /** All ways, grouped by cells */ 041 private Map<Point2D, List<Way>> cellWays; 042 /** The already detected errors */ 043 private MultiMap<Way, Way> errorWays; 044 045 private final List<NormalizeRule> rules = new ArrayList<>(); 046 047 /** 048 * Constructor 049 */ 050 public SimilarNamedWays() { 051 super(tr("Similarly named ways"), 052 tr("This test checks for ways with similar names that may have been misspelled.")); 053 054 // FIXME: hardcode these rules for now. Replace them with preferences later 055 // See https://josm.openstreetmap.de/ticket/3733#comment:19 056 addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers 057 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave 058 addRegExprRule("^[A-Z] ", "X"); // E Street 059 addSynonyms("east", "west", "north", "south"); 060 addSynonyms("first", "second", "third"); 061 } 062 063 @Override 064 public void startTest(ProgressMonitor monitor) { 065 super.startTest(monitor); 066 cellWays = new HashMap<>(1000); 067 errorWays = new MultiMap<>(); 068 } 069 070 @Override 071 public void endTest() { 072 cellWays = null; 073 errorWays = null; 074 super.endTest(); 075 } 076 077 @Override 078 public void visit(Way w) { 079 if (!w.isUsable()) 080 return; 081 082 String name = w.get("name"); 083 if (name == null || name.length() < 6) 084 return; 085 086 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays); 087 for (List<Way> ways : theCellWays) { 088 for (Way w2 : ways) { 089 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) { 090 continue; 091 } 092 093 String name2 = w2.get("name"); 094 if (name2 == null || name2.length() < 6) { 095 continue; 096 } 097 098 if (similaryName(name, name2)) { 099 List<OsmPrimitive> primitives = new ArrayList<>(2); 100 primitives.add(w); 101 primitives.add(w2); 102 errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED) 103 .message(tr("Similarly named ways")) 104 .primitives(primitives) 105 .build()); 106 errorWays.put(w, w2); 107 } 108 } 109 ways.add(w); 110 } 111 } 112 113 /** 114 * Compute Levenshtein distance 115 * 116 * @param s First word 117 * @param t Second word 118 * @return The distance between words 119 */ 120 public static int getLevenshteinDistance(String s, String t) { 121 int[][] d; // matrix 122 int n; // length of s 123 int m; // length of t 124 int i; // iterates through s 125 int j; // iterates through t 126 char si; // ith character of s 127 char tj; // jth character of t 128 int cost; // cost 129 130 // Step 1 131 n = s.length(); 132 m = t.length(); 133 if (n == 0) 134 return m; 135 if (m == 0) 136 return n; 137 d = new int[n+1][m+1]; 138 139 // Step 2 140 for (i = 0; i <= n; i++) { 141 d[i][0] = i; 142 } 143 for (j = 0; j <= m; j++) { 144 d[0][j] = j; 145 } 146 147 // Step 3 148 for (i = 1; i <= n; i++) { 149 150 si = s.charAt(i - 1); 151 152 // Step 4 153 for (j = 1; j <= m; j++) { 154 155 tj = t.charAt(j - 1); 156 157 // Step 5 158 if (si == tj) { 159 cost = 0; 160 } else { 161 cost = 1; 162 } 163 164 // Step 6 165 d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost); 166 } 167 } 168 169 // Step 7 170 return d[n][m]; 171 } 172 173 /** 174 * Add a regular expression rule. 175 * @param regExpr the regular expression to search for 176 * @param replacement a string to replace with, which should match the expression. 177 */ 178 public void addRegExprRule(String regExpr, String replacement) { 179 rules.add(new RegExprRule(regExpr, replacement)); 180 } 181 182 /** 183 * Add a rule with synonym words. 184 * @param words words which are synonyms 185 */ 186 public void addSynonyms(String... words) { 187 for (String word : words) { 188 rules.add(new SynonymRule(word, words)); 189 } 190 } 191 192 /** 193 * Check if two names are similar, but not identical. First both names will be "normalized". 194 * Afterwards the Levenshtein distance will be calculated.<br> 195 * Examples for normalization rules:<br> 196 * <code>replaceAll("\\d+", "0")</code><br> 197 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true 198 * @param name first name to compare 199 * @param name2 second name to compare 200 * @return true if the normalized names are different but only a "little bit" 201 */ 202 public boolean similaryName(String name, String name2) { 203 // check plain strings 204 int distance = getLevenshteinDistance(name, name2); 205 boolean similar = distance > 0 && distance <= 2; 206 207 // check if only the case differs, so we don't consider large distance as different strings 208 if (distance > 2 && name.length() == name2.length()) { 209 similar = deAccent(name).equalsIgnoreCase(deAccent(name2)); 210 } 211 212 // try all rules 213 for (NormalizeRule rule : rules) { 214 int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); 215 if (levenshteinDistance == 0) 216 // one rule results in identical names: identical 217 return false; 218 else if (levenshteinDistance <= 2) { 219 // 0 < distance <= 2 220 similar = true; 221 } 222 } 223 return similar; 224 } 225 226 /** 227 * Removes diacritics (accents) from string. 228 * @param str string 229 * @return {@code str} without any diacritic (accent) 230 * @since 12283 231 */ 232 public static String deAccent(String str) { 233 // https://stackoverflow.com/a/1215117/2257172 234 return REMOVE_DIACRITICS.matcher(Normalizer.normalize(str, Normalizer.Form.NFD)).replaceAll(""); 235 } 236 237 /** 238 * A normalization that is applied to names before testing them 239 */ 240 @FunctionalInterface 241 public interface NormalizeRule { 242 243 /** 244 * Normalize the string by replacing parts. 245 * @param name name to normalize 246 * @return normalized string 247 */ 248 String normalize(String name); 249 } 250 251 /** 252 * A rule to replace by regular expression, 253 * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement} 254 */ 255 public static class RegExprRule implements NormalizeRule { 256 private final Pattern regExpr; 257 private final String replacement; 258 259 /** 260 * Create a new rule to replace by regular expression 261 * @param expression The regular expression 262 * @param replacement The replacement 263 */ 264 public RegExprRule(String expression, String replacement) { 265 this.regExpr = Pattern.compile(expression); 266 this.replacement = replacement; 267 } 268 269 @Override 270 public String normalize(String name) { 271 return regExpr.matcher(name).replaceAll(replacement); 272 } 273 274 @Override 275 public String toString() { 276 return "replaceAll(" + regExpr + ", " + replacement + ')'; 277 } 278 } 279 280 /** 281 * A rule that registers synonyms to a given word 282 */ 283 public static class SynonymRule implements NormalizeRule { 284 285 private final String[] words; 286 private final Pattern regExpr; 287 private final String replacement; 288 289 /** 290 * Create a new {@link SynonymRule} 291 * @param replacement The word to use instead 292 * @param words The synonyms for that word 293 */ 294 public SynonymRule(String replacement, String... words) { 295 this.replacement = replacement.toLowerCase(Locale.ENGLISH); 296 this.words = words; 297 298 // build regular expression for other words (for fast match) 299 StringBuilder expression = new StringBuilder(); 300 int maxLength = 0; 301 for (int i = 0; i < words.length; i++) { 302 if (words[i].length() > maxLength) { 303 maxLength = words[i].length(); 304 } 305 if (expression.length() > 0) { 306 expression.append('|'); 307 } 308 expression.append(Pattern.quote(words[i])); 309 } 310 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); 311 } 312 313 @Override 314 public String normalize(String name) { 315 // find first match 316 Matcher matcher = regExpr.matcher(name); 317 if (!matcher.find()) 318 return name; 319 320 int start = matcher.start(); 321 322 // which word matches? 323 String part = ""; 324 for (int i = 0; i < words.length; i++) { 325 String word = words[i]; 326 part = name.substring(start, start + word.length()); 327 if (word.equalsIgnoreCase(part)) { 328 break; 329 } 330 } 331 332 // replace the word 333 char[] newName = matcher.replaceFirst(replacement).toCharArray(); 334 335 // adjust case (replacement is not shorter than matching word!) 336 int minLength = Math.min(replacement.length(), part.length()); 337 for (int i = 0; i < minLength; i++) { 338 if (Character.isUpperCase(part.charAt(i))) { 339 newName[start + i] = Character.toUpperCase(newName[start + i]); 340 } 341 } 342 343 return new String(newName); 344 } 345 346 @Override 347 public String toString() { 348 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')'; 349 } 350 } 351}