001// License: GPL. See LICENSE file for details. 002package org.openstreetmap.josm.data.validation.tests; 003 004import static java.util.regex.Pattern.CASE_INSENSITIVE; 005import static java.util.regex.Pattern.UNICODE_CASE; 006import static org.openstreetmap.josm.tools.I18n.tr; 007 008import java.awt.geom.Point2D; 009import java.util.ArrayList; 010import java.util.Arrays; 011import java.util.HashMap; 012import java.util.List; 013import java.util.Map; 014import java.util.regex.Matcher; 015import java.util.regex.Pattern; 016 017import org.openstreetmap.josm.data.osm.OsmPrimitive; 018import org.openstreetmap.josm.data.osm.Way; 019import org.openstreetmap.josm.data.validation.Severity; 020import org.openstreetmap.josm.data.validation.Test; 021import org.openstreetmap.josm.data.validation.TestError; 022import org.openstreetmap.josm.data.validation.util.ValUtil; 023import org.openstreetmap.josm.gui.progress.ProgressMonitor; 024import org.openstreetmap.josm.tools.MultiMap; 025import org.openstreetmap.josm.tools.Utils; 026 027/** 028 * Checks for similar named ways, symptom of a possible typo. It uses the 029 * Levenshtein distance to check for similarity 030 * 031 * @author frsantos 032 */ 033public class SimilarNamedWays extends Test { 034 035 protected static final int SIMILAR_NAMED = 701; 036 037 /** All ways, grouped by cells */ 038 private Map<Point2D,List<Way>> cellWays; 039 /** The already detected errors */ 040 private MultiMap<Way, Way> errorWays; 041 042 private ArrayList<NormalizeRule> rules = new ArrayList<NormalizeRule>(); 043 044 /** 045 * Constructor 046 */ 047 public SimilarNamedWays() { 048 super(tr("Similarly named ways"), 049 tr("This test checks for ways with similar names that may have been misspelled.")); 050 051 // FIXME: hardcode these rules for now. Replace them with preferences later 052 // See https://josm.openstreetmap.de/ticket/3733#comment:19 053 addRegExprRule("\\d+", "0"); // Highway 66 054 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave 055 addRegExprRule("^[A-Z] ", "X"); // E Street 056 addSynonyms("east", "west", "north", "south"); 057 addSynonyms("first", "second", "third"); 058 } 059 060 @Override 061 public void startTest(ProgressMonitor monitor) { 062 super.startTest(monitor); 063 cellWays = new HashMap<>(1000); 064 errorWays = new MultiMap<>(); 065 } 066 067 @Override 068 public void endTest() { 069 cellWays = null; 070 errorWays = null; 071 super.endTest(); 072 } 073 074 @Override 075 public void visit(Way w) { 076 if (!w.isUsable()) 077 return; 078 079 String name = w.get("name"); 080 if (name == null || name.length() < 6) 081 return; 082 083 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays); 084 for (List<Way> ways : theCellWays) { 085 for (Way w2 : ways) { 086 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) { 087 continue; 088 } 089 090 String name2 = w2.get("name"); 091 if (name2 == null || name2.length() < 6) { 092 continue; 093 } 094 095 if (similaryName(name, name2)) { 096 List<OsmPrimitive> primitives = new ArrayList<>(2); 097 primitives.add(w); 098 primitives.add(w2); 099 errors.add(new TestError(this, Severity.WARNING, tr("Similarly named ways"), SIMILAR_NAMED, primitives)); 100 errorWays.put(w, w2); 101 } 102 } 103 ways.add(w); 104 } 105 } 106 107 /** 108 * Compute Levenshtein distance 109 * 110 * @param s First word 111 * @param t Second word 112 * @return The distance between words 113 */ 114 public static int getLevenshteinDistance(String s, String t) { 115 int[][] d; // matrix 116 int n; // length of s 117 int m; // length of t 118 int i; // iterates through s 119 int j; // iterates through t 120 char s_i; // ith character of s 121 char t_j; // jth character of t 122 int cost; // cost 123 124 // Step 1 125 n = s.length(); 126 m = t.length(); 127 if (n == 0) 128 return m; 129 if (m == 0) 130 return n; 131 d = new int[n + 1][m + 1]; 132 133 // Step 2 134 for (i = 0; i <= n; i++) { 135 d[i][0] = i; 136 } 137 for (j = 0; j <= m; j++) { 138 d[0][j] = j; 139 } 140 141 // Step 3 142 for (i = 1; i <= n; i++) { 143 144 s_i = s.charAt(i - 1); 145 146 // Step 4 147 for (j = 1; j <= m; j++) { 148 149 t_j = t.charAt(j - 1); 150 151 // Step 5 152 if (s_i == t_j) { 153 cost = 0; 154 } else { 155 cost = 1; 156 } 157 158 // Step 6 159 d[i][j] = Utils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); 160 } 161 } 162 163 // Step 7 164 return d[n][m]; 165 } 166 167 /** 168 * Add a regular expression rule. 169 * @param regExpr the regular expression to search for 170 * @param replacement a string to replace with, which should match the expression. 171 */ 172 public void addRegExprRule(String regExpr, String replacement) { 173 rules.add(new RegExprRule(regExpr, replacement)); 174 } 175 176 /** 177 * Add a rule with synonym words. 178 * @param words words which are synonyms 179 */ 180 public void addSynonyms(String... words) { 181 for (String word : words) { 182 rules.add(new SynonymRule(word, words)); 183 } 184 } 185 186 /** 187 * Check if two names are similar, but not identical. First both names will be "normalized". 188 * Afterwards the Levenshtein distance will be calculated.<br> 189 * Examples for normalization rules:<br> 190 * <code>replaceAll("\\d+", "0")</code><br> 191 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true 192 * @param name first name to compare 193 * @param name2 second name to compare 194 * @return true if the normalized names are different but only a "little bit" 195 */ 196 public boolean similaryName(String name, String name2) { 197 // check plain strings 198 int distance = getLevenshteinDistance(name, name2); 199 boolean similar = distance>0 && distance<=2; 200 201 // try all rules 202 for (NormalizeRule rule : rules) { 203 int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); 204 if (levenshteinDistance == 0) 205 // one rule results in identical names: identical 206 return false; 207 else if (levenshteinDistance <= 2) { 208 // 0 < distance <= 2 209 similar = true; 210 } 211 } 212 return similar; 213 } 214 215 public interface NormalizeRule { 216 217 /** 218 * Normalize the string by replacing parts. 219 * @param name name to normalize 220 * @return normalized string 221 */ 222 String normalize(String name); 223 } 224 225 public class RegExprRule implements NormalizeRule { 226 private final Pattern regExpr; 227 private final String replacement; 228 229 public RegExprRule(String expression, String replacement) { 230 this.regExpr = Pattern.compile(expression); 231 this.replacement = replacement; 232 } 233 234 @Override 235 public String normalize(String name) { 236 return regExpr.matcher(name).replaceAll(replacement); 237 } 238 239 @Override 240 public String toString() { 241 return "replaceAll(" + regExpr + ", " + replacement + ")"; 242 } 243 } 244 245 public class SynonymRule implements NormalizeRule { 246 247 private final String[] words; 248 private final Pattern regExpr; 249 private final String replacement; 250 251 public SynonymRule(String replacement, String[] words) { 252 this.replacement = replacement.toLowerCase(); 253 this.words = words; 254 255 // build regular expression for other words (for fast match) 256 StringBuilder expression = new StringBuilder(); 257 int maxLength = 0; 258 for (int i = 0; i < words.length; i++) { 259 if (words[i].length() > maxLength) { 260 maxLength = words[i].length(); 261 } 262 if (expression.length() > 0) { 263 expression.append("|"); 264 } 265 expression.append(Pattern.quote(words[i])); 266 } 267 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); 268 } 269 270 @Override 271 public String normalize(String name) { 272 // find first match 273 Matcher matcher = regExpr.matcher(name); 274 if (!matcher.find()) 275 return name; 276 277 int start = matcher.start(); 278 279 // which word matches? 280 String part = ""; 281 for (int i = 0; i < words.length; i++) { 282 String word = words[i]; 283 part = name.substring(start, start + word.length()); 284 if (word.equalsIgnoreCase(part)) { 285 break; 286 } 287 } 288 289 // replace the word 290 char[] newName = matcher.replaceFirst(replacement).toCharArray(); 291 292 // adjust case (replacement is not shorter than matching word!) 293 int minLength = Math.min(replacement.length(), part.length()); 294 for (int i = 0; i < minLength; i++) { 295 if (Character.isUpperCase(part.charAt(i))) { 296 newName[start + i] = Character.toUpperCase(newName[start + i]); 297 } 298 } 299 300 return new String(newName); 301 } 302 303 @Override 304 public String toString() { 305 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ")"; 306 } 307 } 308}