001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.data.validation.tests; 003 004import static java.util.regex.Pattern.CASE_INSENSITIVE; 005import static java.util.regex.Pattern.UNICODE_CASE; 006import static org.openstreetmap.josm.tools.I18n.tr; 007 008import java.awt.geom.Point2D; 009import java.util.ArrayList; 010import java.util.Arrays; 011import java.util.HashMap; 012import java.util.List; 013import java.util.Locale; 014import java.util.Map; 015import java.util.regex.Matcher; 016import java.util.regex.Pattern; 017 018import org.openstreetmap.josm.data.osm.OsmPrimitive; 019import org.openstreetmap.josm.data.osm.Way; 020import org.openstreetmap.josm.data.validation.Severity; 021import org.openstreetmap.josm.data.validation.Test; 022import org.openstreetmap.josm.data.validation.TestError; 023import org.openstreetmap.josm.data.validation.util.ValUtil; 024import org.openstreetmap.josm.gui.progress.ProgressMonitor; 025import org.openstreetmap.josm.tools.MultiMap; 026import org.openstreetmap.josm.tools.Utils; 027 028/** 029 * Checks for similar named ways, symptom of a possible typo. It uses the 030 * Levenshtein distance to check for similarity 031 * 032 * @author frsantos 033 */ 034public class SimilarNamedWays extends Test { 035 036 protected static final int SIMILAR_NAMED = 701; 037 038 /** All ways, grouped by cells */ 039 private Map<Point2D, List<Way>> cellWays; 040 /** The already detected errors */ 041 private MultiMap<Way, Way> errorWays; 042 043 private final List<NormalizeRule> rules = new ArrayList<>(); 044 045 /** 046 * Constructor 047 */ 048 public SimilarNamedWays() { 049 super(tr("Similarly named ways"), 050 tr("This test checks for ways with similar names that may have been misspelled.")); 051 052 // FIXME: hardcode these rules for now. Replace them with preferences later 053 // See https://josm.openstreetmap.de/ticket/3733#comment:19 054 addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers 055 addRegExprRule("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "0"); // Roman numbers: matches "Building II" 056 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave 057 addRegExprRule("^[A-Z] ", "X"); // E Street 058 addSynonyms("east", "west", "north", "south"); 059 addSynonyms("first", "second", "third"); 060 } 061 062 @Override 063 public void startTest(ProgressMonitor monitor) { 064 super.startTest(monitor); 065 cellWays = new HashMap<>(1000); 066 errorWays = new MultiMap<>(); 067 } 068 069 @Override 070 public void endTest() { 071 cellWays = null; 072 errorWays = null; 073 super.endTest(); 074 } 075 076 @Override 077 public void visit(Way w) { 078 if (!w.isUsable()) 079 return; 080 081 String name = w.get("name"); 082 if (name == null || name.length() < 6) 083 return; 084 085 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays); 086 for (List<Way> ways : theCellWays) { 087 for (Way w2 : ways) { 088 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) { 089 continue; 090 } 091 092 String name2 = w2.get("name"); 093 if (name2 == null || name2.length() < 6) { 094 continue; 095 } 096 097 if (similaryName(name, name2)) { 098 List<OsmPrimitive> primitives = new ArrayList<>(2); 099 primitives.add(w); 100 primitives.add(w2); 101 errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED) 102 .message(tr("Similarly named ways")) 103 .primitives(primitives) 104 .build()); 105 errorWays.put(w, w2); 106 } 107 } 108 ways.add(w); 109 } 110 } 111 112 /** 113 * Add a regular expression rule. 114 * @param regExpr the regular expression to search for 115 * @param replacement a string to replace with, which should match the expression. 116 */ 117 public void addRegExprRule(String regExpr, String replacement) { 118 rules.add(new RegExprRule(regExpr, replacement)); 119 } 120 121 /** 122 * Add a rule with synonym words. 123 * @param words words which are synonyms 124 */ 125 public void addSynonyms(String... words) { 126 for (String word : words) { 127 rules.add(new SynonymRule(word, words)); 128 } 129 } 130 131 /** 132 * Check if two names are similar, but not identical. First both names will be "normalized". 133 * Afterwards the Levenshtein distance will be calculated.<br> 134 * Examples for normalization rules:<br> 135 * <code>replaceAll("\\d+", "0")</code><br> 136 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true 137 * @param name first name to compare 138 * @param name2 second name to compare 139 * @return true if the normalized names are different but only a "little bit" 140 */ 141 public boolean similaryName(String name, String name2) { 142 boolean similar = Utils.isSimilar(name, name2); 143 144 // try all rules 145 for (NormalizeRule rule : rules) { 146 int levenshteinDistance = Utils.getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); 147 if (levenshteinDistance == 0) 148 // one rule results in identical names: identical 149 return false; 150 else if (levenshteinDistance <= 2) { 151 // 0 < distance <= 2 152 similar = true; 153 } 154 } 155 return similar; 156 } 157 158 /** 159 * A normalization that is applied to names before testing them 160 */ 161 @FunctionalInterface 162 public interface NormalizeRule { 163 164 /** 165 * Normalize the string by replacing parts. 166 * @param name name to normalize 167 * @return normalized string 168 */ 169 String normalize(String name); 170 } 171 172 /** 173 * A rule to replace by regular expression, 174 * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement} 175 */ 176 public static class RegExprRule implements NormalizeRule { 177 private final Pattern regExpr; 178 private final String replacement; 179 180 /** 181 * Create a new rule to replace by regular expression 182 * @param expression The regular expression 183 * @param replacement The replacement 184 */ 185 public RegExprRule(String expression, String replacement) { 186 this.regExpr = Pattern.compile(expression); 187 this.replacement = replacement; 188 } 189 190 @Override 191 public String normalize(String name) { 192 return regExpr.matcher(name).replaceAll(replacement); 193 } 194 195 @Override 196 public String toString() { 197 return "replaceAll(" + regExpr + ", " + replacement + ')'; 198 } 199 } 200 201 /** 202 * A rule that registers synonyms to a given word 203 */ 204 public static class SynonymRule implements NormalizeRule { 205 206 private final String[] words; 207 private final Pattern regExpr; 208 private final String replacement; 209 210 /** 211 * Create a new {@link SynonymRule} 212 * @param replacement The word to use instead 213 * @param words The synonyms for that word 214 */ 215 public SynonymRule(String replacement, String... words) { 216 this.replacement = replacement.toLowerCase(Locale.ENGLISH); 217 this.words = words; 218 219 // build regular expression for other words (for fast match) 220 StringBuilder expression = new StringBuilder(); 221 int maxLength = 0; 222 for (int i = 0; i < words.length; i++) { 223 if (words[i].length() > maxLength) { 224 maxLength = words[i].length(); 225 } 226 if (expression.length() > 0) { 227 expression.append('|'); 228 } 229 expression.append(Pattern.quote(words[i])); 230 } 231 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); 232 } 233 234 @Override 235 public String normalize(String name) { 236 // find first match 237 Matcher matcher = regExpr.matcher(name); 238 if (!matcher.find()) 239 return name; 240 241 int start = matcher.start(); 242 243 // which word matches? 244 String part = ""; 245 for (int i = 0; i < words.length; i++) { 246 String word = words[i]; 247 part = name.substring(start, start + word.length()); 248 if (word.equalsIgnoreCase(part)) { 249 break; 250 } 251 } 252 253 // replace the word 254 char[] newName = matcher.replaceFirst(replacement).toCharArray(); 255 256 // adjust case (replacement is not shorter than matching word!) 257 int minLength = Math.min(replacement.length(), part.length()); 258 for (int i = 0; i < minLength; i++) { 259 if (Character.isUpperCase(part.charAt(i))) { 260 newName[start + i] = Character.toUpperCase(newName[start + i]); 261 } 262 } 263 264 return new String(newName); 265 } 266 267 @Override 268 public String toString() { 269 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')'; 270 } 271 } 272}