00001 /* Byte-wise substring search, using the Two-Way algorithm. 00002 Copyright (C) 2008 Free Software Foundation, Inc. 00003 This file is part of the GNU C Library. 00004 Written by Eric Blake <ebb9@byu.net>, 2008. 00005 00006 This program is free software; you can redistribute it and/or modify 00007 it under the terms of the GNU Lesser General Public License as published by 00008 the Free Software Foundation; either version 2.1, or (at your option) 00009 any later version. 00010 00011 This program is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 GNU Lesser General Public License for more details. 00015 00016 You should have received a copy of the GNU Lesser General Public License along 00017 with this program; if not, write to the Free Software Foundation, 00018 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 00019 00020 /* Before including this file, you need to include <config.h> and 00021 <string.h>, and define: 00022 RESULT_TYPE A macro that expands to the return type. 00023 AVAILABLE(h, h_l, j, n_l) 00024 A macro that returns nonzero if there are 00025 at least N_L bytes left starting at H[J]. 00026 H is 'unsigned char *', H_L, J, and N_L 00027 are 'size_t'; H_L is an lvalue. For 00028 NUL-terminated searches, H_L can be 00029 modified each iteration to avoid having 00030 to compute the end of H up front. 00031 00032 For case-insensitivity, you may optionally define: 00033 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L 00034 characters of P1 and P2 are equal. 00035 CANON_ELEMENT(c) A macro that canonicalizes an element right after 00036 it has been fetched from one of the two strings. 00037 The argument is an 'unsigned char'; the result 00038 must be an 'unsigned char' as well. 00039 00040 This file undefines the macros documented above, and defines 00041 LONG_NEEDLE_THRESHOLD. 00042 */ 00043 00044 #include <limits.h> 00045 #include <stdint.h> 00046 00047 /* We use the Two-Way string matching algorithm, which guarantees 00048 linear complexity with constant space. Additionally, for long 00049 needles, we also use a bad character shift table similar to the 00050 Boyer-Moore algorithm to achieve improved (potentially sub-linear) 00051 performance. 00052 00053 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 00054 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm 00055 */ 00056 00057 /* Point at which computing a bad-byte shift table is likely to be 00058 worthwhile. Small needles should not compute a table, since it 00059 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a 00060 speedup no greater than a factor of NEEDLE_LEN. The larger the 00061 needle, the better the potential performance gain. On the other 00062 hand, on non-POSIX systems with CHAR_BIT larger than eight, the 00063 memory required for the table is prohibitive. */ 00064 #if CHAR_BIT < 10 00065 # define LONG_NEEDLE_THRESHOLD 32U 00066 #else 00067 # define LONG_NEEDLE_THRESHOLD SIZE_MAX 00068 #endif 00069 00070 #ifndef MAX 00071 # define MAX(a, b) ((a < b) ? (b) : (a)) 00072 #endif 00073 00074 #ifndef CANON_ELEMENT 00075 # define CANON_ELEMENT(c) c 00076 #endif 00077 #ifndef CMP_FUNC 00078 # define CMP_FUNC memcmp 00079 #endif 00080 00081 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. 00082 Return the index of the first byte in the right half, and set 00083 *PERIOD to the global period of the right half. 00084 00085 The global period of a string is the smallest index (possibly its 00086 length) at which all remaining bytes in the string are repetitions 00087 of the prefix (the last repetition may be a subset of the prefix). 00088 00089 When NEEDLE is factored into two halves, a local period is the 00090 length of the smallest word that shares a suffix with the left half 00091 and shares a prefix with the right half. All factorizations of a 00092 non-empty NEEDLE have a local period of at least 1 and no greater 00093 than NEEDLE_LEN. 00094 00095 A critical factorization has the property that the local period 00096 equals the global period. All strings have at least one critical 00097 factorization with the left half smaller than the global period. 00098 00099 Given an ordered alphabet, a critical factorization can be computed 00100 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the 00101 larger of two ordered maximal suffixes. The ordered maximal 00102 suffixes are determined by lexicographic comparison of 00103 periodicity. */ 00104 static size_t 00105 critical_factorization (const unsigned char *needle, size_t needle_len, 00106 size_t * period) 00107 { 00108 /* Index of last byte of left half, or SIZE_MAX. */ 00109 size_t max_suffix, max_suffix_rev; 00110 size_t j; /* Index into NEEDLE for current candidate suffix. */ 00111 size_t k; /* Offset into current period. */ 00112 size_t p; /* Intermediate period. */ 00113 unsigned char a, b; /* Current comparison bytes. */ 00114 00115 /* Invariants: 00116 0 <= j < NEEDLE_LEN - 1 00117 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) 00118 min(max_suffix, max_suffix_rev) < global period of NEEDLE 00119 1 <= p <= global period of NEEDLE 00120 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] 00121 1 <= k <= p 00122 */ 00123 00124 /* Perform lexicographic search. */ 00125 max_suffix = SIZE_MAX; 00126 j = 0; 00127 k = p = 1; 00128 while (j + k < needle_len) 00129 { 00130 a = CANON_ELEMENT (needle[j + k]); 00131 b = CANON_ELEMENT (needle[max_suffix + k]); 00132 if (a < b) 00133 { 00134 /* Suffix is smaller, period is entire prefix so far. */ 00135 j += k; 00136 k = 1; 00137 p = j - max_suffix; 00138 } 00139 else if (a == b) 00140 { 00141 /* Advance through repetition of the current period. */ 00142 if (k != p) 00143 ++k; 00144 else 00145 { 00146 j += p; 00147 k = 1; 00148 } 00149 } 00150 else /* b < a */ 00151 { 00152 /* Suffix is larger, start over from current location. */ 00153 max_suffix = j++; 00154 k = p = 1; 00155 } 00156 } 00157 *period = p; 00158 00159 /* Perform reverse lexicographic search. */ 00160 max_suffix_rev = SIZE_MAX; 00161 j = 0; 00162 k = p = 1; 00163 while (j + k < needle_len) 00164 { 00165 a = CANON_ELEMENT (needle[j + k]); 00166 b = CANON_ELEMENT (needle[max_suffix_rev + k]); 00167 if (b < a) 00168 { 00169 /* Suffix is smaller, period is entire prefix so far. */ 00170 j += k; 00171 k = 1; 00172 p = j - max_suffix_rev; 00173 } 00174 else if (a == b) 00175 { 00176 /* Advance through repetition of the current period. */ 00177 if (k != p) 00178 ++k; 00179 else 00180 { 00181 j += p; 00182 k = 1; 00183 } 00184 } 00185 else /* a < b */ 00186 { 00187 /* Suffix is larger, start over from current location. */ 00188 max_suffix_rev = j++; 00189 k = p = 1; 00190 } 00191 } 00192 00193 /* Choose the longer suffix. Return the first byte of the right 00194 half, rather than the last byte of the left half. */ 00195 if (max_suffix_rev + 1 < max_suffix + 1) 00196 return max_suffix + 1; 00197 *period = p; 00198 return max_suffix_rev + 1; 00199 } 00200 00201 /* Return the first location of non-empty NEEDLE within HAYSTACK, or 00202 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 00203 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. 00204 Performance is guaranteed to be linear, with an initialization cost 00205 of 2 * NEEDLE_LEN comparisons. 00206 00207 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 00208 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. 00209 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 00210 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ 00211 static RETURN_TYPE 00212 two_way_short_needle (const unsigned char *haystack, size_t haystack_len, 00213 const unsigned char *needle, size_t needle_len) 00214 { 00215 size_t i; /* Index into current byte of NEEDLE. */ 00216 size_t j; /* Index into current window of HAYSTACK. */ 00217 size_t period; /* The period of the right half of needle. */ 00218 size_t suffix; /* The index of the right half of needle. */ 00219 00220 /* Factor the needle into two halves, such that the left half is 00221 smaller than the global period, and the right half is 00222 periodic (with a period as large as NEEDLE_LEN - suffix). */ 00223 suffix = critical_factorization (needle, needle_len, &period); 00224 00225 /* Perform the search. Each iteration compares the right half 00226 first. */ 00227 if (CMP_FUNC (needle, needle + period, suffix) == 0) 00228 { 00229 /* Entire needle is periodic; a mismatch can only advance by the 00230 period, so use memory to avoid rescanning known occurrences 00231 of the period. */ 00232 size_t memory = 0; 00233 j = 0; 00234 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 00235 { 00236 /* Scan for matches in right half. */ 00237 i = MAX (suffix, memory); 00238 while (i < needle_len && (CANON_ELEMENT (needle[i]) 00239 == CANON_ELEMENT (haystack[i + j]))) 00240 ++i; 00241 if (needle_len <= i) 00242 { 00243 /* Scan for matches in left half. */ 00244 i = suffix - 1; 00245 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 00246 == CANON_ELEMENT (haystack[i + j]))) 00247 --i; 00248 if (i + 1 < memory + 1) 00249 return (RETURN_TYPE) (haystack + j); 00250 /* No match, so remember how many repetitions of period 00251 on the right half were scanned. */ 00252 j += period; 00253 memory = needle_len - period; 00254 } 00255 else 00256 { 00257 j += i - suffix + 1; 00258 memory = 0; 00259 } 00260 } 00261 } 00262 else 00263 { 00264 /* The two halves of needle are distinct; no extra memory is 00265 required, and any mismatch results in a maximal shift. */ 00266 period = MAX (suffix, needle_len - suffix) + 1; 00267 j = 0; 00268 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 00269 { 00270 /* Scan for matches in right half. */ 00271 i = suffix; 00272 while (i < needle_len && (CANON_ELEMENT (needle[i]) 00273 == CANON_ELEMENT (haystack[i + j]))) 00274 ++i; 00275 if (needle_len <= i) 00276 { 00277 /* Scan for matches in left half. */ 00278 i = suffix - 1; 00279 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 00280 == CANON_ELEMENT (haystack[i + j]))) 00281 --i; 00282 if (i == SIZE_MAX) 00283 return (RETURN_TYPE) (haystack + j); 00284 j += period; 00285 } 00286 else 00287 j += i - suffix + 1; 00288 } 00289 } 00290 return NULL; 00291 } 00292 00293 /* Return the first location of non-empty NEEDLE within HAYSTACK, or 00294 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 00295 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. 00296 Performance is guaranteed to be linear, with an initialization cost 00297 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. 00298 00299 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 00300 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, 00301 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. 00302 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 00303 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and 00304 sublinear performance is not possible. */ 00305 static RETURN_TYPE 00306 two_way_long_needle (const unsigned char *haystack, size_t haystack_len, 00307 const unsigned char *needle, size_t needle_len) 00308 { 00309 size_t i; /* Index into current byte of NEEDLE. */ 00310 size_t j; /* Index into current window of HAYSTACK. */ 00311 size_t period; /* The period of the right half of needle. */ 00312 size_t suffix; /* The index of the right half of needle. */ 00313 size_t shift_table[1U << CHAR_BIT]; /* See below. */ 00314 00315 /* Factor the needle into two halves, such that the left half is 00316 smaller than the global period, and the right half is 00317 periodic (with a period as large as NEEDLE_LEN - suffix). */ 00318 suffix = critical_factorization (needle, needle_len, &period); 00319 00320 /* Populate shift_table. For each possible byte value c, 00321 shift_table[c] is the distance from the last occurrence of c to 00322 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. 00323 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ 00324 for (i = 0; i < 1U << CHAR_BIT; i++) 00325 shift_table[i] = needle_len; 00326 for (i = 0; i < needle_len; i++) 00327 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; 00328 00329 /* Perform the search. Each iteration compares the right half 00330 first. */ 00331 if (CMP_FUNC (needle, needle + period, suffix) == 0) 00332 { 00333 /* Entire needle is periodic; a mismatch can only advance by the 00334 period, so use memory to avoid rescanning known occurrences 00335 of the period. */ 00336 size_t memory = 0; 00337 size_t shift; 00338 j = 0; 00339 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 00340 { 00341 /* Check the last byte first; if it does not match, then 00342 shift to the next possible match location. */ 00343 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 00344 if (0 < shift) 00345 { 00346 if (memory && shift < period) 00347 { 00348 /* Since needle is periodic, but the last period has 00349 a byte out of place, there can be no match until 00350 after the mismatch. */ 00351 shift = needle_len - period; 00352 memory = 0; 00353 } 00354 j += shift; 00355 continue; 00356 } 00357 /* Scan for matches in right half. The last byte has 00358 already been matched, by virtue of the shift table. */ 00359 i = MAX (suffix, memory); 00360 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 00361 == CANON_ELEMENT (haystack[i + j]))) 00362 ++i; 00363 if (needle_len - 1 <= i) 00364 { 00365 /* Scan for matches in left half. */ 00366 i = suffix - 1; 00367 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 00368 == CANON_ELEMENT (haystack[i + j]))) 00369 --i; 00370 if (i + 1 < memory + 1) 00371 return (RETURN_TYPE) (haystack + j); 00372 /* No match, so remember how many repetitions of period 00373 on the right half were scanned. */ 00374 j += period; 00375 memory = needle_len - period; 00376 } 00377 else 00378 { 00379 j += i - suffix + 1; 00380 memory = 0; 00381 } 00382 } 00383 } 00384 else 00385 { 00386 /* The two halves of needle are distinct; no extra memory is 00387 required, and any mismatch results in a maximal shift. */ 00388 size_t shift; 00389 period = MAX (suffix, needle_len - suffix) + 1; 00390 j = 0; 00391 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 00392 { 00393 /* Check the last byte first; if it does not match, then 00394 shift to the next possible match location. */ 00395 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 00396 if (0 < shift) 00397 { 00398 j += shift; 00399 continue; 00400 } 00401 /* Scan for matches in right half. The last byte has 00402 already been matched, by virtue of the shift table. */ 00403 i = suffix; 00404 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 00405 == CANON_ELEMENT (haystack[i + j]))) 00406 ++i; 00407 if (needle_len - 1 <= i) 00408 { 00409 /* Scan for matches in left half. */ 00410 i = suffix - 1; 00411 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 00412 == CANON_ELEMENT (haystack[i + j]))) 00413 --i; 00414 if (i == SIZE_MAX) 00415 return (RETURN_TYPE) (haystack + j); 00416 j += period; 00417 } 00418 else 00419 j += i - suffix + 1; 00420 } 00421 } 00422 return NULL; 00423 } 00424 00425 #undef AVAILABLE 00426 #undef CANON_ELEMENT 00427 #undef CMP_FUNC 00428 #undef MAX 00429 #undef RETURN_TYPE