00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "regexp.h"
00025
00026 #include "lexer.h"
00027 #include <assert.h>
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031
00032 using namespace KJS;
00033
00034 #ifdef PCRE_CONFIG_UTF8
00035 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
00036 #endif
00037
00038 RegExp::RegExp(const UString &p, int f)
00039 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
00040 {
00041
00042 #ifdef PCRE_CONFIG_UTF8
00043 if (utf8Support == Unknown) {
00044 int supported;
00045 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
00046 utf8Support = supported ? Supported : Unsupported;
00047 }
00048 #endif
00049
00050 nrSubPatterns = 0;
00051
00052
00053
00054
00055
00056
00057 UString intern;
00058 const char* const nil = "\\x00";
00059 if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
00060 bool escape = false;
00061 for (int i = 0; i < p.size(); ++i) {
00062 UChar c = p[i];
00063 if (escape) {
00064 escape = false;
00065
00066 if (c == 'u') {
00067
00068
00069 unsigned short u = 0;
00070 int j = 0;
00071 for (j = 0; j < 4; ++j) {
00072 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
00073 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
00074 ++i;
00075 } else {
00076
00077
00078 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
00079 i -= j;
00080 break;
00081 }
00082 }
00083 if (j < 4) {
00084
00085
00086 intern.append(UString('u'));
00087 } else {
00088 c = UChar(u);
00089 switch (u) {
00090 case 0:
00091
00092 intern += UString(nil);
00093 break;
00094 case '^':
00095 case '$':
00096 case '\\':
00097 case '.':
00098 case '*':
00099 case '+':
00100 case '?':
00101 case '(': case ')':
00102 case '{': case '}':
00103 case '[': case ']':
00104 case '|':
00105
00106 intern.append(UString('\\'));
00107
00108 default:
00109 intern += UString(&c, 1);
00110 break;
00111 }
00112 }
00113 continue;
00114 }
00115 intern += UString('\\');
00116 intern += UString(&c, 1);
00117 } else {
00118 if (c == '\\')
00119 escape = true;
00120 else if (c == '\0')
00121 intern += UString(nil);
00122 else
00123 intern += UString(&c, 1);
00124 }
00125 }
00126 } else {
00127 intern = p;
00128 }
00129
00130 #ifdef HAVE_PCREPOSIX
00131 int pcreflags = 0;
00132 const char *perrormsg;
00133 int errorOffset;
00134
00135 if (flgs & IgnoreCase)
00136 pcreflags |= PCRE_CASELESS;
00137
00138 if (flgs & Multiline)
00139 pcreflags |= PCRE_MULTILINE;
00140
00141 #ifdef PCRE_CONFIG_UTF8
00142 if (utf8Support == Supported)
00143 pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
00144 #endif
00145
00146
00147
00148 prepareMatch(intern);
00149
00150 pcregex = pcre_compile(buffer, pcreflags,
00151 &perrormsg, &errorOffset, NULL);
00152 doneMatch();
00153 if (!pcregex) {
00154 #ifndef NDEBUG
00155 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00156 #endif
00157 valid = false;
00158 return;
00159 }
00160
00161 #ifdef PCRE_INFO_CAPTURECOUNT
00162
00163 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00164 if (rc != 0)
00165 #endif
00166 nrSubPatterns = 0;
00167
00168 #else
00169
00170 int regflags = 0;
00171 #ifdef REG_EXTENDED
00172 regflags |= REG_EXTENDED;
00173 #endif
00174 #ifdef REG_ICASE
00175 if ( f & IgnoreCase )
00176 regflags |= REG_ICASE;
00177 #endif
00178
00179
00180
00181
00182
00183
00184 int errorCode = regcomp(&preg, intern.ascii(), regflags);
00185 if (errorCode != 0) {
00186 #ifndef NDEBUG
00187 char errorMessage[80];
00188 regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
00189 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
00190 #endif
00191 valid = false;
00192 }
00193 #endif
00194 }
00195
00196 RegExp::~RegExp()
00197 {
00198 doneMatch();
00199 #ifdef HAVE_PCREPOSIX
00200 if (pcregex)
00201 pcre_free(pcregex);
00202 #else
00203
00204 regfree(&preg);
00205 #endif
00206 }
00207
00208 void RegExp::prepareUtf8(const UString& s)
00209 {
00210
00211 const int length = s.size();
00212 buffer = new char[length * 3 + 1];
00213
00214
00215
00216 originalPos = new int[length * 3 + 2];
00217
00218
00219
00220
00221 char *p = buffer;
00222 int *posOut = originalPos;
00223 const UChar *d = s.data();
00224 for (int i = 0; i != length; ++i) {
00225 unsigned short c = d[i].unicode();
00226
00227 int sequenceLen;
00228 if (c < 0x80) {
00229 *p++ = (char)c;
00230 sequenceLen = 1;
00231 } else if (c < 0x800) {
00232 *p++ = (char)((c >> 6) | 0xC0);
00233 *p++ = (char)((c | 0x80) & 0xBF);
00234 sequenceLen = 2;
00235 } else {
00236 *p++ = (char)((c >> 12) | 0xE0);
00237 *p++ = (char)(((c >> 6) | 0x80) & 0xBF);
00238 *p++ = (char)((c | 0x80) & 0xBF);
00239 sequenceLen = 3;
00240 }
00241
00242 while (sequenceLen > 0) {
00243 *posOut = i;
00244 ++posOut;
00245 --sequenceLen;
00246 }
00247 }
00248
00249 bufferSize = p - buffer;
00250
00251 *p++ = '\0';
00252
00253
00254 *posOut = length;
00255 *(posOut+1) = length+1;
00256 }
00257
00258 void RegExp::prepareASCII (const UString& s)
00259 {
00260 originalPos = 0;
00261
00262
00263
00264
00265 CString truncated = s.cstring();
00266 buffer = new char[truncated.size() + 1];
00267 memcpy(buffer, truncated.c_str(), truncated.size());
00268 buffer[truncated.size()] = '\0';
00269 bufferSize = truncated.size();
00270 }
00271
00272 void RegExp::prepareMatch(const UString &s)
00273 {
00274 delete[] originalPos;
00275 delete[] buffer;
00276 #ifdef PCRE_CONFIG_UTF8
00277 if (utf8Support == Supported)
00278 prepareUtf8(s);
00279 else
00280 #endif
00281 prepareASCII(s);
00282
00283 #ifndef NDEBUG
00284 originalS = s;
00285 #endif
00286 }
00287
00288 void RegExp::doneMatch()
00289 {
00290 delete[] originalPos; originalPos = 0;
00291 delete[] buffer; buffer = 0;
00292 }
00293
00294 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00295 {
00296 #ifndef NDEBUG
00297 assert(s.data() == originalS.data());
00298 #endif
00299 assert(valid);
00300
00301 if (i < 0)
00302 i = 0;
00303 if (ovector)
00304 *ovector = 0L;
00305 int dummyPos;
00306 if (!pos)
00307 pos = &dummyPos;
00308 *pos = -1;
00309 if (i > s.size() || s.isNull())
00310 return UString::null;
00311
00312 #ifdef HAVE_PCREPOSIX
00313 int ovecsize = (nrSubPatterns+1)*3;
00314 if (ovector) *ovector = new int[ovecsize];
00315 if (!pcregex)
00316 return UString::null;
00317
00318 int startPos;
00319 int nextPos;
00320
00321 #ifdef PCRE_CONFIG_UTF8
00322 if (utf8Support == Supported) {
00323 startPos = i;
00324 while (originalPos[startPos] < i)
00325 ++startPos;
00326
00327 nextPos = startPos;
00328 while (originalPos[nextPos] < (i + 1))
00329 ++nextPos;
00330 } else
00331 #endif
00332 {
00333 startPos = i;
00334 nextPos = i + 1;
00335 }
00336
00337 int baseFlags =
00338 #ifdef PCRE_CONFIG_UTF8
00339 utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
00340 #endif
00341 0;
00342 if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
00343 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags,
00344 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00345 {
00346
00347 if ((flgs & Global) && m_notEmpty && ovector)
00348 {
00349
00350
00351
00352 #ifdef KJS_VERBOSE
00353 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00354 #endif
00355 m_notEmpty = 0;
00356 if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
00357 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00358 return UString::null;
00359 }
00360 else
00361 return UString::null;
00362 }
00363
00364
00365
00366 if (ovector && originalPos) {
00367 for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
00368 if ((*ovector)[c] != -1)
00369 (*ovector)[c] = originalPos[(*ovector)[c]];
00370 }
00371 }
00372
00373 if (!ovector)
00374 return UString::null;
00375 #else
00376 const uint maxMatch = 10;
00377 regmatch_t rmatch[maxMatch];
00378
00379 char *str = strdup(s.ascii());
00380 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00381 free(str);
00382 return UString::null;
00383 }
00384 free(str);
00385
00386 if (!ovector) {
00387 *pos = rmatch[0].rm_so + i;
00388 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00389 }
00390
00391
00392 nrSubPatterns = 0;
00393 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
00394 nrSubPatterns++;
00395
00396
00397 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
00398 return UString::null;
00399 }
00400
00401 if (nrSubPatterns == 0) nrSubPatterns = 1;
00402
00403 int ovecsize = (nrSubPatterns)*3;
00404 *ovector = new int[ovecsize];
00405 for (uint j = 0; j < nrSubPatterns; j++) {
00406 (*ovector)[2*j] = rmatch[j].rm_so + i;
00407 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00408 }
00409 #endif
00410
00411 *pos = (*ovector)[0];
00412 if ( *pos == (*ovector)[1] && (flgs & Global) )
00413 {
00414
00415 m_notEmpty=true;
00416 }
00417 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00418 }
00419
00420 #if 0 // unused
00421 bool RegExp::test(const UString &s, int)
00422 {
00423 #ifdef HAVE_PCREPOSIX
00424 int ovector[300];
00425 CString buffer(s.cstring());
00426
00427 if (s.isNull() ||
00428 pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
00429 0, ovector, 300) == PCRE_ERROR_NOMATCH)
00430 return false;
00431 else
00432 return true;
00433
00434 #else
00435
00436 char *str = strdup(s.ascii());
00437 int r = regexec(&preg, str, 0, 0, 0);
00438 free(str);
00439
00440 return r == 0;
00441 #endif
00442 }
00443 #endif