kjs Library API Documentation

regexp.cpp

00001 // -*- c-basic-offset: 2 -*-
00002 /*
00003  *  This file is part of the KDE libraries
00004  *  Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
00005  *
00006  *  This library is free software; you can redistribute it and/or
00007  *  modify it under the terms of the GNU Lesser General Public
00008  *  License as published by the Free Software Foundation; either
00009  *  version 2 of the License, or (at your option) any later version.
00010  *
00011  *  This library is distributed in the hope that it will be useful,
00012  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  *  Lesser General Public License for more details.
00015  *
00016  *  You should have received a copy of the GNU Lesser General Public
00017  *  License along with this library; if not, write to the Free Software
00018  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 #include "regexp.h"
00023 
00024 #include "lexer.h"
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 
00029 using namespace KJS;
00030 
00031 RegExp::RegExp(const UString &p, int f)
00032   : pat(p), flgs(f), m_notEmpty(false)
00033 {
00034   // JS regexps can contain Unicode escape sequences (\uxxxx) which
00035   // are rather uncommon elsewhere. As our regexp libs don't understand
00036   // them we do the unescaping ourselves internally.
00037   UString intern;
00038   if (p.find('\\') >= 0) {
00039     bool escape = false;
00040     for (int i = 0; i < p.size(); ++i) {
00041       UChar c = p[i];
00042       if (escape) {
00043         escape = false;
00044         // we only care about \uxxxx
00045         if (c == 'u' && i + 4 < p.size()) {
00046           int c0 = p[i+1].unicode();
00047           int c1 = p[i+2].unicode();
00048           int c2 = p[i+3].unicode();
00049           int c3 = p[i+4].unicode();
00050           if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00051               Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00052             c = Lexer::convertUnicode(c0, c1, c2, c3);
00053             intern += UString(&c, 1);
00054             i += 4;
00055             continue;
00056           }
00057         }
00058         intern += UString('\\');
00059         intern += UString(&c, 1);
00060       } else {
00061         if (c == '\\')
00062           escape = true;
00063         else
00064           intern += UString(&c, 1);
00065       }
00066     }
00067   } else {
00068     intern = p;
00069   }
00070 
00071 #ifdef HAVE_PCREPOSIX
00072   int pcreflags = 0;
00073   const char *perrormsg;
00074   int errorOffset;
00075 
00076   if (flgs & IgnoreCase)
00077     pcreflags |= PCRE_CASELESS;
00078 
00079   if (flgs & Multiline)
00080     pcreflags |= PCRE_MULTILINE;
00081 
00082   pcregex = pcre_compile(intern.ascii(), pcreflags,
00083              &perrormsg, &errorOffset, NULL);
00084 #ifndef NDEBUG
00085   if (!pcregex)
00086     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00087 #endif
00088 
00089 #ifdef PCRE_INFO_CAPTURECOUNT
00090   // Get number of subpatterns that will be returned
00091   int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00092   if (rc != 0)
00093 #endif
00094     nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
00095 
00096 #else /* HAVE_PCREPOSIX */
00097 
00098   nrSubPatterns = 0; // determined in match() with POSIX regex.
00099   int regflags = 0;
00100 #ifdef REG_EXTENDED
00101   regflags |= REG_EXTENDED;
00102 #endif
00103 #ifdef REG_ICASE
00104   if ( f & IgnoreCase )
00105     regflags |= REG_ICASE;
00106 #endif
00107 
00108   //NOTE: Multiline is not feasible with POSIX regex.
00109   //if ( f & Multiline )
00110   //    ;
00111   // Note: the Global flag is already handled by RegExpProtoFunc::execute
00112 
00113   if (regcomp(&preg, intern.ascii(), regflags) != 0) {
00114     /* TODO: throw JS exception */
00115     regcomp(&preg, "", regflags);
00116   }
00117 #endif
00118 }
00119 
00120 RegExp::~RegExp()
00121 {
00122 #ifdef HAVE_PCREPOSIX
00123   if (pcregex)
00124     pcre_free(pcregex);
00125 #else
00126   /* TODO: is this really okay after an error ? */
00127   regfree(&preg);
00128 #endif
00129 }
00130 
00131 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00132 {
00133   if (i < 0)
00134     i = 0;
00135   if (ovector)
00136     *ovector = 0L;
00137   int dummyPos;
00138   if (!pos)
00139     pos = &dummyPos;
00140   *pos = -1;
00141   if (i > s.size() || s.isNull())
00142     return UString::null;
00143 
00144 #ifdef HAVE_PCREPOSIX
00145   CString buffer(s.cstring());
00146   int bufferSize = buffer.size();
00147   int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
00148   if (ovector) *ovector = new int[ovecsize];
00149   if (!pcregex)
00150     return UString::null;
00151 
00152   if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i,
00153                 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man pcretest
00154                 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00155   {
00156     // Failed to match.
00157     if ((flgs & Global) && m_notEmpty && ovector)
00158     {
00159       // We set m_notEmpty ourselves, to look for a non-empty match
00160       // (see man pcretest or pcretest.c for details).
00161       // So we don't stop here, we want to try again at i+1.
00162 #ifndef NDEBUG
00163       fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00164 #endif
00165       m_notEmpty = 0;
00166       if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0,
00167                     ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00168         return UString::null;
00169     }
00170     else // done
00171       return UString::null;
00172   }
00173 
00174   // Got a match, proceed with it.
00175 
00176   if (!ovector)
00177     return UString::null; // don't rely on the return value if you pass ovector==0
00178 #else
00179   const uint maxMatch = 10;
00180   regmatch_t rmatch[maxMatch];
00181 
00182   char *str = strdup(s.ascii()); // TODO: why ???
00183   if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00184     free(str);
00185     return UString::null;
00186   }
00187   free(str);
00188 
00189   if (!ovector) {
00190     *pos = rmatch[0].rm_so + i;
00191     return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00192   }
00193 
00194   // map rmatch array to ovector used in PCRE case
00195   nrSubPatterns = 0;
00196   for(uint j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
00197       nrSubPatterns++;
00198   int ovecsize = (nrSubPatterns+1)*3; // see above
00199   *ovector = new int[ovecsize];
00200   for (uint j = 0; j < nrSubPatterns + 1; j++) {
00201     if (j>maxMatch)
00202       break;
00203     (*ovector)[2*j] = rmatch[j].rm_so + i;
00204     (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00205   }
00206 #endif
00207 
00208   *pos = (*ovector)[0];
00209 #ifdef HAVE_PCREPOSIX  // TODO check this stuff in non-pcre mode
00210   if ( *pos == (*ovector)[1] && (flgs & Global) )
00211   {
00212     // empty match, next try will be with m_notEmpty=true
00213     m_notEmpty=true;
00214   }
00215 #endif
00216   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00217 }
00218 
#if 0 // unused
// Reports whether the compiled pattern matches somewhere in s.
// The second (int) argument is not used.
bool RegExp::test(const UString &s, int)
{
#ifdef HAVE_PCREPOSIX
  int offsets[300];
  CString subject(s.cstring());

  // A null string never matches.
  if (s.isNull())
    return false;

  const int rc = pcre_exec(pcregex, NULL, subject.c_str(), subject.size(), 0,
                           0, offsets, 300);
  return rc != PCRE_ERROR_NOMATCH;

#else

  // regexec() is run on a private copy of the string data.
  char *copy = strdup(s.ascii());
  const bool matched = (regexec(&preg, copy, 0, 0, 0) == 0);
  free(copy);

  return matched;
#endif
}
#endif
KDE Logo
This file is part of the documentation for kjs Library Version 3.4.0.
Documentation copyright © 1996-2004 the KDE developers.
Generated on Wed May 4 07:12:50 2005 by doxygen 1.4.2 written by Dimitri van Heesch, © 1997-2003