|  | // Copyright 2003-2009 Google Inc.  All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | // This is a variant of PCRE's pcrecpp.cc, originally written at Google. | 
|  | // The main changes are the addition of the HitLimit method and | 
|  | // compilation as PCRE in namespace re2. | 
|  |  | 
|  | #include <errno.h> | 
|  | #include "util/util.h" | 
|  | #include "util/flags.h" | 
|  | #include "util/pcre.h" | 
|  |  | 
|  | #ifdef WIN32 | 
|  | #define strtoll _strtoi64 | 
|  | #define strtoull _strtoui64 | 
|  | #endif | 
|  |  | 
|  | #define PCREPORT(level) LOG(level) | 
|  |  | 
|  | // Default PCRE limits. | 
|  | // Defaults chosen to allow a plausible amount of CPU and | 
|  | // not exceed main thread stacks.  Note that other threads | 
|  | // often have smaller stacks, and therefore tightening | 
|  | // regexp_stack_limit may frequently be necessary. | 
|  | DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); | 
|  | DEFINE_int32(regexp_match_limit, 1000000, | 
|  | "default PCRE match limit (function calls)"); | 
|  |  | 
|  | namespace re2 { | 
|  |  | 
|  | // Maximum number of args we can set | 
|  | static const int kMaxArgs = 16; | 
|  | static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace | 
|  |  | 
|  | // Approximate size of a recursive invocation of PCRE's | 
|  | // internal "match()" frame.  This varies depending on the | 
|  | // compiler and architecture, of course, so the constant is | 
|  | // just a conservative estimate.  To find the exact number, | 
|  | // run regexp_unittest with --regexp_stack_limit=0 under | 
|  | // a debugger and look at the frames when it crashes. | 
|  | // The exact frame size was 656 in production on 2008/02/03. | 
|  | static const int kPCREFrameSize = 700; | 
|  |  | 
|  | // Special name for missing C++ arguments. | 
|  | PCRE::Arg PCRE::no_more_args((void*)NULL); | 
|  |  | 
|  | const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; | 
|  | const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; | 
|  | const PCRE::ConsumeFunctor PCRE::Consume = { }; | 
|  | const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; | 
|  |  | 
|  | // If a regular expression has no error, its error_ field points here | 
|  | static const string empty_string; | 
|  |  | 
|  | void PCRE::Init(const char* pattern, Option options, int match_limit, | 
|  | int stack_limit, bool report_errors) { | 
|  | pattern_ = pattern; | 
|  | options_ = options; | 
|  | match_limit_ = match_limit; | 
|  | stack_limit_ = stack_limit; | 
|  | hit_limit_ = false; | 
|  | error_ = &empty_string; | 
|  | report_errors_ = report_errors; | 
|  | re_full_ = NULL; | 
|  | re_partial_ = NULL; | 
|  |  | 
|  | if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { | 
|  | error_ = new string("illegal regexp option"); | 
|  | PCREPORT(ERROR) | 
|  | << "Error compiling '" << pattern << "': illegal regexp option"; | 
|  | } else { | 
|  | re_partial_ = Compile(UNANCHORED); | 
|  | if (re_partial_ != NULL) { | 
|  | re_full_ = Compile(ANCHOR_BOTH); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | PCRE::PCRE(const char* pattern) { | 
|  | Init(pattern, None, 0, 0, true); | 
|  | } | 
|  | PCRE::PCRE(const char* pattern, Option option) { | 
|  | Init(pattern, option, 0, 0, true); | 
|  | } | 
|  | PCRE::PCRE(const string& pattern) { | 
|  | Init(pattern.c_str(), None, 0, 0, true); | 
|  | } | 
|  | PCRE::PCRE(const string& pattern, Option option) { | 
|  | Init(pattern.c_str(), option, 0, 0, true); | 
|  | } | 
|  | PCRE::PCRE(const string& pattern, const PCRE_Options& re_option) { | 
|  | Init(pattern.c_str(), re_option.option(), re_option.match_limit(), | 
|  | re_option.stack_limit(), re_option.report_errors()); | 
|  | } | 
|  |  | 
|  | PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { | 
|  | Init(pattern, re_option.option(), re_option.match_limit(), | 
|  | re_option.stack_limit(), re_option.report_errors()); | 
|  | } | 
|  |  | 
|  | PCRE::~PCRE() { | 
|  | if (re_full_ != NULL)         pcre_free(re_full_); | 
|  | if (re_partial_ != NULL)      pcre_free(re_partial_); | 
|  | if (error_ != &empty_string)  delete error_; | 
|  | } | 
|  |  | 
|  | pcre* PCRE::Compile(Anchor anchor) { | 
|  | // Special treatment for anchoring.  This is needed because at | 
|  | // runtime pcre only provides an option for anchoring at the | 
|  | // beginning of a string. | 
|  | // | 
|  | // There are three types of anchoring we want: | 
|  | //    UNANCHORED      Compile the original pattern, and use | 
|  | //                    a pcre unanchored match. | 
|  | //    ANCHOR_START    Compile the original pattern, and use | 
|  | //                    a pcre anchored match. | 
|  | //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern | 
|  | //                    and use a pcre anchored match. | 
|  |  | 
|  | const char* error; | 
|  | int eoffset; | 
|  | pcre* re; | 
|  | if (anchor != ANCHOR_BOTH) { | 
|  | re = pcre_compile(pattern_.c_str(), | 
|  | (options_ & EnabledCompileOptions), | 
|  | &error, &eoffset, NULL); | 
|  | } else { | 
|  | // Tack a '\z' at the end of PCRE.  Parenthesize it first so that | 
|  | // the '\z' applies to all top-level alternatives in the regexp. | 
|  | string wrapped = "(?:";  // A non-counting grouping operator | 
|  | wrapped += pattern_; | 
|  | wrapped += ")\\z"; | 
|  | re = pcre_compile(wrapped.c_str(), | 
|  | (options_ & EnabledCompileOptions), | 
|  | &error, &eoffset, NULL); | 
|  | } | 
|  | if (re == NULL) { | 
|  | if (error_ == &empty_string) error_ = new string(error); | 
|  | PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; | 
|  | } | 
|  | return re; | 
|  | } | 
|  |  | 
|  | /***** Convenience interfaces *****/ | 
|  |  | 
|  | bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, | 
|  | const PCRE& re, | 
|  | const Arg& a0, | 
|  | const Arg& a1, | 
|  | const Arg& a2, | 
|  | const Arg& a3, | 
|  | const Arg& a4, | 
|  | const Arg& a5, | 
|  | const Arg& a6, | 
|  | const Arg& a7, | 
|  | const Arg& a8, | 
|  | const Arg& a9, | 
|  | const Arg& a10, | 
|  | const Arg& a11, | 
|  | const Arg& a12, | 
|  | const Arg& a13, | 
|  | const Arg& a14, | 
|  | const Arg& a15) const { | 
|  | const Arg* args[kMaxArgs]; | 
|  | int n = 0; | 
|  | if (&a0 == &no_more_args)  goto done; args[n++] = &a0; | 
|  | if (&a1 == &no_more_args)  goto done; args[n++] = &a1; | 
|  | if (&a2 == &no_more_args)  goto done; args[n++] = &a2; | 
|  | if (&a3 == &no_more_args)  goto done; args[n++] = &a3; | 
|  | if (&a4 == &no_more_args)  goto done; args[n++] = &a4; | 
|  | if (&a5 == &no_more_args)  goto done; args[n++] = &a5; | 
|  | if (&a6 == &no_more_args)  goto done; args[n++] = &a6; | 
|  | if (&a7 == &no_more_args)  goto done; args[n++] = &a7; | 
|  | if (&a8 == &no_more_args)  goto done; args[n++] = &a8; | 
|  | if (&a9 == &no_more_args)  goto done; args[n++] = &a9; | 
|  | if (&a10 == &no_more_args) goto done; args[n++] = &a10; | 
|  | if (&a11 == &no_more_args) goto done; args[n++] = &a11; | 
|  | if (&a12 == &no_more_args) goto done; args[n++] = &a12; | 
|  | if (&a13 == &no_more_args) goto done; args[n++] = &a13; | 
|  | if (&a14 == &no_more_args) goto done; args[n++] = &a14; | 
|  | if (&a15 == &no_more_args) goto done; args[n++] = &a15; | 
|  | done: | 
|  |  | 
|  | int consumed; | 
|  | int vec[kVecSize]; | 
|  | return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); | 
|  | } | 
|  |  | 
|  | bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, | 
|  | const PCRE& re, | 
|  | const Arg& a0, | 
|  | const Arg& a1, | 
|  | const Arg& a2, | 
|  | const Arg& a3, | 
|  | const Arg& a4, | 
|  | const Arg& a5, | 
|  | const Arg& a6, | 
|  | const Arg& a7, | 
|  | const Arg& a8, | 
|  | const Arg& a9, | 
|  | const Arg& a10, | 
|  | const Arg& a11, | 
|  | const Arg& a12, | 
|  | const Arg& a13, | 
|  | const Arg& a14, | 
|  | const Arg& a15) const { | 
|  | const Arg* args[kMaxArgs]; | 
|  | int n = 0; | 
|  | if (&a0 == &no_more_args)  goto done; args[n++] = &a0; | 
|  | if (&a1 == &no_more_args)  goto done; args[n++] = &a1; | 
|  | if (&a2 == &no_more_args)  goto done; args[n++] = &a2; | 
|  | if (&a3 == &no_more_args)  goto done; args[n++] = &a3; | 
|  | if (&a4 == &no_more_args)  goto done; args[n++] = &a4; | 
|  | if (&a5 == &no_more_args)  goto done; args[n++] = &a5; | 
|  | if (&a6 == &no_more_args)  goto done; args[n++] = &a6; | 
|  | if (&a7 == &no_more_args)  goto done; args[n++] = &a7; | 
|  | if (&a8 == &no_more_args)  goto done; args[n++] = &a8; | 
|  | if (&a9 == &no_more_args)  goto done; args[n++] = &a9; | 
|  | if (&a10 == &no_more_args) goto done; args[n++] = &a10; | 
|  | if (&a11 == &no_more_args) goto done; args[n++] = &a11; | 
|  | if (&a12 == &no_more_args) goto done; args[n++] = &a12; | 
|  | if (&a13 == &no_more_args) goto done; args[n++] = &a13; | 
|  | if (&a14 == &no_more_args) goto done; args[n++] = &a14; | 
|  | if (&a15 == &no_more_args) goto done; args[n++] = &a15; | 
|  | done: | 
|  |  | 
|  | int consumed; | 
|  | int vec[kVecSize]; | 
|  | return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); | 
|  | } | 
|  |  | 
|  | bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, | 
|  | const PCRE& pattern, | 
|  | const Arg& a0, | 
|  | const Arg& a1, | 
|  | const Arg& a2, | 
|  | const Arg& a3, | 
|  | const Arg& a4, | 
|  | const Arg& a5, | 
|  | const Arg& a6, | 
|  | const Arg& a7, | 
|  | const Arg& a8, | 
|  | const Arg& a9, | 
|  | const Arg& a10, | 
|  | const Arg& a11, | 
|  | const Arg& a12, | 
|  | const Arg& a13, | 
|  | const Arg& a14, | 
|  | const Arg& a15) const { | 
|  | const Arg* args[kMaxArgs]; | 
|  | int n = 0; | 
|  | if (&a0 == &no_more_args)  goto done; args[n++] = &a0; | 
|  | if (&a1 == &no_more_args)  goto done; args[n++] = &a1; | 
|  | if (&a2 == &no_more_args)  goto done; args[n++] = &a2; | 
|  | if (&a3 == &no_more_args)  goto done; args[n++] = &a3; | 
|  | if (&a4 == &no_more_args)  goto done; args[n++] = &a4; | 
|  | if (&a5 == &no_more_args)  goto done; args[n++] = &a5; | 
|  | if (&a6 == &no_more_args)  goto done; args[n++] = &a6; | 
|  | if (&a7 == &no_more_args)  goto done; args[n++] = &a7; | 
|  | if (&a8 == &no_more_args)  goto done; args[n++] = &a8; | 
|  | if (&a9 == &no_more_args)  goto done; args[n++] = &a9; | 
|  | if (&a10 == &no_more_args) goto done; args[n++] = &a10; | 
|  | if (&a11 == &no_more_args) goto done; args[n++] = &a11; | 
|  | if (&a12 == &no_more_args) goto done; args[n++] = &a12; | 
|  | if (&a13 == &no_more_args) goto done; args[n++] = &a13; | 
|  | if (&a14 == &no_more_args) goto done; args[n++] = &a14; | 
|  | if (&a15 == &no_more_args) goto done; args[n++] = &a15; | 
|  | done: | 
|  |  | 
|  | int consumed; | 
|  | int vec[kVecSize]; | 
|  | if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, | 
|  | args, n, vec, kVecSize)) { | 
|  | input->remove_prefix(consumed); | 
|  | return true; | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | } | 
|  |  | 
|  | bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, | 
|  | const PCRE& pattern, | 
|  | const Arg& a0, | 
|  | const Arg& a1, | 
|  | const Arg& a2, | 
|  | const Arg& a3, | 
|  | const Arg& a4, | 
|  | const Arg& a5, | 
|  | const Arg& a6, | 
|  | const Arg& a7, | 
|  | const Arg& a8, | 
|  | const Arg& a9, | 
|  | const Arg& a10, | 
|  | const Arg& a11, | 
|  | const Arg& a12, | 
|  | const Arg& a13, | 
|  | const Arg& a14, | 
|  | const Arg& a15) const { | 
|  | const Arg* args[kMaxArgs]; | 
|  | int n = 0; | 
|  | if (&a0 == &no_more_args)  goto done; args[n++] = &a0; | 
|  | if (&a1 == &no_more_args)  goto done; args[n++] = &a1; | 
|  | if (&a2 == &no_more_args)  goto done; args[n++] = &a2; | 
|  | if (&a3 == &no_more_args)  goto done; args[n++] = &a3; | 
|  | if (&a4 == &no_more_args)  goto done; args[n++] = &a4; | 
|  | if (&a5 == &no_more_args)  goto done; args[n++] = &a5; | 
|  | if (&a6 == &no_more_args)  goto done; args[n++] = &a6; | 
|  | if (&a7 == &no_more_args)  goto done; args[n++] = &a7; | 
|  | if (&a8 == &no_more_args)  goto done; args[n++] = &a8; | 
|  | if (&a9 == &no_more_args)  goto done; args[n++] = &a9; | 
|  | if (&a10 == &no_more_args) goto done; args[n++] = &a10; | 
|  | if (&a11 == &no_more_args) goto done; args[n++] = &a11; | 
|  | if (&a12 == &no_more_args) goto done; args[n++] = &a12; | 
|  | if (&a13 == &no_more_args) goto done; args[n++] = &a13; | 
|  | if (&a14 == &no_more_args) goto done; args[n++] = &a14; | 
|  | if (&a15 == &no_more_args) goto done; args[n++] = &a15; | 
|  | done: | 
|  |  | 
|  | int consumed; | 
|  | int vec[kVecSize]; | 
|  | if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, | 
|  | args, n, vec, kVecSize)) { | 
|  | input->remove_prefix(consumed); | 
|  | return true; | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | } | 
|  |  | 
|  | bool PCRE::Replace(string *str, | 
|  | const PCRE& pattern, | 
|  | const StringPiece& rewrite) { | 
|  | int vec[kVecSize]; | 
|  | int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); | 
|  | if (matches == 0) | 
|  | return false; | 
|  |  | 
|  | string s; | 
|  | if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) | 
|  | return false; | 
|  |  | 
|  | assert(vec[0] >= 0); | 
|  | assert(vec[1] >= 0); | 
|  | str->replace(vec[0], vec[1] - vec[0], s); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | int PCRE::GlobalReplace(string *str, | 
|  | const PCRE& pattern, | 
|  | const StringPiece& rewrite) { | 
|  | int count = 0; | 
|  | int vec[kVecSize]; | 
|  | string out; | 
|  | int start = 0; | 
|  | bool last_match_was_empty_string = false; | 
|  |  | 
|  | for (; start <= str->length();) { | 
|  | // If the previous match was for the empty string, we shouldn't | 
|  | // just match again: we'll match in the same way and get an | 
|  | // infinite loop.  Instead, we do the match in a special way: | 
|  | // anchored -- to force another try at the same position -- | 
|  | // and with a flag saying that this time, ignore empty matches. | 
|  | // If this special match returns, that means there's a non-empty | 
|  | // match at this position as well, and we can continue.  If not, | 
|  | // we do what perl does, and just advance by one. | 
|  | // Notice that perl prints '@@@' for this; | 
|  | //    perl -le '$_ = "aa"; s/b*|aa/@/g; print' | 
|  | int matches; | 
|  | if (last_match_was_empty_string) { | 
|  | matches = pattern.TryMatch(*str, start, ANCHOR_START, false, | 
|  | vec, kVecSize); | 
|  | if (matches <= 0) { | 
|  | if (start < str->length()) | 
|  | out.push_back((*str)[start]); | 
|  | start++; | 
|  | last_match_was_empty_string = false; | 
|  | continue; | 
|  | } | 
|  | } else { | 
|  | matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); | 
|  | if (matches <= 0) | 
|  | break; | 
|  | } | 
|  | int matchstart = vec[0], matchend = vec[1]; | 
|  | assert(matchstart >= start); | 
|  | assert(matchend >= matchstart); | 
|  |  | 
|  | out.append(*str, start, matchstart - start); | 
|  | pattern.Rewrite(&out, rewrite, *str, vec, matches); | 
|  | start = matchend; | 
|  | count++; | 
|  | last_match_was_empty_string = (matchstart == matchend); | 
|  | } | 
|  |  | 
|  | if (count == 0) | 
|  | return 0; | 
|  |  | 
|  | if (start < str->length()) | 
|  | out.append(*str, start, str->length() - start); | 
|  | swap(out, *str); | 
|  | return count; | 
|  | } | 
|  |  | 
|  | bool PCRE::Extract(const StringPiece &text, | 
|  | const PCRE& pattern, | 
|  | const StringPiece &rewrite, | 
|  | string *out) { | 
|  | int vec[kVecSize]; | 
|  | int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); | 
|  | if (matches == 0) | 
|  | return false; | 
|  | out->clear(); | 
|  | return pattern.Rewrite(out, rewrite, text, vec, matches); | 
|  | } | 
|  |  | 
|  | string PCRE::QuoteMeta(const StringPiece& unquoted) { | 
|  | string result; | 
|  | result.reserve(unquoted.size() << 1); | 
|  |  | 
|  | // Escape any ascii character not in [A-Za-z_0-9]. | 
|  | // | 
|  | // Note that it's legal to escape a character even if it has no | 
|  | // special meaning in a regular expression -- so this function does | 
|  | // that.  (This also makes it identical to the perl function of the | 
|  | // same name except for the null-character special case; | 
|  | // see `perldoc -f quotemeta`.) | 
|  | for (int ii = 0; ii < unquoted.length(); ++ii) { | 
|  | // Note that using 'isalnum' here raises the benchmark time from | 
|  | // 32ns to 58ns: | 
|  | if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && | 
|  | (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && | 
|  | (unquoted[ii] < '0' || unquoted[ii] > '9') && | 
|  | unquoted[ii] != '_' && | 
|  | // If this is the part of a UTF8 or Latin1 character, we need | 
|  | // to copy this byte without escaping.  Experimentally this is | 
|  | // what works correctly with the regexp library. | 
|  | !(unquoted[ii] & 128)) { | 
|  | if (unquoted[ii] == '\0') {  // Special handling for null chars. | 
|  | // Can't use "\\0" since the next character might be a digit. | 
|  | result += "\\x00"; | 
|  | continue; | 
|  | } | 
|  | result += '\\'; | 
|  | } | 
|  | result += unquoted[ii]; | 
|  | } | 
|  |  | 
|  | return result; | 
|  | } | 
|  |  | 
|  | /***** Actual matching and rewriting code *****/ | 
|  |  | 
|  | bool PCRE::HitLimit() { | 
|  | return hit_limit_; | 
|  | } | 
|  |  | 
|  | void PCRE::ClearHitLimit() { | 
|  | hit_limit_ = 0; | 
|  | } | 
|  |  | 
|  | int PCRE::TryMatch(const StringPiece& text, | 
|  | int startpos, | 
|  | Anchor anchor, | 
|  | bool empty_ok, | 
|  | int *vec, | 
|  | int vecsize) const { | 
|  | pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; | 
|  | if (re == NULL) { | 
|  | PCREPORT(ERROR) << "Matching against invalid re: " << *error_; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | int match_limit = match_limit_; | 
|  | if (match_limit <= 0) { | 
|  | match_limit = FLAGS_regexp_match_limit; | 
|  | } | 
|  |  | 
|  | int stack_limit = stack_limit_; | 
|  | if (stack_limit <= 0) { | 
|  | stack_limit = FLAGS_regexp_stack_limit; | 
|  | } | 
|  |  | 
|  | pcre_extra extra = { 0 }; | 
|  | if (match_limit > 0) { | 
|  | extra.flags |= PCRE_EXTRA_MATCH_LIMIT; | 
|  | extra.match_limit = match_limit; | 
|  | } | 
|  | if (stack_limit > 0) { | 
|  | extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; | 
|  | extra.match_limit_recursion = stack_limit / kPCREFrameSize; | 
|  | } | 
|  |  | 
|  | int options = 0; | 
|  | if (anchor != UNANCHORED) | 
|  | options |= PCRE_ANCHORED; | 
|  | if (!empty_ok) | 
|  | options |= PCRE_NOTEMPTY; | 
|  |  | 
|  | int rc = pcre_exec(re,              // The regular expression object | 
|  | &extra, | 
|  | (text.data() == NULL) ? "" : text.data(), | 
|  | text.size(), | 
|  | startpos, | 
|  | options, | 
|  | vec, | 
|  | vecsize); | 
|  |  | 
|  | // Handle errors | 
|  | if (rc == 0) { | 
|  | // pcre_exec() returns 0 as a special case when the number of | 
|  | // capturing subpatterns exceeds the size of the vector. | 
|  | // When this happens, there is a match and the output vector | 
|  | // is filled, but we miss out on the positions of the extra subpatterns. | 
|  | rc = vecsize / 2; | 
|  | } else if (rc < 0) { | 
|  | switch (rc) { | 
|  | case PCRE_ERROR_NOMATCH: | 
|  | return 0; | 
|  | case PCRE_ERROR_MATCHLIMIT: | 
|  | // Writing to hit_limit is not safe if multiple threads | 
|  | // are using the PCRE, but the flag is only intended | 
|  | // for use by unit tests anyway, so we let it go. | 
|  | hit_limit_ = true; | 
|  | PCREPORT(WARNING) << "Exceeded match limit of " << match_limit | 
|  | << " when matching '" << pattern_ << "'" | 
|  | << " against text that is " << text.size() << " bytes."; | 
|  | return 0; | 
|  | case PCRE_ERROR_RECURSIONLIMIT: | 
|  | // See comment about hit_limit above. | 
|  | hit_limit_ = true; | 
|  | PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit | 
|  | << " when matching '" << pattern_ << "'" | 
|  | << " against text that is " << text.size() << " bytes."; | 
|  | return 0; | 
|  | default: | 
|  | // There are other return codes from pcre.h : | 
|  | // PCRE_ERROR_NULL           (-2) | 
|  | // PCRE_ERROR_BADOPTION      (-3) | 
|  | // PCRE_ERROR_BADMAGIC       (-4) | 
|  | // PCRE_ERROR_UNKNOWN_NODE   (-5) | 
|  | // PCRE_ERROR_NOMEMORY       (-6) | 
|  | // PCRE_ERROR_NOSUBSTRING    (-7) | 
|  | // ... | 
|  | PCREPORT(ERROR) << "Unexpected return code: " << rc | 
|  | << " when matching '" << pattern_ << "'" | 
|  | << ", re=" << re | 
|  | << ", text=" << text | 
|  | << ", vec=" << vec | 
|  | << ", vecsize=" << vecsize; | 
|  | return 0; | 
|  | } | 
|  | } | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | bool PCRE::DoMatchImpl(const StringPiece& text, | 
|  | Anchor anchor, | 
|  | int* consumed, | 
|  | const Arg* const* args, | 
|  | int n, | 
|  | int* vec, | 
|  | int vecsize) const { | 
|  | assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace | 
|  | int matches = TryMatch(text, 0, anchor, true, vec, vecsize); | 
|  | assert(matches >= 0);  // TryMatch never returns negatives | 
|  | if (matches == 0) | 
|  | return false; | 
|  |  | 
|  | *consumed = vec[1]; | 
|  |  | 
|  | if (n == 0 || args == NULL) { | 
|  | // We are not interested in results | 
|  | return true; | 
|  | } | 
|  | if (NumberOfCapturingGroups() < n) { | 
|  | // PCRE has fewer capturing groups than number of arg pointers passed in | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // If we got here, we must have matched the whole pattern. | 
|  | // We do not need (can not do) any more checks on the value of 'matches' here | 
|  | // -- see the comment for TryMatch. | 
|  | for (int i = 0; i < n; i++) { | 
|  | const int start = vec[2*(i+1)]; | 
|  | const int limit = vec[2*(i+1)+1]; | 
|  | if (!args[i]->Parse(text.data() + start, limit-start)) { | 
|  | // TODO: Should we indicate what the error was? | 
|  | return false; | 
|  | } | 
|  | } | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::DoMatch(const StringPiece& text, | 
|  | Anchor anchor, | 
|  | int* consumed, | 
|  | const Arg* const args[], | 
|  | int n) const { | 
|  | assert(n >= 0); | 
|  | size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace | 
|  | // (as for kVecSize) | 
|  | int *vec = new int[vecsize]; | 
|  | bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); | 
|  | delete[] vec; | 
|  | return b; | 
|  | } | 
|  |  | 
|  | bool PCRE::Rewrite(string *out, const StringPiece &rewrite, | 
|  | const StringPiece &text, int *vec, int veclen) const { | 
|  | int number_of_capturing_groups = NumberOfCapturingGroups(); | 
|  | for (const char *s = rewrite.data(), *end = s + rewrite.size(); | 
|  | s < end; s++) { | 
|  | int c = *s; | 
|  | if (c == '\\') { | 
|  | c = *++s; | 
|  | if (isdigit(c)) { | 
|  | int n = (c - '0'); | 
|  | if (n >= veclen) { | 
|  | if (n <= number_of_capturing_groups) { | 
|  | // unmatched optional capturing group. treat | 
|  | // its value as empty string; i.e., nothing to append. | 
|  | } else { | 
|  | PCREPORT(ERROR) << "requested group " << n | 
|  | << " in regexp " << rewrite.data(); | 
|  | return false; | 
|  | } | 
|  | } | 
|  | int start = vec[2 * n]; | 
|  | if (start >= 0) | 
|  | out->append(text.data() + start, vec[2 * n + 1] - start); | 
|  | } else if (c == '\\') { | 
|  | out->push_back('\\'); | 
|  | } else { | 
|  | PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); | 
|  | return false; | 
|  | } | 
|  | } else { | 
|  | out->push_back(c); | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::CheckRewriteString(const StringPiece& rewrite, string* error) const { | 
|  | int max_token = -1; | 
|  | for (const char *s = rewrite.data(), *end = s + rewrite.size(); | 
|  | s < end; s++) { | 
|  | int c = *s; | 
|  | if (c != '\\') { | 
|  | continue; | 
|  | } | 
|  | if (++s == end) { | 
|  | *error = "Rewrite schema error: '\\' not allowed at end."; | 
|  | return false; | 
|  | } | 
|  | c = *s; | 
|  | if (c == '\\') { | 
|  | continue; | 
|  | } | 
|  | if (!isdigit(c)) { | 
|  | *error = "Rewrite schema error: " | 
|  | "'\\' must be followed by a digit or '\\'."; | 
|  | return false; | 
|  | } | 
|  | int n = (c - '0'); | 
|  | if (max_token < n) { | 
|  | max_token = n; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (max_token > NumberOfCapturingGroups()) { | 
|  | SStringPrintf(error, "Rewrite schema requests %d matches, " | 
|  | "but the regexp only has %d parenthesized subexpressions.", | 
|  | max_token, NumberOfCapturingGroups()); | 
|  | return false; | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  |  | 
|  | // Return the number of capturing subpatterns, or -1 if the | 
|  | // regexp wasn't valid on construction. | 
|  | int PCRE::NumberOfCapturingGroups() const { | 
|  | if (re_partial_ == NULL) return -1; | 
|  |  | 
|  | int result; | 
|  | CHECK(pcre_fullinfo(re_partial_,       // The regular expression object | 
|  | NULL,              // We did not study the pattern | 
|  | PCRE_INFO_CAPTURECOUNT, | 
|  | &result) == 0); | 
|  | return result; | 
|  | } | 
|  |  | 
|  |  | 
|  | /***** Parsers for various types *****/ | 
|  |  | 
|  | bool PCRE::Arg::parse_null(const char* str, int n, void* dest) { | 
|  | // We fail if somebody asked us to store into a non-NULL void* pointer | 
|  | return (dest == NULL); | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_string(const char* str, int n, void* dest) { | 
|  | if (dest == NULL) return true; | 
|  | reinterpret_cast<string*>(dest)->assign(str, n); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) { | 
|  | if (dest == NULL) return true; | 
|  | reinterpret_cast<StringPiece*>(dest)->set(str, n); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_char(const char* str, int n, void* dest) { | 
|  | if (n != 1) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<char*>(dest)) = str[0]; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) { | 
|  | if (n != 1) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<unsigned char*>(dest)) = str[0]; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | // Largest number spec that we are willing to parse | 
|  | static const int kMaxNumberLength = 32; | 
|  |  | 
|  | // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 | 
|  | // PCREQUIPCRES "n > 0" | 
|  | // Copies "str" into "buf" and null-terminates if necessary. | 
|  | // Returns one of: | 
|  | //      a. "str" if no termination is needed | 
|  | //      b. "buf" if the string was copied and null-terminated | 
|  | //      c. "" if the input was invalid and has no hope of being parsed | 
|  | static const char* TerminateNumber(char* buf, const char* str, int n) { | 
|  | if ((n > 0) && isspace(*str)) { | 
|  | // We are less forgiving than the strtoxxx() routines and do not | 
|  | // allow leading spaces. | 
|  | return ""; | 
|  | } | 
|  |  | 
|  | // See if the character right after the input text may potentially | 
|  | // look like a digit. | 
|  | if (isdigit(str[n]) || | 
|  | ((str[n] >= 'a') && (str[n] <= 'f')) || | 
|  | ((str[n] >= 'A') && (str[n] <= 'F'))) { | 
|  | if (n > kMaxNumberLength) return ""; // Input too big to be a valid number | 
|  | memcpy(buf, str, n); | 
|  | buf[n] = '\0'; | 
|  | return buf; | 
|  | } else { | 
|  | // We can parse right out of the supplied string, so return it. | 
|  | return str; | 
|  | } | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_long_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | if (n == 0) return false; | 
|  | char buf[kMaxNumberLength+1]; | 
|  | str = TerminateNumber(buf, str, n); | 
|  | char* end; | 
|  | errno = 0; | 
|  | long r = strtol(str, &end, radix); | 
|  | if (end != str + n) return false;   // Leftover junk | 
|  | if (errno) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<long*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_ulong_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | if (n == 0) return false; | 
|  | char buf[kMaxNumberLength+1]; | 
|  | str = TerminateNumber(buf, str, n); | 
|  | if (str[0] == '-') { | 
|  | // strtoul() will silently accept negative numbers and parse | 
|  | // them.  This module is more strict and treats them as errors. | 
|  | return false; | 
|  | } | 
|  |  | 
|  | char* end; | 
|  | errno = 0; | 
|  | unsigned long r = strtoul(str, &end, radix); | 
|  | if (end != str + n) return false;   // Leftover junk | 
|  | if (errno) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<unsigned long*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_short_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | long r; | 
|  | if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | 
|  | if ((short)r != r) return false;       // Out of range | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<short*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_ushort_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | unsigned long r; | 
|  | if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | 
|  | if ((ushort)r != r) return false;                      // Out of range | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<unsigned short*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_int_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | long r; | 
|  | if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse | 
|  | if ((int)r != r) return false;         // Out of range | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<int*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_uint_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | unsigned long r; | 
|  | if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse | 
|  | if ((uint)r != r) return false;                       // Out of range | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<unsigned int*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_longlong_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | if (n == 0) return false; | 
|  | char buf[kMaxNumberLength+1]; | 
|  | str = TerminateNumber(buf, str, n); | 
|  | char* end; | 
|  | errno = 0; | 
|  | int64 r = strtoll(str, &end, radix); | 
|  | if (end != str + n) return false;   // Leftover junk | 
|  | if (errno) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<int64*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_ulonglong_radix(const char* str, | 
|  | int n, | 
|  | void* dest, | 
|  | int radix) { | 
|  | if (n == 0) return false; | 
|  | char buf[kMaxNumberLength+1]; | 
|  | str = TerminateNumber(buf, str, n); | 
|  | if (str[0] == '-') { | 
|  | // strtoull() will silently accept negative numbers and parse | 
|  | // them.  This module is more strict and treats them as errors. | 
|  | return false; | 
|  | } | 
|  | char* end; | 
|  | errno = 0; | 
|  | uint64 r = strtoull(str, &end, radix); | 
|  | if (end != str + n) return false;   // Leftover junk | 
|  | if (errno) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<uint64*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { | 
|  | if (n == 0) return false; | 
|  | static const int kMaxLength = 200; | 
|  | char buf[kMaxLength]; | 
|  | if (n >= kMaxLength) return false; | 
|  | memcpy(buf, str, n); | 
|  | buf[n] = '\0'; | 
|  | errno = 0; | 
|  | char* end; | 
|  | double r = strtod(buf, &end); | 
|  | if (end != buf + n) { | 
|  | #ifdef COMPILER_MSVC | 
|  | // Microsoft's strtod() doesn't handle inf and nan, so we have to | 
|  | // handle it explicitly.  Speed is not important here because this | 
|  | // code is only called in unit tests. | 
|  | bool pos = true; | 
|  | const char* i = buf; | 
|  | if ('-' == *i) { | 
|  | pos = false; | 
|  | ++i; | 
|  | } else if ('+' == *i) { | 
|  | ++i; | 
|  | } | 
|  | if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) { | 
|  | r = numeric_limits<double>::infinity(); | 
|  | if (!pos) | 
|  | r = -r; | 
|  | } else if (0 == stricmp(i, "nan")) { | 
|  | r = numeric_limits<double>::quiet_NaN(); | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | #else | 
|  | return false;   // Leftover junk | 
|  | #endif | 
|  | } | 
|  | if (errno) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<double*>(dest)) = r; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool PCRE::Arg::parse_float(const char* str, int n, void* dest) { | 
|  | double r; | 
|  | if (!parse_double(str, n, &r)) return false; | 
|  | if (dest == NULL) return true; | 
|  | *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); | 
|  | return true; | 
|  | } | 
|  |  | 
|  |  | 
|  | #define DEFINE_INTEGER_PARSERS(name)                                        \ | 
|  | bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) {          \ | 
|  | return parse_##name##_radix(str, n, dest, 10);                          \ | 
|  | }                                                                         \ | 
|  | bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \ | 
|  | return parse_##name##_radix(str, n, dest, 16);                          \ | 
|  | }                                                                         \ | 
|  | bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \ | 
|  | return parse_##name##_radix(str, n, dest, 8);                           \ | 
|  | }                                                                         \ | 
|  | bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ | 
|  | return parse_##name##_radix(str, n, dest, 0);                           \ | 
|  | } | 
|  |  | 
|  | DEFINE_INTEGER_PARSERS(short); | 
|  | DEFINE_INTEGER_PARSERS(ushort); | 
|  | DEFINE_INTEGER_PARSERS(int); | 
|  | DEFINE_INTEGER_PARSERS(uint); | 
|  | DEFINE_INTEGER_PARSERS(long); | 
|  | DEFINE_INTEGER_PARSERS(ulong); | 
|  | DEFINE_INTEGER_PARSERS(longlong); | 
|  | DEFINE_INTEGER_PARSERS(ulonglong); | 
|  |  | 
|  | #undef DEFINE_INTEGER_PARSERS | 
|  |  | 
|  | }  // namespace re2 |