/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id: RegularExpression.hpp 678879 2008-07-22 20:05:05Z amassari $ */ #if !defined(XERCESC_INCLUDE_GUARD_REGULAREXPRESSION_HPP) #define XERCESC_INCLUDE_GUARD_REGULAREXPRESSION_HPP // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- #include #include #include #include #include #include #include #include XERCES_CPP_NAMESPACE_BEGIN // --------------------------------------------------------------------------- // Forward Declaration // --------------------------------------------------------------------------- class RangeToken; class Match; class RegxParser; /** * The RegularExpression class represents a parsed executable regular expression. * This class is thread safe. Two similar regular expression syntaxes are * supported: * *
 * <ol>
 *   <li>The XPath 2.0 / XQuery regular expression syntax.</li>
 *   <li>The XML Schema regular expression syntax.</li>
 * </ol>
 *
 * XPath 2.0 regular expression syntax is used unless the "X" option is specified during construction.
 *
 * Options can be specified during construction to change the way that the regular expression is handled.
 * Options are specified by a string consisting of any number of the following characters:
 *
 * <table border='1'>
 * <tr><th>Character</th><th>Meaning</th></tr>
 * <tr><td valign='top'>i</td>
 *     <td valign='top'>Ignore case when matching the regular expression.</td></tr>
 * <tr><td valign='top'>m</td>
 *     <td valign='top'>Multi-line mode. The meta characters "^" and "$" will match the beginning and end of lines.</td></tr>
 * <tr><td valign='top'>s</td>
 *     <td valign='top'>Single-line mode. The meta character "." will match a newline character.</td></tr>
 * <tr><td valign='top'>x</td>
 *     <td valign='top'>Allow extended comments.</td></tr>
 * <tr><td valign='top'>F</td>
 *     <td valign='top'>Prohibit the fixed string optimization.</td></tr>
 * <tr><td valign='top'>H</td>
 *     <td valign='top'>Prohibit the head character optimization.</td></tr>
 * <tr><td valign='top'>X</td>
 *     <td valign='top'>Parse the regular expression according to the
 *     XML Schema regular expression syntax.</td></tr>
 * </table>
*/ class XMLUTIL_EXPORT RegularExpression : public XMemory { public: // ----------------------------------------------------------------------- // Public Constructors and Destructor // ----------------------------------------------------------------------- /** @name Constructors and destructor */ //@{ /** Parses the given regular expression. * * @param pattern the regular expression in the local code page * @param manager the memory manager to use */ RegularExpression ( const char* const pattern , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager ); /** Parses the given regular expression using the options specified. * * @param pattern the regular expression in the local code page * @param options the options string in the local code page * @param manager the memory manager to use */ RegularExpression ( const char* const pattern , const char* const options , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager ); /** Parses the given regular expression. * * @param pattern the regular expression * @param manager the memory manager to use */ RegularExpression ( const XMLCh* const pattern , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager ); /** Parses the given regular expression using the options specified. 
* * @param pattern the regular expression * @param options the options string * @param manager the memory manager to use */ RegularExpression ( const XMLCh* const pattern , const XMLCh* const options , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager ); virtual ~RegularExpression(); //@} // ----------------------------------------------------------------------- // Public Constants // ----------------------------------------------------------------------- static const unsigned int IGNORE_CASE; static const unsigned int SINGLE_LINE; static const unsigned int MULTIPLE_LINE; static const unsigned int EXTENDED_COMMENT; static const unsigned int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; static const unsigned int PROHIBIT_FIXED_STRING_OPTIMIZATION; static const unsigned int XMLSCHEMA_MODE; typedef enum { wordTypeIgnore = 0, wordTypeLetter = 1, wordTypeOther = 2 } wordType; // ----------------------------------------------------------------------- // Public Helper methods // ----------------------------------------------------------------------- /** @name Public helper methods */ //@{ static int getOptionValue(const XMLCh ch); static bool isSet(const int options, const int flag); //@} // ----------------------------------------------------------------------- // Matching methods // ----------------------------------------------------------------------- /** @name Matching methods */ //@{ /** Tries to match the given null terminated string against the regular expression, returning * true if successful. * * @param matchString the string to match in the local code page * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. */ bool matches(const char* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given string between the specified start and end offsets * against the regular expression, returning true if successful. 
* * @param matchString the string to match in the local code page * @param start the offset of the start of the string * @param end the offset of the end of the string * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. */ bool matches(const char* const matchString, const XMLSize_t start, const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given null terminated string against the regular expression, returning * true if successful. * * @param matchString the string to match in the local code page * @param pMatch a Match object, which will be populated with the offsets for the * regular expression match and sub-matches. * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. */ bool matches(const char* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given string between the specified start and end offsets * against the regular expression, returning true if successful. * * @param matchString the string to match in the local code page * @param start the offset of the start of the string * @param end the offset of the end of the string * @param pMatch a Match object, which will be populated with the offsets for the * regular expression match and sub-matches. * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. */ bool matches(const char* const matchString, const XMLSize_t start, const XMLSize_t end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given null terminated string against the regular expression, returning * true if successful. 
* * @param matchString the string to match * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. */ bool matches(const XMLCh* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given string between the specified start and end offsets * against the regular expression, returning true if successful. * * @param matchString the string to match * @param start the offset of the start of the string * @param end the offset of the end of the string * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. */ bool matches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given null terminated string against the regular expression, returning * true if successful. * * @param matchString the string to match * @param pMatch a Match object, which will be populated with the offsets for the * regular expression match and sub-matches. * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. */ bool matches(const XMLCh* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given string between the specified start and end offsets * against the regular expression, returning true if successful. * * @param matchString the string to match * @param start the offset of the start of the string * @param end the offset of the end of the string * @param pMatch a Match object, which will be populated with the offsets for the * regular expression match and sub-matches. * @param manager the memory manager to use * * @return Whether the string matched the regular expression or not. 
*/ bool matches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tries to match the given string between the specified start and end offsets * against the regular expression. The subEx vector is populated with the details * for every non-overlapping occurrence of a match in the string. * * @param matchString the string to match * @param start the offset of the start of the string * @param end the offset of the end of the string * @param subEx a RefVectorOf Match objects, populated with the offsets for the * regular expression match and sub-matches. * @param manager the memory manager to use */ void allMatches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, RefVectorOf *subEx, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; //@} // ----------------------------------------------------------------------- // Tokenize methods // ----------------------------------------------------------------------- // Note: The caller owns the string vector that is returned, and is responsible // for deleting it. /** @name Tokenize methods */ //@{ /** Tokenizes the null terminated string according to the regular expression, returning * the parts of the string that do not match the regular expression. * * @param matchString the string to match in the local code page * @param manager the memory manager to use * * @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the * given MemoryManager. The caller owns the string vector that is returned, and is responsible for * deleting it. 
*/ RefArrayVectorOf *tokenize(const char* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tokenizes the string between the specified start and end offsets according to the regular * expression, returning the parts of the string that do not match the regular expression. * * @param matchString the string to match in the local code page * @param start the offset of the start of the string * @param end the offset of the end of the string * @param manager the memory manager to use * * @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the * given MemoryManager. The caller owns the string vector that is returned, and is responsible for * deleting it. */ RefArrayVectorOf *tokenize(const char* const matchString, const XMLSize_t start, const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tokenizes the null terminated string according to the regular expression, returning * the parts of the string that do not match the regular expression. * * @param matchString the string to match * @param manager the memory manager to use * * @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the * given MemoryManager. The caller owns the string vector that is returned, and is responsible for * deleting it. */ RefArrayVectorOf *tokenize(const XMLCh* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Tokenizes the string between the specified start and end offsets according to the regular * expression, returning the parts of the string that do not match the regular expression. 
* * @param matchString the string to match * @param start the offset of the start of the string * @param end the offset of the end of the string * @param manager the memory manager to use * * @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the * given MemoryManager. The caller owns the string vector that is returned, and is responsible for * deleting it. */ RefArrayVectorOf *tokenize(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; //@} // ----------------------------------------------------------------------- // Replace methods // ----------------------------------------------------------------------- // Note: The caller owns the XMLCh* that is returned, and is responsible for // deleting it. /** @name Replace methods */ //@{ /** Performs a search and replace on the given null terminated string, replacing * any substring that matches the regular expression with a string derived from * the replacement string. * * @param matchString the string to match in the local code page * @param replaceString the string to replace in the local code page * @param manager the memory manager to use * * @return The resulting string allocated using the given MemoryManager. The caller owns the string * that is returned, and is responsible for deleting it. */ XMLCh *replace(const char* const matchString, const char* const replaceString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Performs a search and replace on the given string between the specified start and end offsets, replacing * any substring that matches the regular expression with a string derived from * the replacement string. 
* * @param matchString the string to match in the local code page * @param replaceString the string to replace in the local code page * @param start the offset of the start of the string * @param end the offset of the end of the string * @param manager the memory manager to use * * @return The resulting string allocated using the given MemoryManager. The caller owns the string * that is returned, and is responsible for deleting it. */ XMLCh *replace(const char* const matchString, const char* const replaceString, const XMLSize_t start, const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Performs a search and replace on the given null terminated string, replacing * any substring that matches the regular expression with a string derived from * the replacement string. * * @param matchString the string to match * @param replaceString the string to replace * @param manager the memory manager to use * * @return The resulting string allocated using the given MemoryManager. The caller owns the string * that is returned, and is responsible for deleting it. */ XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; /** Performs a search and replace on the given string between the specified start and end offsets, replacing * any substring that matches the regular expression with a string derived from * the replacement string. * * @param matchString the string to match * @param replaceString the string to replace * @param start the offset of the start of the string * @param end the offset of the end of the string * @param manager the memory manager to use * * @return The resulting string allocated using the given MemoryManager. The caller owns the string * that is returned, and is responsible for deleting it. 
*/ XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString, const XMLSize_t start, const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; //@} // ----------------------------------------------------------------------- // Static initialize and cleanup methods // ----------------------------------------------------------------------- /** @name Static initilize and cleanup methods */ //@{ static void staticInitialize(MemoryManager* memoryManager); static void staticCleanup(); //@} protected: virtual RegxParser* getRegexParser(const int options, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); // ----------------------------------------------------------------------- // Cleanup methods // ----------------------------------------------------------------------- void cleanUp(); // ----------------------------------------------------------------------- // Setter methods // ----------------------------------------------------------------------- void setPattern(const XMLCh* const pattern, const XMLCh* const options=0); // ----------------------------------------------------------------------- // Protected data types // ----------------------------------------------------------------------- class XMLUTIL_EXPORT Context : public XMemory { public : Context(MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); Context(Context* src); ~Context(); Context& operator= (const Context& other); inline const XMLCh* getString() const { return fString; } void reset(const XMLCh* const string, const XMLSize_t stringLen, const XMLSize_t start, const XMLSize_t limit, const int noClosures, const unsigned int options); bool nextCh(XMLInt32& ch, XMLSize_t& offset); bool fAdoptMatch; XMLSize_t fStart; XMLSize_t fLimit; XMLSize_t fLength; // fLimit - fStart int fSize; XMLSize_t fStringMaxLen; int* fOffsets; Match* fMatch; const XMLCh* fString; unsigned int fOptions; MemoryManager* fMemoryManager; }; // 
----------------------------------------------------------------------- // Unimplemented constructors and operators // ----------------------------------------------------------------------- RegularExpression(const RegularExpression&); RegularExpression& operator=(const RegularExpression&); // ----------------------------------------------------------------------- // Protected Helper methods // ----------------------------------------------------------------------- void prepare(); int parseOptions(const XMLCh* const options); /** * Matching helpers */ int match(Context* const context, const Op* const operations, XMLSize_t offset) const; bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2) const; /** * Helper methods used by match(Context* ...) */ bool matchChar(Context* const context, const XMLInt32 ch, XMLSize_t& offset, const bool ignoreCase) const; bool matchDot(Context* const context, XMLSize_t& offset) const; bool matchRange(Context* const context, const Op* const op, XMLSize_t& offset, const bool ignoreCase) const; bool matchAnchor(Context* const context, const XMLInt32 ch, const XMLSize_t offset) const; bool matchBackReference(Context* const context, const XMLInt32 ch, XMLSize_t& offset, const bool ignoreCase) const; bool matchString(Context* const context, const XMLCh* const literal, XMLSize_t& offset, const bool ignoreCase) const; int matchUnion(Context* const context, const Op* const op, XMLSize_t offset) const; int matchCapture(Context* const context, const Op* const op, XMLSize_t offset) const; /** * Replace helpers */ void subInExp(const XMLCh* const repString, const XMLCh* const origString, const Match* subEx, XMLBuffer &result, MemoryManager* const manager) const; /** * Converts a token tree into an operation tree */ void compile(const Token* const token); Op* compile(const Token* const token, Op* const next, const bool reverse); /** * Helper methods used by compile */ Op* compileSingle(const Token* const token, Op* const next, const 
Token::tokType tkType); Op* compileUnion(const Token* const token, Op* const next, const bool reverse); Op* compileParenthesis(const Token* const token, Op* const next, const bool reverse); Op* compileConcat(const Token* const token, Op* const next, const bool reverse); Op* compileClosure(const Token* const token, Op* const next, const bool reverse, const Token::tokType tkType); bool doTokenOverlap(const Op* op, Token* token); // ----------------------------------------------------------------------- // Protected data members // ----------------------------------------------------------------------- bool fHasBackReferences; bool fFixedStringOnly; int fNoGroups; XMLSize_t fMinLength; unsigned int fNoClosures; unsigned int fOptions; const BMPattern* fBMPattern; XMLCh* fPattern; XMLCh* fFixedString; const Op* fOperations; Token* fTokenTree; RangeToken* fFirstChar; static RangeToken* fWordRange; OpFactory fOpFactory; TokenFactory* fTokenFactory; MemoryManager* fMemoryManager; }; // ----------------------------------------------------------------------- // RegularExpression: Static initialize and cleanup methods // ----------------------------------------------------------------------- inline void RegularExpression::staticCleanup() { fWordRange = 0; } // --------------------------------------------------------------------------- // RegularExpression: Cleanup methods // --------------------------------------------------------------------------- inline void RegularExpression::cleanUp() { fMemoryManager->deallocate(fPattern);//delete [] fPattern; fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; delete fBMPattern; delete fTokenFactory; } // --------------------------------------------------------------------------- // RegularExpression: Helper methods // --------------------------------------------------------------------------- inline bool RegularExpression::isSet(const int options, const int flag) { return (options & flag) == flag; } inline Op* 
RegularExpression::compileSingle(const Token* const token, Op* const next, const Token::tokType tkType) { Op* ret = 0; switch (tkType) { case Token::T_DOT: ret = fOpFactory.createDotOp(); break; case Token::T_CHAR: ret = fOpFactory.createCharOp(token->getChar()); break; case Token::T_ANCHOR: ret = fOpFactory.createAnchorOp(token->getChar()); break; case Token::T_RANGE: case Token::T_NRANGE: ret = fOpFactory.createRangeOp(token); break; case Token::T_EMPTY: ret = next; break; case Token::T_STRING: ret = fOpFactory.createStringOp(token->getString()); break; case Token::T_BACKREFERENCE: ret = fOpFactory.createBackReferenceOp(token->getReferenceNo()); break; default: break; } if (tkType != Token::T_EMPTY) ret->setNextOp(next); return ret; } inline Op* RegularExpression::compileUnion(const Token* const token, Op* const next, const bool reverse) { XMLSize_t tokSize = token->size(); UnionOp* uniOp = fOpFactory.createUnionOp(tokSize); for (XMLSize_t i=0; iaddElement(compile(token->getChild(i), next, reverse)); } return uniOp; } inline Op* RegularExpression::compileParenthesis(const Token* const token, Op* const next, const bool reverse) { if (token->getNoParen() == 0) return compile(token->getChild(0), next, reverse); Op* captureOp = 0; if (reverse) { captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next); captureOp = compile(token->getChild(0), captureOp, reverse); return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp); } captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next); captureOp = compile(token->getChild(0), captureOp, reverse); return fOpFactory.createCaptureOp(token->getNoParen(), captureOp); } inline Op* RegularExpression::compileConcat(const Token* const token, Op* const next, const bool reverse) { Op* ret = next; XMLSize_t tokSize = token->size(); if (!reverse) { for (XMLSize_t i= tokSize; i>0; i--) { ret = compile(token->getChild(i-1), ret, false); } } else { for (XMLSize_t i= 0; i< tokSize; i++) { ret = 
compile(token->getChild(i), ret, true); } } return ret; } inline Op* RegularExpression::compileClosure(const Token* const token, Op* const next, const bool reverse, const Token::tokType tkType) { Op* ret = 0; Token* childTok = token->getChild(0); int min = token->getMin(); int max = token->getMax(); if (min >= 0 && min == max) { ret = next; for (int i=0; i< min; i++) { ret = compile(childTok, ret, reverse); } return ret; } if (min > 0 && max > 0) max -= min; if (max > 0) { ret = next; for (int i=0; isetNextOp(next); childOp->setChild(compile(childTok, ret, reverse)); ret = childOp; } } else { ChildOp* childOp = 0; if (tkType == Token::T_NONGREEDYCLOSURE) { childOp = fOpFactory.createNonGreedyClosureOp(); } else { if (childTok->getMinLength() == 0) childOp = fOpFactory.createClosureOp(fNoClosures++); else childOp = fOpFactory.createClosureOp(-1); } childOp->setNextOp(next); if(next==NULL || !doTokenOverlap(next, childTok)) { childOp->setOpType(tkType == Token::T_NONGREEDYCLOSURE?Op::O_FINITE_NONGREEDYCLOSURE:Op::O_FINITE_CLOSURE); childOp->setChild(compile(childTok, NULL, reverse)); } else { childOp->setChild(compile(childTok, childOp, reverse)); } ret = childOp; } if (min > 0) { for (int i=0; i< min; i++) { ret = compile(childTok, ret, reverse); } } return ret; } XERCES_CPP_NAMESPACE_END #endif /** * End of file RegularExpression.hpp */