814 lines
32 KiB
C++
814 lines
32 KiB
C++
/*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
* this work for additional information regarding copyright ownership.
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
* (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/*
|
|
* $Id: RegularExpression.hpp 678879 2008-07-22 20:05:05Z amassari $
|
|
*/
|
|
|
|
#if !defined(XERCESC_INCLUDE_GUARD_REGULAREXPRESSION_HPP)
|
|
#define XERCESC_INCLUDE_GUARD_REGULAREXPRESSION_HPP
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Includes
|
|
// ---------------------------------------------------------------------------
|
|
#include <xercesc/util/RefArrayVectorOf.hpp>
|
|
#include <xercesc/util/XMLString.hpp>
|
|
#include <xercesc/util/Janitor.hpp>
|
|
#include <xercesc/util/regx/Op.hpp>
|
|
#include <xercesc/util/regx/TokenFactory.hpp>
|
|
#include <xercesc/util/regx/BMPattern.hpp>
|
|
#include <xercesc/util/regx/OpFactory.hpp>
|
|
#include <xercesc/util/regx/RegxUtil.hpp>
|
|
|
|
XERCES_CPP_NAMESPACE_BEGIN
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Forward Declaration
|
|
// ---------------------------------------------------------------------------
|
|
class RangeToken;
|
|
class Match;
|
|
class RegxParser;
|
|
|
|
/**
|
|
* The RegularExpression class represents a parsed executable regular expression.
|
|
* This class is thread safe. Two similar regular expression syntaxes are
|
|
* supported:
|
|
*
|
|
* <ol>
|
|
* <li><a href="http://www.w3.org/TR/xpath-functions/#regex-syntax">The XPath 2.0 / XQuery regular expression syntax.</a>
|
|
* <li><a href="http://www.w3.org/TR/xmlschema-2/#regexs">The XML Schema regular expression syntax.</a></li>
|
|
* </ol>
|
|
*
|
|
* XPath 2.0 regular expression syntax is used unless the "X" option is specified during construction.
|
|
*
|
|
* Options can be specified during construction to change the way that the regular expression is handled.
|
|
* Options are specified by a string consisting of any number of the following characters:
|
|
*
|
|
* <table border='1'>
|
|
* <tr>
|
|
* <th>Character</th>
|
|
* <th>Meaning</th>
|
|
* </tr>
|
|
* <tr>
|
|
* <td valign='top' rowspan='1' colspan='1'>i</td>
|
|
* <td valign='top' rowspan='1' colspan='1'><a href="http://www.w3.org/TR/xpath-functions/#flags">
|
|
* Ignore case</a> when matching the regular expression.</td>
|
|
* </tr>
|
|
* <tr>
|
|
* <td valign='top' rowspan='1' colspan='1'>m</td>
|
|
* <td valign='top' rowspan='1' colspan='1'><a href="http://www.w3.org/TR/xpath-functions/#flags">
|
|
* Multi-line mode</a>. The meta characters "^" and "$" will match the beginning and end of lines.</td>
|
|
* </tr>
|
|
* <tr>
|
|
* <td valign='top' rowspan='1' colspan='1'>s</td>
|
|
* <td valign='top' rowspan='1' colspan='1'><a href="http://www.w3.org/TR/xpath-functions/#flags">
|
|
* Single-line mode</a>. The meta character "." will match a newline character.</td>
|
|
* </tr>
|
|
* <tr>
|
|
* <td valign='top' rowspan='1' colspan='1'>x</td>
|
|
* <td valign='top' rowspan='1' colspan='1'>Allow extended comments.</td>
|
|
* </tr>
|
|
* <tr>
|
|
* <td valign='top' rowspan='1' colspan='1'>F</td>
|
|
* <td valign='top' rowspan='1' colspan='1'>Prohibit the fixed string optimization.</td>
|
|
* </tr>
|
|
* <tr>
|
|
* <td valign='top' rowspan='1' colspan='1'>H</td>
|
|
* <td valign='top' rowspan='1' colspan='1'>Prohibit the head character optimization.</td>
|
|
* </tr>
|
|
* <tr>
|
|
* <td valign='top' rowspan='1' colspan='1'>X</td>
|
|
* <td valign='top' rowspan='1' colspan='1'>Parse the regular expression according to the
|
|
* <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema regular expression syntax</a>.</td>
|
|
* </tr>
|
|
* </table>
|
|
*/
|
|
class XMLUTIL_EXPORT RegularExpression : public XMemory
|
|
{
|
|
public:
|
|
// -----------------------------------------------------------------------
|
|
// Public Constructors and Destructor
|
|
// -----------------------------------------------------------------------
|
|
|
|
/** @name Constructors and destructor */
|
|
//@{
|
|
|
|
/** Parses the given regular expression.
|
|
*
|
|
* @param pattern the regular expression in the local code page
|
|
* @param manager the memory manager to use
|
|
*/
|
|
RegularExpression
|
|
(
|
|
const char* const pattern
|
|
, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
|
|
);
|
|
|
|
/** Parses the given regular expression using the options specified.
|
|
*
|
|
* @param pattern the regular expression in the local code page
|
|
* @param options the options string in the local code page
|
|
* @param manager the memory manager to use
|
|
*/
|
|
RegularExpression
|
|
(
|
|
const char* const pattern
|
|
, const char* const options
|
|
, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
|
|
);
|
|
|
|
/** Parses the given regular expression.
|
|
*
|
|
* @param pattern the regular expression
|
|
* @param manager the memory manager to use
|
|
*/
|
|
RegularExpression
|
|
(
|
|
const XMLCh* const pattern
|
|
, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
|
|
);
|
|
|
|
/** Parses the given regular expression using the options specified.
|
|
*
|
|
* @param pattern the regular expression
|
|
* @param options the options string
|
|
* @param manager the memory manager to use
|
|
*/
|
|
RegularExpression
|
|
(
|
|
const XMLCh* const pattern
|
|
, const XMLCh* const options
|
|
, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
|
|
);
|
|
|
|
virtual ~RegularExpression();
|
|
|
|
//@}
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Public Constants
|
|
// -----------------------------------------------------------------------
|
|
static const unsigned int IGNORE_CASE;
|
|
static const unsigned int SINGLE_LINE;
|
|
static const unsigned int MULTIPLE_LINE;
|
|
static const unsigned int EXTENDED_COMMENT;
|
|
static const unsigned int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
|
|
static const unsigned int PROHIBIT_FIXED_STRING_OPTIMIZATION;
|
|
static const unsigned int XMLSCHEMA_MODE;
|
|
typedef enum
|
|
{
|
|
wordTypeIgnore = 0,
|
|
wordTypeLetter = 1,
|
|
wordTypeOther = 2
|
|
} wordType;
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Public Helper methods
|
|
// -----------------------------------------------------------------------
|
|
|
|
/** @name Public helper methods */
|
|
//@{
|
|
|
|
static int getOptionValue(const XMLCh ch);
|
|
static bool isSet(const int options, const int flag);
|
|
|
|
//@}
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Matching methods
|
|
// -----------------------------------------------------------------------
|
|
|
|
/** @name Matching methods */
|
|
//@{
|
|
|
|
/** Tries to match the given null terminated string against the regular expression, returning
|
|
* true if successful.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const char* const matchString,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given string between the specified start and end offsets
|
|
* against the regular expression, returning true if successful.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const char* const matchString, const XMLSize_t start, const XMLSize_t end,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given null terminated string against the regular expression, returning
|
|
* true if successful.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param pMatch a Match object, which will be populated with the offsets for the
|
|
* regular expression match and sub-matches.
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const char* const matchString, Match* const pMatch,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given string between the specified start and end offsets
|
|
* against the regular expression, returning true if successful.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param pMatch a Match object, which will be populated with the offsets for the
|
|
* regular expression match and sub-matches.
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const char* const matchString, const XMLSize_t start, const XMLSize_t end,
|
|
Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given null terminated string against the regular expression, returning
|
|
* true if successful.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const XMLCh* const matchString,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given string between the specified start and end offsets
|
|
* against the regular expression, returning true if successful.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given null terminated string against the regular expression, returning
|
|
* true if successful.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param pMatch a Match object, which will be populated with the offsets for the
|
|
* regular expression match and sub-matches.
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const XMLCh* const matchString, Match* const pMatch,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given string between the specified start and end offsets
|
|
* against the regular expression, returning true if successful.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param pMatch a Match object, which will be populated with the offsets for the
|
|
* regular expression match and sub-matches.
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return Whether the string matched the regular expression or not.
|
|
*/
|
|
bool matches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end,
|
|
Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tries to match the given string between the specified start and end offsets
|
|
* against the regular expression. The subEx vector is populated with the details
|
|
* for every non-overlapping occurrence of a match in the string.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param subEx a RefVectorOf Match objects, populated with the offsets for the
|
|
* regular expression match and sub-matches.
|
|
* @param manager the memory manager to use
|
|
*/
|
|
void allMatches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end,
|
|
RefVectorOf<Match> *subEx, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
//@}
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Tokenize methods
|
|
// -----------------------------------------------------------------------
|
|
// Note: The caller owns the string vector that is returned, and is responsible
|
|
// for deleting it.
|
|
|
|
/** @name Tokenize methods */
|
|
//@{
|
|
|
|
/** Tokenizes the null terminated string according to the regular expression, returning
|
|
* the parts of the string that do not match the regular expression.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the
|
|
* given MemoryManager. The caller owns the string vector that is returned, and is responsible for
|
|
* deleting it.
|
|
*/
|
|
RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tokenizes the string between the specified start and end offsets according to the regular
|
|
* expression, returning the parts of the string that do not match the regular expression.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the
|
|
* given MemoryManager. The caller owns the string vector that is returned, and is responsible for
|
|
* deleting it.
|
|
*/
|
|
RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const XMLSize_t start, const XMLSize_t end,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tokenizes the null terminated string according to the regular expression, returning
|
|
* the parts of the string that do not match the regular expression.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the
|
|
* given MemoryManager. The caller owns the string vector that is returned, and is responsible for
|
|
* deleting it.
|
|
*/
|
|
RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Tokenizes the string between the specified start and end offsets according to the regular
|
|
* expression, returning the parts of the string that do not match the regular expression.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return A RefArrayVectorOf sub-strings that do not match the regular expression allocated using the
|
|
* given MemoryManager. The caller owns the string vector that is returned, and is responsible for
|
|
* deleting it.
|
|
*/
|
|
RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
//@}
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Replace methods
|
|
// -----------------------------------------------------------------------
|
|
// Note: The caller owns the XMLCh* that is returned, and is responsible for
|
|
// deleting it.
|
|
|
|
/** @name Replace methods */
|
|
//@{
|
|
|
|
/** Performs a search and replace on the given null terminated string, replacing
|
|
* any substring that matches the regular expression with a string derived from
|
|
* the <a href="http://www.w3.org/TR/xpath-functions/#func-replace">replacement string</a>.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param replaceString the string to replace in the local code page
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return The resulting string allocated using the given MemoryManager. The caller owns the string
|
|
* that is returned, and is responsible for deleting it.
|
|
*/
|
|
XMLCh *replace(const char* const matchString, const char* const replaceString,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Performs a search and replace on the given string between the specified start and end offsets, replacing
|
|
* any substring that matches the regular expression with a string derived from
|
|
* the <a href="http://www.w3.org/TR/xpath-functions/#func-replace">replacement string</a>.
|
|
*
|
|
* @param matchString the string to match in the local code page
|
|
* @param replaceString the string to replace in the local code page
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return The resulting string allocated using the given MemoryManager. The caller owns the string
|
|
* that is returned, and is responsible for deleting it.
|
|
*/
|
|
XMLCh *replace(const char* const matchString, const char* const replaceString,
|
|
const XMLSize_t start, const XMLSize_t end,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Performs a search and replace on the given null terminated string, replacing
|
|
* any substring that matches the regular expression with a string derived from
|
|
* the <a href="http://www.w3.org/TR/xpath-functions/#func-replace">replacement string</a>.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param replaceString the string to replace
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return The resulting string allocated using the given MemoryManager. The caller owns the string
|
|
* that is returned, and is responsible for deleting it.
|
|
*/
|
|
XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
/** Performs a search and replace on the given string between the specified start and end offsets, replacing
|
|
* any substring that matches the regular expression with a string derived from
|
|
* the <a href="http://www.w3.org/TR/xpath-functions/#func-replace">replacement string</a>.
|
|
*
|
|
* @param matchString the string to match
|
|
* @param replaceString the string to replace
|
|
* @param start the offset of the start of the string
|
|
* @param end the offset of the end of the string
|
|
* @param manager the memory manager to use
|
|
*
|
|
* @return The resulting string allocated using the given MemoryManager. The caller owns the string
|
|
* that is returned, and is responsible for deleting it.
|
|
*/
|
|
XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString,
|
|
const XMLSize_t start, const XMLSize_t end,
|
|
MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const;
|
|
|
|
//@}
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Static initialize and cleanup methods
|
|
// -----------------------------------------------------------------------
|
|
|
|
/** @name Static initilize and cleanup methods */
|
|
//@{
|
|
|
|
static void
|
|
staticInitialize(MemoryManager* memoryManager);
|
|
|
|
static void
|
|
staticCleanup();
|
|
|
|
//@}
|
|
|
|
protected:
|
|
virtual RegxParser* getRegexParser(const int options, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Cleanup methods
|
|
// -----------------------------------------------------------------------
|
|
void cleanUp();
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Setter methods
|
|
// -----------------------------------------------------------------------
|
|
void setPattern(const XMLCh* const pattern, const XMLCh* const options=0);
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Protected data types
|
|
// -----------------------------------------------------------------------
|
|
class XMLUTIL_EXPORT Context : public XMemory
|
|
{
|
|
public :
|
|
Context(MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
|
|
Context(Context* src);
|
|
~Context();
|
|
|
|
Context& operator= (const Context& other);
|
|
inline const XMLCh* getString() const { return fString; }
|
|
void reset(const XMLCh* const string, const XMLSize_t stringLen,
|
|
const XMLSize_t start, const XMLSize_t limit, const int noClosures,
|
|
const unsigned int options);
|
|
bool nextCh(XMLInt32& ch, XMLSize_t& offset);
|
|
|
|
bool fAdoptMatch;
|
|
XMLSize_t fStart;
|
|
XMLSize_t fLimit;
|
|
XMLSize_t fLength; // fLimit - fStart
|
|
int fSize;
|
|
XMLSize_t fStringMaxLen;
|
|
int* fOffsets;
|
|
Match* fMatch;
|
|
const XMLCh* fString;
|
|
unsigned int fOptions;
|
|
MemoryManager* fMemoryManager;
|
|
};
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Unimplemented constructors and operators
|
|
// -----------------------------------------------------------------------
|
|
RegularExpression(const RegularExpression&);
|
|
RegularExpression& operator=(const RegularExpression&);
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Protected Helper methods
|
|
// -----------------------------------------------------------------------
|
|
void prepare();
|
|
int parseOptions(const XMLCh* const options);
|
|
|
|
/**
|
|
* Matching helpers
|
|
*/
|
|
int match(Context* const context, const Op* const operations, XMLSize_t offset) const;
|
|
bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2) const;
|
|
|
|
/**
|
|
* Helper methods used by match(Context* ...)
|
|
*/
|
|
bool matchChar(Context* const context, const XMLInt32 ch, XMLSize_t& offset,
|
|
const bool ignoreCase) const;
|
|
bool matchDot(Context* const context, XMLSize_t& offset) const;
|
|
bool matchRange(Context* const context, const Op* const op,
|
|
XMLSize_t& offset, const bool ignoreCase) const;
|
|
bool matchAnchor(Context* const context, const XMLInt32 ch,
|
|
const XMLSize_t offset) const;
|
|
bool matchBackReference(Context* const context, const XMLInt32 ch,
|
|
XMLSize_t& offset, const bool ignoreCase) const;
|
|
bool matchString(Context* const context, const XMLCh* const literal,
|
|
XMLSize_t& offset, const bool ignoreCase) const;
|
|
int matchUnion(Context* const context, const Op* const op, XMLSize_t offset) const;
|
|
int matchCapture(Context* const context, const Op* const op, XMLSize_t offset) const;
|
|
|
|
/**
|
|
* Replace helpers
|
|
*/
|
|
void subInExp(const XMLCh* const repString,
|
|
const XMLCh* const origString,
|
|
const Match* subEx,
|
|
XMLBuffer &result,
|
|
MemoryManager* const manager) const;
|
|
/**
|
|
* Converts a token tree into an operation tree
|
|
*/
|
|
void compile(const Token* const token);
|
|
Op* compile(const Token* const token, Op* const next,
|
|
const bool reverse);
|
|
/**
|
|
* Helper methods used by compile
|
|
*/
|
|
Op* compileSingle(const Token* const token, Op* const next,
|
|
const Token::tokType tkType);
|
|
Op* compileUnion(const Token* const token, Op* const next,
|
|
const bool reverse);
|
|
Op* compileParenthesis(const Token* const token, Op* const next,
|
|
const bool reverse);
|
|
Op* compileConcat(const Token* const token, Op* const next,
|
|
const bool reverse);
|
|
Op* compileClosure(const Token* const token, Op* const next,
|
|
const bool reverse, const Token::tokType tkType);
|
|
|
|
bool doTokenOverlap(const Op* op, Token* token);
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Protected data members
|
|
// -----------------------------------------------------------------------
|
|
bool fHasBackReferences;
|
|
bool fFixedStringOnly;
|
|
int fNoGroups;
|
|
XMLSize_t fMinLength;
|
|
unsigned int fNoClosures;
|
|
unsigned int fOptions;
|
|
const BMPattern* fBMPattern;
|
|
XMLCh* fPattern;
|
|
XMLCh* fFixedString;
|
|
const Op* fOperations;
|
|
Token* fTokenTree;
|
|
RangeToken* fFirstChar;
|
|
static RangeToken* fWordRange;
|
|
OpFactory fOpFactory;
|
|
TokenFactory* fTokenFactory;
|
|
MemoryManager* fMemoryManager;
|
|
};
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------
|
|
// RegularExpression: Static initialize and cleanup methods
|
|
// -----------------------------------------------------------------------
|
|
inline void RegularExpression::staticCleanup()
|
|
{
|
|
fWordRange = 0;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// RegularExpression: Cleanup methods
|
|
// ---------------------------------------------------------------------------
|
|
inline void RegularExpression::cleanUp() {
|
|
|
|
fMemoryManager->deallocate(fPattern);//delete [] fPattern;
|
|
fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
|
|
delete fBMPattern;
|
|
delete fTokenFactory;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// RegularExpression: Helper methods
|
|
// ---------------------------------------------------------------------------
|
|
inline bool RegularExpression::isSet(const int options, const int flag) {
|
|
|
|
return (options & flag) == flag;
|
|
}
|
|
|
|
inline Op* RegularExpression::compileSingle(const Token* const token,
|
|
Op* const next,
|
|
const Token::tokType tkType) {
|
|
|
|
Op* ret = 0;
|
|
|
|
switch (tkType) {
|
|
case Token::T_DOT:
|
|
ret = fOpFactory.createDotOp();
|
|
break;
|
|
case Token::T_CHAR:
|
|
ret = fOpFactory.createCharOp(token->getChar());
|
|
break;
|
|
case Token::T_ANCHOR:
|
|
ret = fOpFactory.createAnchorOp(token->getChar());
|
|
break;
|
|
case Token::T_RANGE:
|
|
case Token::T_NRANGE:
|
|
ret = fOpFactory.createRangeOp(token);
|
|
break;
|
|
case Token::T_EMPTY:
|
|
ret = next;
|
|
break;
|
|
case Token::T_STRING:
|
|
ret = fOpFactory.createStringOp(token->getString());
|
|
break;
|
|
case Token::T_BACKREFERENCE:
|
|
ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (tkType != Token::T_EMPTY)
|
|
ret->setNextOp(next);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
inline Op* RegularExpression::compileUnion(const Token* const token,
|
|
Op* const next,
|
|
const bool reverse) {
|
|
|
|
XMLSize_t tokSize = token->size();
|
|
UnionOp* uniOp = fOpFactory.createUnionOp(tokSize);
|
|
|
|
for (XMLSize_t i=0; i<tokSize; i++) {
|
|
|
|
uniOp->addElement(compile(token->getChild(i), next, reverse));
|
|
}
|
|
|
|
return uniOp;
|
|
}
|
|
|
|
|
|
inline Op* RegularExpression::compileParenthesis(const Token* const token,
|
|
Op* const next,
|
|
const bool reverse) {
|
|
|
|
if (token->getNoParen() == 0)
|
|
return compile(token->getChild(0), next, reverse);
|
|
|
|
Op* captureOp = 0;
|
|
|
|
if (reverse) {
|
|
|
|
captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next);
|
|
captureOp = compile(token->getChild(0), captureOp, reverse);
|
|
|
|
return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp);
|
|
}
|
|
|
|
captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next);
|
|
captureOp = compile(token->getChild(0), captureOp, reverse);
|
|
|
|
return fOpFactory.createCaptureOp(token->getNoParen(), captureOp);
|
|
}
|
|
|
|
inline Op* RegularExpression::compileConcat(const Token* const token,
|
|
Op* const next,
|
|
const bool reverse) {
|
|
|
|
Op* ret = next;
|
|
XMLSize_t tokSize = token->size();
|
|
|
|
if (!reverse) {
|
|
|
|
for (XMLSize_t i= tokSize; i>0; i--) {
|
|
ret = compile(token->getChild(i-1), ret, false);
|
|
}
|
|
}
|
|
else {
|
|
|
|
for (XMLSize_t i= 0; i< tokSize; i++) {
|
|
ret = compile(token->getChild(i), ret, true);
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
inline Op* RegularExpression::compileClosure(const Token* const token,
|
|
Op* const next,
|
|
const bool reverse,
|
|
const Token::tokType tkType) {
|
|
|
|
Op* ret = 0;
|
|
Token* childTok = token->getChild(0);
|
|
int min = token->getMin();
|
|
int max = token->getMax();
|
|
|
|
if (min >= 0 && min == max) {
|
|
|
|
ret = next;
|
|
for (int i=0; i< min; i++) {
|
|
ret = compile(childTok, ret, reverse);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
if (min > 0 && max > 0)
|
|
max -= min;
|
|
|
|
if (max > 0) {
|
|
|
|
ret = next;
|
|
for (int i=0; i<max; i++) {
|
|
|
|
ChildOp* childOp = fOpFactory.createQuestionOp(
|
|
tkType == Token::T_NONGREEDYCLOSURE);
|
|
|
|
childOp->setNextOp(next);
|
|
childOp->setChild(compile(childTok, ret, reverse));
|
|
ret = childOp;
|
|
}
|
|
}
|
|
else {
|
|
|
|
ChildOp* childOp = 0;
|
|
|
|
if (tkType == Token::T_NONGREEDYCLOSURE) {
|
|
childOp = fOpFactory.createNonGreedyClosureOp();
|
|
}
|
|
else {
|
|
|
|
if (childTok->getMinLength() == 0)
|
|
childOp = fOpFactory.createClosureOp(fNoClosures++);
|
|
else
|
|
childOp = fOpFactory.createClosureOp(-1);
|
|
}
|
|
|
|
childOp->setNextOp(next);
|
|
if(next==NULL || !doTokenOverlap(next, childTok))
|
|
{
|
|
childOp->setOpType(tkType == Token::T_NONGREEDYCLOSURE?Op::O_FINITE_NONGREEDYCLOSURE:Op::O_FINITE_CLOSURE);
|
|
childOp->setChild(compile(childTok, NULL, reverse));
|
|
}
|
|
else
|
|
{
|
|
childOp->setChild(compile(childTok, childOp, reverse));
|
|
}
|
|
ret = childOp;
|
|
}
|
|
|
|
if (min > 0) {
|
|
|
|
for (int i=0; i< min; i++) {
|
|
ret = compile(childTok, ret, reverse);
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
XERCES_CPP_NAMESPACE_END
|
|
|
|
#endif
|
|
/**
|
|
* End of file RegularExpression.hpp
|
|
*/
|
|
|