// RegexInterpreter.h
#ifndef RegexInterpreter_h
#define RegexInterpreter_h
#include <wtf/Platform.h>
#if ENABLE(YARR)
#include <wtf/unicode/Unicode.h>
#include "RegexParser.h"
#include "RegexPattern.h"
namespace JSC { namespace Yarr {
class ByteDisjunction;
// One term (instruction) of the regex bytecode held in a ByteDisjunction.
//
// `type` selects which member of the anonymous union below is meaningful:
//   - `atom`            for character, character-class, back-reference and
//                       parentheses terms (set by the constructors);
//   - `alternative`     for the *Alternative* terms (set by the factories);
//   - `checkInputCount` for TypeCheckInput (set by CheckInput()).
//
// Every constructor initialises all of the scalar members outside the union
// (`invertOrCapture`, `frameLocation`, `inputPosition`), so a ByteTerm can be
// copied (e.g. into a Vector<ByteTerm>) without reading indeterminate values.
struct ByteTerm {
    enum Type {
        TypeBodyAlternativeBegin,
        TypeBodyAlternativeDisjunction,
        TypeBodyAlternativeEnd,
        TypeAlternativeBegin,
        TypeAlternativeDisjunction,
        TypeAlternativeEnd,
        TypeSubpatternBegin,
        TypeSubpatternEnd,
        TypeAssertionBOL,
        TypeAssertionEOL,
        TypeAssertionWordBoundary,
        TypePatternCharacterOnce,
        TypePatternCharacterFixed,
        TypePatternCharacterGreedy,
        TypePatternCharacterNonGreedy,
        TypePatternCasedCharacterOnce,
        TypePatternCasedCharacterFixed,
        TypePatternCasedCharacterGreedy,
        TypePatternCasedCharacterNonGreedy,
        TypeCharacterClass,
        TypeBackReference,
        TypeParenthesesSubpattern,
        TypeParenthesesSubpatternOnceBegin,
        TypeParenthesesSubpatternOnceEnd,
        TypeParentheticalAssertionBegin,
        TypeParentheticalAssertionEnd,
        TypeCheckInput,
    } type;

    // A single flag exposed both as invert() and as capture(); which reading
    // applies depends on the term type.
    bool invertOrCapture;

    union {
        struct {
            union {
                UChar patternCharacter;
                struct {
                    UChar lo;
                    UChar hi;
                } casedCharacter;
                CharacterClass* characterClass;
                unsigned subpatternId;
            };
            union {
                ByteDisjunction* parenthesesDisjunction;
                unsigned parenthesesWidth;
            };
            QuantifierType quantityType;
            unsigned quantityCount;
        } atom;
        struct {
            int next;
            int end;
        } alternative;
        unsigned checkInputCount;
    };

    unsigned frameLocation;
    int inputPosition;

    // Single pattern character. The term type is chosen from the quantifier:
    // a fixed count of exactly 1 becomes "Once", any other fixed count "Fixed".
    // NOTE(review): the switch has no default case; this assumes QuantifierType
    // has only the three enumerators handled here - confirm in RegexPattern.h.
    ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
        : invertOrCapture(false) // Previously left uninitialised.
        , frameLocation(frameLocation)
        , inputPosition(inputPos)
    {
        switch (quantityType) {
        case QuantifierFixedCount:
            type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
            break;
        case QuantifierGreedy:
            type = ByteTerm::TypePatternCharacterGreedy;
            break;
        case QuantifierNonGreedy:
            type = ByteTerm::TypePatternCharacterNonGreedy;
            break;
        }

        atom.patternCharacter = ch;
        atom.quantityType = quantityType;
        atom.quantityCount = quantityCount;
    }

    // Pattern character stored as a lo/hi pair (the "Cased" term types).
    // Type selection mirrors the single-character constructor above.
    ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
        : invertOrCapture(false) // Previously left uninitialised.
        , frameLocation(frameLocation)
        , inputPosition(inputPos)
    {
        switch (quantityType) {
        case QuantifierFixedCount:
            type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
            break;
        case QuantifierGreedy:
            type = ByteTerm::TypePatternCasedCharacterGreedy;
            break;
        case QuantifierNonGreedy:
            type = ByteTerm::TypePatternCasedCharacterNonGreedy;
            break;
        }

        atom.casedCharacter.lo = lo;
        atom.casedCharacter.hi = hi;
        atom.quantityType = quantityType;
        atom.quantityCount = quantityCount;
    }

    // Character-class term; `invert` is stored in invertOrCapture.
    // Starts as a fixed quantity of one.
    ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
        : type(ByteTerm::TypeCharacterClass)
        , invertOrCapture(invert)
        , frameLocation(0) // Previously left uninitialised.
        , inputPosition(inputPos)
    {
        atom.characterClass = characterClass;
        atom.quantityType = QuantifierFixedCount;
        atom.quantityCount = 1;
    }

    // Term of the given type that carries a subpattern id and a nested
    // disjunction. Starts as a fixed quantity of one.
    ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos)
        : type(type)
        , invertOrCapture(invertOrCapture)
        , frameLocation(0) // Previously left uninitialised.
        , inputPosition(inputPos)
    {
        atom.subpatternId = subpatternId;
        atom.parenthesesDisjunction = parenthesesInfo;
        atom.quantityType = QuantifierFixedCount;
        atom.quantityCount = 1;
    }

    // Bare term of the given type; used by the static factories below, which
    // then fill in whichever union member / position the type needs.
    // Intentionally not `explicit`, to keep existing implicit uses compiling.
    ByteTerm(Type type, bool invert = false)
        : type(type)
        , invertOrCapture(invert)
        , frameLocation(0) // Previously left uninitialised.
        , inputPosition(0) // Previously left uninitialised.
    {
        atom.quantityType = QuantifierFixedCount;
        atom.quantityCount = 1;
    }

    // Term of the given type that carries only a subpattern id (no nested
    // disjunction); used by BackReference().
    ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos)
        : type(type)
        , invertOrCapture(invertOrCapture)
        , frameLocation(0) // Previously left uninitialised.
        , inputPosition(inputPos)
    {
        atom.subpatternId = subpatternId;
        atom.quantityType = QuantifierFixedCount;
        atom.quantityCount = 1;
    }

    // --- Factories for assertion / bookkeeping terms -----------------------

    static ByteTerm BOL(int inputPos)
    {
        ByteTerm term(TypeAssertionBOL);
        term.inputPosition = inputPos;
        return term;
    }

    // Note: writing checkInputCount overwrites the `atom` defaults set by the
    // constructor, since both live in the same union.
    static ByteTerm CheckInput(unsigned count)
    {
        ByteTerm term(TypeCheckInput);
        term.checkInputCount = count;
        return term;
    }

    static ByteTerm EOL(int inputPos)
    {
        ByteTerm term(TypeAssertionEOL);
        term.inputPosition = inputPos;
        return term;
    }

    static ByteTerm WordBoundary(bool invert, int inputPos)
    {
        ByteTerm term(TypeAssertionWordBoundary, invert);
        term.inputPosition = inputPos;
        return term;
    }

    static ByteTerm BackReference(unsigned subpatternId, int inputPos)
    {
        return ByteTerm(TypeBackReference, subpatternId, false, inputPos);
    }

    // --- Factories for alternative markers ---------------------------------
    // Each starts with alternative.next and alternative.end set to 0.
    // NOTE(review): presumably the bytecode compiler patches these once the
    // layout is known - confirm against the compiler.

    static ByteTerm BodyAlternativeBegin()
    {
        ByteTerm term(TypeBodyAlternativeBegin);
        term.alternative.next = 0;
        term.alternative.end = 0;
        return term;
    }

    static ByteTerm BodyAlternativeDisjunction()
    {
        ByteTerm term(TypeBodyAlternativeDisjunction);
        term.alternative.next = 0;
        term.alternative.end = 0;
        return term;
    }

    static ByteTerm BodyAlternativeEnd()
    {
        ByteTerm term(TypeBodyAlternativeEnd);
        term.alternative.next = 0;
        term.alternative.end = 0;
        return term;
    }

    static ByteTerm AlternativeBegin()
    {
        ByteTerm term(TypeAlternativeBegin);
        term.alternative.next = 0;
        term.alternative.end = 0;
        return term;
    }

    static ByteTerm AlternativeDisjunction()
    {
        ByteTerm term(TypeAlternativeDisjunction);
        term.alternative.next = 0;
        term.alternative.end = 0;
        return term;
    }

    static ByteTerm AlternativeEnd()
    {
        ByteTerm term(TypeAlternativeEnd);
        term.alternative.next = 0;
        term.alternative.end = 0;
        return term;
    }

    static ByteTerm SubpatternBegin()
    {
        return ByteTerm(TypeSubpatternBegin);
    }

    static ByteTerm SubpatternEnd()
    {
        return ByteTerm(TypeSubpatternEnd);
    }

    // Both accessors read the same flag; use whichever name matches the term
    // type at the call site. Const-qualified so they work on const terms.
    bool invert() const
    {
        return invertOrCapture;
    }

    bool capture() const
    {
        return invertOrCapture;
    }
};
// Holds the flat list of ByteTerms for one disjunction, plus the two counts
// supplied at construction time. All members are public.
class ByteDisjunction {
public:
    // The term list; starts empty and is filled in by whoever builds the bytecode.
    Vector<ByteTerm> terms;
    // Values recorded verbatim from the constructor arguments.
    unsigned m_numSubpatterns;
    unsigned m_frameSize;

    ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
        : m_numSubpatterns(numSubpatterns), m_frameSize(frameSize)
    {
    }
};
// A compiled regular expression in bytecode form.
//
// Owns:
//   - the body disjunction (via OwnPtr);
//   - every ByteDisjunction pointer listed in `allParenthesesInfo`;
//   - every CharacterClass taken from `pattern.m_userCharacterClasses`.
// The owned raw pointers are released with deleteAllValues() in the destructor.
// NOTE(review): copying would double-delete the vectors' contents; this relies
// on OwnPtr making the struct non-copyable - confirm for the OwnPtr in use.
struct BytecodePattern {
    // `body`               - body disjunction; ownership is taken.
    // `allParenthesesInfo` - pointers whose ownership is taken (the pointers
    //                        are copied into this object; the caller's vector
    //                        itself is not modified). Taken by const reference
    //                        to avoid copying the whole vector just to append
    //                        it again.
    // `pattern`            - source of the flags and character classes; its
    //                        m_userCharacterClasses list is emptied because
    //                        those objects now belong to this BytecodePattern.
    BytecodePattern(ByteDisjunction* body, const Vector<ByteDisjunction*>& allParenthesesInfo, RegexPattern& pattern)
        : m_body(body)
        , m_ignoreCase(pattern.m_ignoreCase)
        , m_multiline(pattern.m_multiline)
    {
        // Fetch the built-in classes before taking over m_userCharacterClasses.
        // NOTE(review): presumably these accessors may register the class in
        // pattern.m_userCharacterClasses, which would make this order matter -
        // confirm in RegexPattern.h.
        newlineCharacterClass = pattern.newlineCharacterClass();
        wordcharCharacterClass = pattern.wordcharCharacterClass();

        m_allParenthesesInfo.append(allParenthesesInfo);

        // Transfer ownership: copy the pointers, then clear the source list so
        // the same objects are not referenced from two owners.
        m_userCharacterClasses.append(pattern.m_userCharacterClasses);
        pattern.m_userCharacterClasses.clear();
    }

    ~BytecodePattern()
    {
        deleteAllValues(m_allParenthesesInfo);
        deleteAllValues(m_userCharacterClasses);
    }

    OwnPtr<ByteDisjunction> m_body;
    bool m_ignoreCase;
    bool m_multiline;

    // Non-owning shortcuts to the pattern's newline / word-character classes.
    CharacterClass* newlineCharacterClass;
    CharacterClass* wordcharCharacterClass;

private:
    Vector<ByteDisjunction*> m_allParenthesesInfo;
    Vector<CharacterClass*> m_userCharacterClasses;
};
// Compiles the regular expression source `pattern` into a BytecodePattern.
// `numSubpatterns` and `error` are output parameters (passed by reference);
// `ignoreCase` and `multiline` default to false.
// NOTE(review): presumably returns null and sets `error` on failure, and the
// caller owns the returned object - confirm against the definition.
BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
// Runs the compiled pattern `v_regex` over `input`, given a `start` index and a
// `length`, writing results through `output`.
// NOTE(review): the meaning of the return value and the required size/layout of
// `output` are not visible from this header - confirm against the definition.
int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output);
} }
#endif
#endif // RegexInterpreter_h