#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/regex.h"
#include "unicode/uregex.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uobject.h"
#include "umutex.h"
#include "uassert.h"
#include "cmemory.h"
// Concrete layout behind the URegularExpression handle used by the
// uregex_ functions in this file.
struct URegularExpression: public UMemory {
public:
    URegularExpression();
    ~URegularExpression();

    int32_t         fMagic;          // REXP_MAGIC while the object is alive; zeroed by the destructor.
    RegexPattern   *fPat;            // Compiled pattern; shared with clones made by uregex_clone().
    int32_t        *fPatRefCount;    // Counter guarding fPat and fPatString; shared with clones.
    UChar          *fPatString;      // Copy of the pattern source made by uregex_open().
    int32_t         fPatStringLen;   // Pattern length exactly as given to uregex_open() (may be -1).
    RegexMatcher   *fMatcher;        // Matcher deleted by this object's destructor.
    const UChar    *fText;           // Input text pointer as given to uregex_setText(); not copied here.
    int32_t         fTextLength;     // Input length exactly as given to uregex_setText() (may be -1).
    UnicodeString   fTextString;     // String set up over fText; the matcher is reset onto it.
};
// Value stored in URegularExpression::fMagic and checked by validateRE().
// The bytes 0x72 0x65 0x78 0x70 spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
U_NAMESPACE_USE
// Construct an empty handle: valid magic value, no pattern, no matcher and
// no input text. fTextString is default-constructed.
URegularExpression::URegularExpression()
    : fMagic(REXP_MAGIC),
      fPat(NULL),
      fPatRefCount(NULL),
      fPatString(NULL),
      fPatStringLen(0),
      fMatcher(NULL),
      fText(NULL),
      fTextLength(0)
{
}
// Release the matcher, drop one reference on the shared pattern data, and
// clear the magic value.
URegularExpression::~URegularExpression() {
    delete fMatcher;
    fMatcher = NULL;

    // The pattern, its source string and the counter itself are released only
    // by the object whose decrement brings the count down to zero.
    if (fPatRefCount != NULL) {
        const UBool wasLastReference = (umtx_atomic_dec(fPatRefCount) == 0);
        if (wasLastReference) {
            delete fPat;
            uprv_free(fPatString);
            uprv_free(fPatRefCount);
        }
    }

    // validateRE() compares fMagic against REXP_MAGIC.
    fMagic = 0;
}
// Common argument check used at the top of the uregex_ functions.
//
//   re            handle to check; must be non-NULL with fMagic == REXP_MAGIC.
//   status        in/out error code. An incoming failure makes the check fail
//                 without touching *status.
//   requiresText  when TRUE, the handle must also have input text set.
//
// Returns TRUE when the call may proceed. Otherwise returns FALSE and, unless
// *status was already a failure, sets U_ILLEGAL_ARGUMENT_ERROR (bad handle)
// or U_REGEX_INVALID_STATE (text missing).
static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
    if (U_FAILURE(*status)) {
        return FALSE;
    }

    const UBool handleIsValid = (re != NULL && re->fMagic == REXP_MAGIC);
    if (!handleIsValid) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }

    const UBool textIsMissing = (requiresText && re->fText == NULL);
    if (textIsMissing) {
        *status = U_REGEX_INVALID_STATE;
        return FALSE;
    }

    return TRUE;
}
// Compile a pattern given as UChars and return a new handle for it.
//
//   pattern        pattern source; must not be NULL.
//   patternLength  number of UChars, or -1 when the pattern is NUL terminated.
//                  Zero and values below -1 are rejected.
//   flags          passed through to RegexPattern::compile().
//   pe             optional parse-error receiver; may be NULL.
//   status         in/out error code.
//
// Returns the new handle, or NULL with *status set on any failure.
U_CAPI URegularExpression * U_EXPORT2
uregex_open( const UChar    *pattern,
             int32_t         patternLength,
             uint32_t        flags,
             UParseError    *pe,
             UErrorCode     *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    const UBool lengthIsBad = (patternLength < -1 || patternLength == 0);
    if (pattern == NULL || lengthIsBad) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // Number of UChars to copy, excluding the terminator that is added below.
    const int32_t copyLen = (patternLength == -1) ? u_strlen(pattern) : patternLength;

    URegularExpression *handle  = new URegularExpression;
    int32_t            *counter = (int32_t *)uprv_malloc(sizeof(int32_t));
    UChar              *patCopy = (UChar *)uprv_malloc(sizeof(UChar)*(copyLen+1));
    if (handle == NULL || counter == NULL || patCopy == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        delete handle;
        uprv_free(counter);
        uprv_free(patCopy);
        return NULL;
    }

    // From here on the handle's destructor releases counter and patCopy.
    *counter = 1;
    handle->fPatRefCount  = counter;
    handle->fPatString    = patCopy;
    handle->fPatStringLen = patternLength;   // kept as passed in, so it may be -1
    u_memcpy(patCopy, pattern, copyLen);
    patCopy[copyLen] = 0;

    UnicodeString patternStr(patternLength==-1, patCopy, patternLength);
    handle->fPat = (pe != NULL)
        ? RegexPattern::compile(patternStr, flags, *pe, *status)
        : RegexPattern::compile(patternStr, flags, *status);

    if (U_SUCCESS(*status)) {
        handle->fMatcher = handle->fPat->matcher(*status);
        if (U_SUCCESS(*status)) {
            return handle;
        }
    }

    // Compilation or matcher creation failed.
    delete handle;
    return NULL;
}
// Variant of uregex_open() taking the pattern as a NUL-terminated char
// string. The pattern is turned into a UnicodeString and handed to
// uregex_open() together with that string's length.
U_CAPI URegularExpression * U_EXPORT2
uregex_openC( const char     *pattern,
              uint32_t        flags,
              UParseError    *pe,
              UErrorCode     *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (pattern == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    UnicodeString converted(pattern);
    return uregex_open(converted.getBuffer(), converted.length(), flags, pe, status);
}
// Destroy a handle. A handle that fails validateRE() (NULL or wrong magic
// value) is silently ignored; no error is reported to the caller.
U_CAPI void U_EXPORT2
uregex_close(URegularExpression *re) {
    UErrorCode localStatus = U_ZERO_ERROR;
    if (validateRE(re, &localStatus, FALSE)) {
        delete re;
    }
}
// Create a second handle for the same compiled pattern.
//
// The clone gets its own RegexMatcher but shares fPat, fPatString and the
// reference counter with the source. The source's input text is not carried
// over: the clone starts with fText == NULL.
//
//   source  handle to clone; it does not need to have text set.
//   status  in/out error code.
//
// Returns the new handle, or NULL with *status set on failure.
U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression *source, UErrorCode *status) {
    if (validateRE(source, status, FALSE) == FALSE) {
        return NULL;
    }

    URegularExpression *clone = new URegularExpression;
    if (clone == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    clone->fMatcher = source->fPat->matcher(*status);
    if (U_FAILURE(*status)) {
        // clone->fPatRefCount is still NULL here, so the destructor leaves
        // the shared pattern data and its counter untouched.
        delete clone;
        return NULL;
    }

    // Attach the shared pattern data, then account for the new owner.
    clone->fPat          = source->fPat;
    clone->fPatRefCount  = source->fPatRefCount;
    clone->fPatString    = source->fPatString;
    clone->fPatStringLen = source->fPatStringLen;
    umtx_atomic_inc(source->fPatRefCount);

    return clone;
}
// Return the stored copy of the pattern source.
//
//   patLength  optional; receives the length value saved by uregex_open(),
//              which is -1 if the pattern was opened with length -1.
//
// Returns NULL if the handle fails validation.
U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression *regexp,
               int32_t                  *patLength,
               UErrorCode               *status) {
    if (!validateRE(regexp, status, FALSE)) {
        return NULL;
    }
    if (patLength != NULL) {
        *patLength = regexp->fPatStringLen;
    }
    return regexp->fPatString;
}
// Return the flags reported by the compiled pattern, or 0 if the handle
// fails validation.
U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
    int32_t patternFlags = 0;
    if (validateRE(regexp, status, FALSE)) {
        patternFlags = regexp->fPat->flags();
    }
    return patternFlags;
}
// Set the input text that subsequent match operations will run against.
//
//   text        input text; must not be NULL. Only the pointer is stored in
//               the handle.
//   textLength  number of UChars, or -1 when the text is NUL terminated.
//               Values below -1 are rejected with U_ILLEGAL_ARGUMENT_ERROR.
U_CAPI void U_EXPORT2
uregex_setText(URegularExpression *regexp,
               const UChar        *text,
               int32_t             textLength,
               UErrorCode         *status) {
    if (!validateRE(regexp, status, FALSE)) {
        return;
    }
    if (text == NULL || textLength < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Remember the caller's arguments exactly as given.
    regexp->fText       = text;
    regexp->fTextLength = textLength;

    // Point the handle's UnicodeString at the text and restart the matcher on it.
    const UBool nulTerminated = (textLength == -1);
    regexp->fTextString.setTo(nulTerminated, text, textLength);
    regexp->fMatcher->reset(regexp->fTextString);
}
// Return the text pointer saved by uregex_setText() (NULL if none was set).
//
//   textLength  optional; receives the saved length value, which is -1 if
//               the text was set with length -1.
//
// Returns NULL if the handle fails validation.
U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression *regexp,
               int32_t            *textLength,
               UErrorCode         *status) {
    if (!validateRE(regexp, status, FALSE)) {
        return NULL;
    }
    if (textLength != NULL) {
        *textLength = regexp->fTextLength;
    }
    return regexp->fText;
}
// Forward to RegexMatcher::matches(startIndex, status).
// Returns FALSE without calling the matcher if validation fails
// (bad handle, no text set, or incoming failure status).
U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression *regexp,
               int32_t             startIndex,
               UErrorCode         *status) {
    UBool matched = FALSE;
    if (validateRE(regexp, status)) {
        matched = regexp->fMatcher->matches(startIndex, *status);
    }
    return matched;
}
// Forward to RegexMatcher::lookingAt(startIndex, status).
// Returns FALSE without calling the matcher if validation fails
// (bad handle, no text set, or incoming failure status).
U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression *regexp,
                 int32_t             startIndex,
                 UErrorCode         *status) {
    UBool matched = FALSE;
    if (validateRE(regexp, status)) {
        matched = regexp->fMatcher->lookingAt(startIndex, *status);
    }
    return matched;
}
// Forward to RegexMatcher::find(startIndex, status).
// Returns FALSE without calling the matcher if validation fails
// (bad handle, no text set, or incoming failure status).
U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression *regexp,
            int32_t             startIndex,
            UErrorCode         *status) {
    UBool found = FALSE;
    if (validateRE(regexp, status)) {
        found = regexp->fMatcher->find(startIndex, *status);
    }
    return found;
}
// Forward to the argument-less RegexMatcher::find().
// Returns FALSE without calling the matcher if validation fails; in
// particular an incoming failure in *status makes this return FALSE.
U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression *regexp,
                UErrorCode         *status) {
    UBool found = FALSE;
    if (validateRE(regexp, status)) {
        found = regexp->fMatcher->find();
    }
    return found;
}
// Return RegexMatcher::groupCount() for this handle, or 0 if the handle
// fails validation. Input text does not need to be set.
U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression *regexp,
                  UErrorCode         *status) {
    int32_t count = 0;
    if (validateRE(regexp, status, FALSE)) {
        count = regexp->fMatcher->groupCount();
    }
    return count;
}
// Copy the text of a capture group from the current match into dest.
//
//   groupNum      group to extract; passed to RegexMatcher::start()/end().
//   dest          output buffer; may be NULL only when destCapacity is 0.
//   destCapacity  size of dest in UChars; must not be negative.
//   status        in/out error code. On return it is
//                 U_STRING_NOT_TERMINATED_WARNING when the group exactly fills
//                 dest, or U_BUFFER_OVERFLOW_ERROR when it does not fit (as
//                 much as fits is still copied).
//
// Returns the full length of the group regardless of how much was copied, or
// 0 on a validation, argument or matcher error.
U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression *regexp,
             int32_t             groupNum,
             UChar              *dest,
             int32_t             destCapacity,
             UErrorCode         *status) {
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t startIx = regexp->fMatcher->start(groupNum, *status);
    int32_t endIx   = regexp->fMatcher->end  (groupNum, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }

    // Decide how much fits and report termination / overflow accordingly.
    int32_t fullLength = endIx - startIx;
    int32_t copyLength = fullLength;
    if (copyLength < destCapacity) {
        dest[copyLength] = 0;
    } else if (copyLength == destCapacity) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        copyLength = destCapacity;
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    // The group text is read straight out of the caller's input buffer.
    if (copyLength > 0) {
        u_memcpy(dest, &regexp->fText[startIx], copyLength);
    }
    return fullLength;
}
// Forward to RegexMatcher::start(groupNum, status).
// Returns 0 without calling the matcher if validation fails.
U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression *regexp,
             int32_t             groupNum,
             UErrorCode         *status) {
    int32_t position = 0;
    if (validateRE(regexp, status)) {
        position = regexp->fMatcher->start(groupNum, *status);
    }
    return position;
}
// Forward to RegexMatcher::end(groupNum, status).
// Returns 0 without calling the matcher if validation fails.
U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression *regexp,
           int32_t             groupNum,
           UErrorCode         *status) {
    int32_t position = 0;
    if (validateRE(regexp, status)) {
        position = regexp->fMatcher->end(groupNum, *status);
    }
    return position;
}
// Forward to RegexMatcher::reset(index, status).
// Does nothing if validation fails; input text must already be set.
U_CAPI void U_EXPORT2
uregex_reset(URegularExpression *regexp,
             int32_t             index,
             UErrorCode         *status) {
    if (validateRE(regexp, status)) {
        regexp->fMatcher->reset(index, *status);
    }
}
// Replace every match in the input text, writing the result to destBuf.
//
//   replacementText    replacement string; must not be NULL.
//   replacementLength  its length, or -1 when NUL terminated.
//   destBuf            output buffer; may be NULL only when destCapacity is 0.
//   destCapacity       size of destBuf in UChars; must not be negative.
//
// Returns the sum of the lengths reported by the append calls, or 0 on a
// validation or argument error.
//
// NOTE: uregex_findNext() returns FALSE as soon as *status holds a failure,
// so the loop also ends when uregex_appendReplacement() reports an error
// such as U_BUFFER_OVERFLOW_ERROR.
U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression *regexp,
                  UChar              *replacementText,
                  int32_t             replacementLength,
                  UChar              *destBuf,
                  int32_t             destCapacity,
                  UErrorCode         *status) {
    if (!validateRE(regexp, status)) {
        return 0;
    }

    const UBool destIsBad = (destCapacity < 0) || (destBuf == NULL && destCapacity > 0);
    if (replacementText == NULL || replacementLength < -1 || destIsBad) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t totalLength = 0;
    uregex_reset(regexp, 0, status);

    // The append functions advance destBuf and shrink destCapacity as they go.
    while (uregex_findNext(regexp, status)) {
        totalLength += uregex_appendReplacement(regexp, replacementText, replacementLength,
                                                &destBuf, &destCapacity, status);
    }
    totalLength += uregex_appendTail(regexp, &destBuf, &destCapacity, status);

    return totalLength;
}
// Replace only the first match in the input text, writing the result to
// destBuf. If nothing matches, only the tail append runs.
//
//   replacementText    replacement string; must not be NULL.
//   replacementLength  its length, or -1 when NUL terminated.
//   destBuf            output buffer; may be NULL only when destCapacity is 0.
//   destCapacity       size of destBuf in UChars; must not be negative.
//
// Returns the sum of the lengths reported by the append calls, or 0 on a
// validation or argument error.
U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression *regexp,
                    UChar              *replacementText,
                    int32_t             replacementLength,
                    UChar              *destBuf,
                    int32_t             destCapacity,
                    UErrorCode         *status) {
    if (!validateRE(regexp, status)) {
        return 0;
    }

    const UBool destIsBad = (destCapacity < 0) || (destBuf == NULL && destCapacity > 0);
    if (replacementText == NULL || replacementLength < -1 || destIsBad) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t totalLength = 0;
    uregex_reset(regexp, 0, status);
    if (uregex_find(regexp, 0, status)) {
        totalLength = uregex_appendReplacement(regexp, replacementText, replacementLength,
                                               &destBuf, &destCapacity, status);
    }
    totalLength += uregex_appendTail(regexp, &destBuf, &destCapacity, status);

    return totalLength;
}
U_NAMESPACE_BEGIN
// Holder for the implementations behind uregex_appendReplacement() and
// uregex_appendTail(). Those implementations read RegexMatcher members
// (fMatch, fMatchStart, fMatchEnd, fLastMatchEnd, fPattern) directly.
// NOTE(review): presumably this class is declared a friend in the regex
// headers to permit that - confirm.
class RegexCImpl {
public:
    static inline int32_t appendReplacement(URegularExpression  *regexp,
                                            UChar               *replacementText,
                                            int32_t              replacementLength,
                                            UChar              **destBuf,
                                            int32_t             *destCapacity,
                                            UErrorCode          *status);

    static inline int32_t appendTail(URegularExpression  *regexp,
                                     UChar              **destBuf,
                                     int32_t             *destCapacity,
                                     UErrorCode          *status);
};
U_NAMESPACE_END
U_CDECL_BEGIN
// Character-fetch callback handed to u_unescapeAt(): treats context as a
// UChar array and returns the element at offset.
static UChar U_CALLCONV
unescape_charAt(int32_t offset, void *context) {
    const UChar *text = (UChar *)context;
    return text[offset];
}
U_CDECL_END
// Characters given special treatment when scanning replacement text in
// RegexCImpl::appendReplacement().
static const UChar BACKSLASH = 0x5c;     // '\'
static const UChar DOLLARSIGN = 0x24;    // '$'
// Append one UChar at position *idx of buf if that position lies inside
// bufCapacity. *idx is advanced in either case, so it keeps counting the
// length the output would have had with enough room.
static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
    const int32_t position = *idx;
    *idx = position + 1;
    if (position < bufCapacity) {
        buf[position] = c;
    }
}
// Append to the destination the input text lying between the previous match
// and the current one, followed by the replacement text for the current match.
//
// Replacement text syntax handled here:
//   \x      the character x, literally. \u and \U are first offered to
//           u_unescapeAt(); if that fails the 'u'/'U' is copied literally.
//           A lone backslash at the very end is dropped.
//   $n      the text of capture group n (decimal digits, at most the
//           pattern's fMaxCaptureDigits of them). A '$' not followed by a
//           digit is copied literally.
//
//   regexp             handle with text set and a current match.
//   replacementText    replacement string; must not be NULL.
//   replacementLength  its length, or -1 when NUL terminated.
//   destBuf            in/out pointer to the write position. Advanced past
//                      what was written.
//   destCapacity       in/out remaining capacity. Reduced by what was
//                      written; set to 0 on overflow.
//   status             in/out error code. U_STRING_NOT_TERMINATED_WARNING or
//                      U_BUFFER_OVERFLOW_ERROR is set when the output exactly
//                      fills or exceeds the capacity. An incoming
//                      U_BUFFER_OVERFLOW_ERROR with *destCapacity == 0 does
//                      not suppress the operation (see below).
//
// Returns the number of UChars this call would append given enough room.
int32_t RegexCImpl::appendReplacement(URegularExpression    *regexp,
                                      UChar                 *replacementText,
                                      int32_t                replacementLength,
                                      UChar                **destBuf,
                                      int32_t               *destCapacity,
                                      UErrorCode            *status) {
    // A caller chaining append calls may arrive with an overflow already
    // recorded and no capacity left. Clear the error for the duration of this
    // call so the required length keeps being counted, and put it back
    // before returning. destCapacity is tested for NULL first because the
    // argument check only happens further down.
    UBool pendingBufferOverflow = FALSE;
    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
        pendingBufferOverflow = TRUE;
        *status = U_ZERO_ERROR;
    }

    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL || replacementLength < -1 ||
        destCapacity == NULL || destBuf == NULL ||
        (*destBuf == NULL && *destCapacity > 0) ||
        *destCapacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    RegexMatcher *m = regexp->fMatcher;
    if (m->fMatch == FALSE) {
        *status = U_REGEX_INVALID_STATE;
        return 0;
    }

    UChar    *dest     = *destBuf;
    int32_t   capacity = *destCapacity;
    int32_t   destIdx  = 0;
    int32_t   i;

    if (replacementLength == -1) {
        replacementLength = u_strlen(replacementText);
    }

    // Copy the input between the end of the previous match and the start of
    // the current one.
    for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
        appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
    }

    // Scan the replacement text, expanding escapes and group references.
    int32_t replIdx = 0;
    while (replIdx < replacementLength) {
        UChar c = replacementText[replIdx];
        replIdx++;
        if (c != DOLLARSIGN && c != BACKSLASH) {
            // Ordinary character: copy it through.
            appendToBuf(c, &destIdx, dest, capacity);
            continue;
        }

        if (c == BACKSLASH) {
            // A backslash as the final character has nothing to escape.
            if (replIdx >= replacementLength) {
                break;
            }
            c = replacementText[replIdx];

            if (c==0x55 || c==0x75) {
                // 'U' or 'u': try to read a Unicode escape. On success
                // u_unescapeAt() has moved replIdx past the escape.
                UChar32 escapedChar =
                    u_unescapeAt(unescape_charAt,
                                 &replIdx,
                                 replacementLength,
                                 replacementText);

                if (escapedChar != (UChar32)0xFFFFFFFF) {
                    if (escapedChar <= 0xffff) {
                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
                    } else {
                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
                    }
                    continue;
                }
                // Not a usable escape: fall through and copy the letter itself.
            }

            // Plain backslash escape: copy the escaped character literally.
            appendToBuf(c, &destIdx, dest, capacity);
            replIdx++;
            continue;
        }

        // A '$' was seen. Collect the decimal group number that follows.
        int32_t numDigits = 0;
        int32_t groupNum  = 0;
        UChar32 digitC;
        for (;;) {
            if (replIdx >= replacementLength) {
                break;
            }
            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
            if (u_isdigit(digitC) == FALSE) {
                break;
            }

            U16_FWD_1(replacementText, replIdx, replacementLength);
            groupNum=groupNum*10 + u_charDigitValue(digitC);
            numDigits++;
            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
                break;
            }
        }

        if (numDigits == 0) {
            // '$' without a group number is copied through unchanged.
            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
            continue;
        }

        // Append the capture group. When no room is left, pass a NULL
        // zero-capacity buffer so that no pointer beyond the end of dest is
        // ever formed; uregex_group() still reports the group's length.
        int32_t capacityRemaining = capacity - destIdx;
        if (capacityRemaining < 0) {
            capacityRemaining = 0;
        }
        UChar *groupDest = (capacityRemaining > 0) ? dest + destIdx : NULL;
        destIdx += uregex_group(regexp, groupNum, groupDest, capacityRemaining, status);

        // Overflow is re-evaluated for the whole output below.
        if (*status == U_BUFFER_OVERFLOW_ERROR) {
            *status = U_ZERO_ERROR;
        }

        if (U_FAILURE(*status)) {
            // Any other error, e.g. from a bad group number, stops the scan.
            break;
        }
    }

    // NUL terminate if there is room; otherwise report not-terminated or
    // overflow. An error left by the scan above is preserved rather than
    // being overwritten by one of these buffer codes.
    if (destIdx < capacity) {
        dest[destIdx] = 0;
    } else if (U_SUCCESS(*status)) {
        if (destIdx == capacity) {
            *status = U_STRING_NOT_TERMINATED_WARNING;
        } else {
            *status = U_BUFFER_OVERFLOW_ERROR;
        }
    }

    // Hand the updated write position and remaining capacity back.
    if (destIdx > 0 && *destCapacity > 0) {
        if (destIdx < capacity) {
            *destBuf      += destIdx;
            *destCapacity -= destIdx;
        } else {
            *destBuf      += capacity;
            *destCapacity =  0;
        }
    }

    // Restore the overflow that was set aside on entry.
    if (pendingBufferOverflow && U_SUCCESS(*status)) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    return destIdx;
}
// C API entry point; forwards all arguments unchanged to
// RegexCImpl::appendReplacement() and returns its result.
U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression  *regexp,
                         UChar               *replacementText,
                         int32_t              replacementLength,
                         UChar              **destBuf,
                         int32_t             *destCapacity,
                         UErrorCode          *status) {
    const int32_t appendedLength = RegexCImpl::appendReplacement(regexp,
                                                                 replacementText,
                                                                 replacementLength,
                                                                 destBuf,
                                                                 destCapacity,
                                                                 status);
    return appendedLength;
}
// Append to the destination the input text that follows the last match.
//
//   regexp        handle with text set.
//   destBuf       in/out pointer to the write position. Advanced past what
//                 was written.
//   destCapacity  in/out remaining capacity. Reduced by what was written;
//                 set to 0 when the output fills or exceeds it.
//   status        in/out error code. U_STRING_NOT_TERMINATED_WARNING or
//                 U_BUFFER_OVERFLOW_ERROR is set when the output exactly
//                 fills or exceeds the capacity. An incoming
//                 U_BUFFER_OVERFLOW_ERROR with *destCapacity == 0 does not
//                 suppress the operation (see below).
//
// Returns the number of UChars this call would append given enough room.
int32_t RegexCImpl::appendTail(URegularExpression    *regexp,
                               UChar                **destBuf,
                               int32_t               *destCapacity,
                               UErrorCode            *status) {
    // A caller chaining append calls may arrive with an overflow already
    // recorded and no capacity left. Clear the error for the duration of this
    // call so the required length keeps being counted, and put it back
    // before returning. destCapacity is tested for NULL first because the
    // argument check only happens further down.
    UBool pendingBufferOverflow = FALSE;
    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
        pendingBufferOverflow = TRUE;
        *status = U_ZERO_ERROR;
    }

    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (destCapacity == NULL || destBuf == NULL ||
        (*destBuf == NULL && *destCapacity > 0) ||
        *destCapacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Start copying at the end of the current match if there is one,
    // otherwise at the end of the most recent match, or at the beginning of
    // the input when fLastMatchEnd is still -1.
    RegexMatcher *m = regexp->fMatcher;
    int32_t srcIdx;
    if (m->fMatch) {
        srcIdx = m->fMatchEnd;
    } else {
        srcIdx = m->fLastMatchEnd;
        if (srcIdx == -1) {
            srcIdx = 0;
        }
    }

    int32_t  destIdx = 0;
    int32_t  destCap = *destCapacity;
    UChar   *dest    = *destBuf;

    for (;;) {
        // Stop at the explicit end of the input ...
        if (srcIdx == regexp->fTextLength) {
            break;
        }
        // ... or at the terminator when the input was given with length -1.
        UChar c = regexp->fText[srcIdx];
        if (c == 0 && regexp->fTextLength == -1) {
            break;
        }
        if (destIdx < destCap) {
            dest[destIdx] = c;
        } else {
            // Out of room. With a known input length the rest can be counted
            // in one step; a NUL-terminated input has to be walked to its end.
            if (regexp->fTextLength > 0) {
                destIdx += (regexp->fTextLength - srcIdx);
                break;
            }
        }
        srcIdx++;
        destIdx++;
    }

    // NUL terminate if there is room; otherwise report not-terminated or
    // overflow.
    if (destIdx < destCap) {
        dest[destIdx] = 0;
    } else if (destIdx == destCap) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    // Hand the updated write position and remaining capacity back.
    if (destIdx < destCap) {
        *destBuf      += destIdx;
        *destCapacity -= destIdx;
    } else {
        *destBuf      += destCap;
        *destCapacity  = 0;
    }

    // Restore the overflow that was set aside on entry.
    if (pendingBufferOverflow && U_SUCCESS(*status)) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    return destIdx;
}
// C API entry point; forwards all arguments unchanged to
// RegexCImpl::appendTail() and returns its result.
U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression  *regexp,
                  UChar              **destBuf,
                  int32_t             *destCapacity,
                  UErrorCode          *status) {
    const int32_t appendedLength = RegexCImpl::appendTail(regexp,
                                                          destBuf,
                                                          destCapacity,
                                                          status);
    return appendedLength;
}
// Copy srcLen UChars from srcPtr into destBuffer starting at *destIndex and
// follow them with a NUL, never writing at or beyond destCapacity.
//
//   destBuffer    destination buffer.
//   destCapacity  total capacity of destBuffer, in UChars.
//   destIndex     in/out index into destBuffer. On return it has advanced by
//                 srcLen + 1 (string plus terminator) and is NOT clipped to
//                 destCapacity, so the caller can tell how much space the
//                 output needed.
//   srcPtr        source characters.
//   srcLen        number of source characters.
static void copyString(UChar *destBuffer, int32_t destCapacity, int32_t *destIndex, const UChar *srcPtr, int32_t srcLen) {
    int32_t di = *destIndex;

    for (int32_t si=0; si<srcLen; si++) {
        if (di < destCapacity) {
            destBuffer[di] = srcPtr[si];
            di++;
        } else {
            // No room left: account for the rest of the source and stop.
            di += srcLen - si;
            break;
        }
    }

    // Store the terminator only when it fits, but always count it.
    if (di < destCapacity) {
        destBuffer[di] = 0;
    }
    di++;

    *destIndex = di;
}
// Split the input text around matches of the pattern.
//
// Each field is copied, NUL terminated, into destBuf, and a pointer to its
// start is stored in destFields. After each delimiter match the text of every
// capture group is stored as a field of its own. When the fields run out, the
// final one receives all remaining input. Unused entries of destFields are
// set to NULL.
//
//   destBuf             buffer receiving the field text; may be NULL only
//                       when destCapacity is 0.
//   destCapacity        size of destBuf in UChars; must not be negative.
//   requiredCapacity    optional; receives the buffer size needed to hold
//                       all fields and their terminators.
//   destFields          array receiving the field pointers; must not be NULL.
//   destFieldsCapacity  number of entries in destFields; must be at least 1.
//   status              in/out error code; U_BUFFER_OVERFLOW_ERROR is set
//                       when the needed size exceeds destCapacity.
//
// Returns the number of fields produced. Returns 0 on a validation or
// argument error, and also for empty input, in which case destFields and
// *requiredCapacity are left untouched.
U_CAPI int32_t U_EXPORT2
uregex_split( URegularExpression  *regexp,
              UChar               *destBuf,
              int32_t              destCapacity,
              int32_t             *requiredCapacity,
              UChar               *destFields[],
              int32_t              destFieldsCapacity,
              UErrorCode          *status) {
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if ((destBuf == NULL && destCapacity > 0) ||
        destCapacity < 0 ||
        destFields == NULL ||
        destFieldsCapacity < 1 ) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Always scan from the beginning of the input.
    regexp->fMatcher->reset();
    int32_t inputLen = regexp->fTextString.length();
    int32_t nextOutputStringStart = 0;
    if (inputLen == 0) {
        return 0;
    }

    int32_t i;                 // index of the field being filled
    int32_t destIdx = 0;       // next write position in destBuf; not clipped to destCapacity
    int32_t numCaptureGroups = regexp->fMatcher->groupCount();
    for (i=0; ; i++) {
        if (i>=destFieldsCapacity-1) {
            // One or zero fields are left: put whatever remains of the input
            // into the last field and stop.
            int32_t remainingLength = inputLen-nextOutputStringStart;
            if (i >= destFieldsCapacity) {
                // Capture groups already used the last entry. Reuse it,
                // rewinding destIdx to where that field began. This relies on
                // destFields[0] pointing at the start of destBuf.
                i = destFieldsCapacity-1;
                destIdx = (int32_t)(destFields[i] - destFields[0]);
            }

            destFields[i] = &destBuf[destIdx];
            copyString(destBuf, destCapacity, &destIdx,
                       &regexp->fText[nextOutputStringStart], remainingLength);
            break;
        }

        if (regexp->fMatcher->find()) {
            // Found a delimiter: the field is the text between the previous
            // delimiter (or the start of input) and this match.
            int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
            destFields[i] = &destBuf[destIdx];
            copyString(destBuf, destCapacity, &destIdx,
                       &regexp->fText[nextOutputStringStart], fieldLen);
            nextOutputStringStart = regexp->fMatcher->end(*status);

            // Each capture group of the delimiter becomes a field as well,
            // for as long as entries remain.
            int32_t groupNum;
            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
                if (i==destFieldsCapacity-1) {
                    break;
                }
                i++;

                // Errors from uregex_group() go to a scratch status; running
                // out of space is detected from destIdx at the end.
                UErrorCode tStatus = U_ZERO_ERROR;
                int32_t remainingCapacity = destCapacity - destIdx;
                if (remainingCapacity < 0) {
                    remainingCapacity = 0;
                }
                destFields[i] = &destBuf[destIdx];
                int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
                destIdx += t + 1;    // group text plus its terminator
            }

            if (nextOutputStringStart == inputLen) {
                // The delimiter ended the input; there is no further field.
                break;
            }
        } else {
            // No more delimiters: the rest of the input is the last field.
            destFields[i] = &destBuf[destIdx];
            copyString(destBuf, destCapacity, &destIdx,
                       &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
            break;
        }
    }

    // Clear the entries that were not used.
    for (int32_t j=i+1; j<destFieldsCapacity; j++) {
        destFields[j] = NULL;
    }

    // requiredCapacity is optional, so the overflow test uses destIdx itself.
    if (requiredCapacity != NULL) {
        *requiredCapacity = destIdx;
    }
    if (destIdx > destCapacity) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    return i+1;
}
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS