--- regcomp.c.orig 2010-06-21 14:05:04.000000000 -0700 +++ regcomp.c 2010-06-21 14:23:51.000000000 -0700 @@ -39,6 +39,8 @@ static char sccsid[] = "@(#)regcomp.c 8. #include <sys/cdefs.h> __FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.36 2007/06/11 03:05:54 delphij Exp $"); +#include "xlocale_private.h" + #include <sys/types.h> #include <stdio.h> #include <string.h> @@ -69,6 +71,9 @@ struct parse { sopno ssize; /* malloced strip size (allocated) */ sopno slen; /* malloced strip length (used) */ int ncsalloc; /* number of csets allocated */ +#if __DARWIN_UNIX03 + int zerorepeats; +#endif /* __DARWIN_UNIX03 */ struct re_guts *g; # define NPAREN 10 /* we need to remember () 1-9 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ @@ -93,7 +98,7 @@ static void p_b_cclass(struct parse *p, static void p_b_eclass(struct parse *p, cset *cs); static wint_t p_b_symbol(struct parse *p); static wint_t p_b_coll_elem(struct parse *p, wint_t endc); -static wint_t othercase(wint_t ch); +static wint_t othercase(wint_t ch, locale_t loc); static void bothcases(struct parse *p, wint_t ch); static void ordinary(struct parse *p, wint_t ch); static void nonnewline(struct parse *p); @@ -104,7 +109,7 @@ static void freeset(struct parse *p, cse static void CHadd(struct parse *p, cset *cs, wint_t ch); static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); -static wint_t singleton(cset *cs); +static wint_t singleton(cset *cs, locale_t loc); static sopno dupl(struct parse *p, sopno start, sopno finish); static void doemit(struct parse *p, sop op, size_t opnd); static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); @@ -222,10 +227,14 @@ regcomp(regex_t * __restrict preg, p->end = p->next + len; p->error = 0; p->ncsalloc = 0; +#if __DARWIN_UNIX03 + p->zerorepeats = 0; +#endif /* __DARWIN_UNIX03 */ for (i = 0; i < NPAREN; i++) { p->pbegin[i] = 0; p->pend[i] = 0; } + g->loc = 
__current_locale(); g->sets = NULL; g->ncsets = 0; g->cflags = cflags; @@ -302,8 +311,12 @@ p_ere(struct parse *p, conc = HERE(); while (MORE() && (c = PEEK()) != '|' && c != stop) p_ere_exp(p); +#if __DARWIN_UNIX03 + if (!p->zerorepeats) REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ + else p->zerorepeats--; +#else (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ - +#endif if (!EAT('|')) break; /* NOTE BREAK OUT */ @@ -410,7 +423,7 @@ p_ere_exp(struct parse *p) ordinary(p, wc); break; case '{': /* okay as ordinary except if digit follows */ - (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); + (void)REQUIRE(!MORE() || !isdigit_l((uch)PEEK(), p->g->loc), REG_BADRPT); /* FALLTHROUGH */ default: p->next--; @@ -424,7 +437,7 @@ p_ere_exp(struct parse *p) c = PEEK(); /* we call { a repetition if followed by a digit */ if (!( c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) + (c == '{' && MORE2() && isdigit_l((uch)PEEK2(), p->g->loc)) )) return; /* no repetition, we're done */ NEXT(); @@ -453,7 +466,7 @@ p_ere_exp(struct parse *p) case '{': count = p_count(p); if (EAT(',')) { - if (isdigit((uch)PEEK())) { + if (isdigit_l((uch)PEEK(), p->g->loc)) { count2 = p_count(p); (void)REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ @@ -474,7 +487,7 @@ p_ere_exp(struct parse *p) return; c = PEEK(); if (!( c == '*' || c == '+' || c == '?' 
|| - (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) + (c == '{' && MORE2() && isdigit_l((uch)PEEK2(), p->g->loc)) ) ) return; SETERROR(REG_BADRPT); } @@ -486,7 +499,12 @@ p_ere_exp(struct parse *p) static void p_str(struct parse *p) { +#if __DARWIN_UNIX03 + if (!p->zerorepeats) REQUIRE(MORE(), REG_EMPTY); + else p->zerorepeats--; +#else /* !__DARWIN_UNIX03 */ (void)REQUIRE(MORE(), REG_EMPTY); +#endif /* __DARWIN_UNIX03 */ while (MORE()) ordinary(p, WGETNEXT()); } @@ -525,8 +543,12 @@ p_bre(struct parse *p, p->g->iflags |= USEEOL; p->g->neol++; } - +#if __DARWIN_UNIX03 + if (!p->zerorepeats) REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ + else p->zerorepeats--; +#else /* !__DARWIN_UNIX03 */ (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ +#endif /* __DARWIN_UNIX03 */ } /* @@ -599,12 +621,22 @@ p_simp_re(struct parse *p, i = (c&~BACKSL) - '0'; assert(i < NPAREN); if (p->pend[i] != 0) { +#if __DARWIN_UNIX03 + int skip = 1; +#endif /* __DARWIN_UNIX03 */ assert(i <= p->g->nsub); EMIT(OBACK_, i); assert(p->pbegin[i] != 0); assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); assert(OP(p->strip[p->pend[i]]) == ORPAREN); +#if __DARWIN_UNIX03 + if (OP(p->strip[p->pbegin[i]+skip]) == OBOL) { + skip++; /* don't dup anchor in subexp */ + } + (void) dupl(p, p->pbegin[i]+skip, p->pend[i]); +#else /* !__DARWIN_UNIX03 */ (void) dupl(p, p->pbegin[i]+1, p->pend[i]); +#endif /* __DARWIN_UNIX03 */ EMIT(O_BACK, i); } else SETERROR(REG_ESUBREG); @@ -627,9 +659,10 @@ p_simp_re(struct parse *p, INSERT(OQUEST_, pos); ASTERN(O_QUEST, pos); } else if (EATTWO('\\', '{')) { + (void)REQUIRE(MORE(), REG_EBRACE); count = p_count(p); if (EAT(',')) { - if (MORE() && isdigit((uch)PEEK())) { + if (MORE() && isdigit_l((uch)PEEK(), p->g->loc)) { count2 = p_count(p); (void)REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ @@ -659,7 +692,7 @@ p_count(struct parse *p) int count = 0; int ndigits = 0; - while (MORE() && isdigit((uch)PEEK()) && count <= 
DUPMAX) { + while (MORE() && isdigit_l((uch)PEEK(), p->g->loc) && count <= DUPMAX) { count = count*10 + (GETNEXT() - '0'); ndigits++; } @@ -697,10 +730,22 @@ p_bracket(struct parse *p) cs->icase = 1; if (EAT('^')) cs->invert = 1; +#if __DARWIN_UNIX03 + if (PEEK2() != '-' && PEEK2() != ']') { /* Don't eat '-' or ']' if they're part of ranges + * but do process [^-] */ if (EAT(']')) CHadd(p, cs, ']'); else if (EAT('-')) CHadd(p, cs, '-'); + } + if (MORE() && !SEETWO('-',']')) /* Parse RE []-'] */ + p_b_term(p, cs); +#else /* !__DARWIN_UNIX03 */ + if (EAT(']')) + CHadd(p, cs, ']'); + else if (EAT('-')) + CHadd(p, cs, '-'); +#endif /* __DARWIN_UNIX03 */ while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) p_b_term(p, cs); if (EAT('-')) @@ -713,7 +758,7 @@ p_bracket(struct parse *p) if (cs->invert && p->g->cflags&REG_NEWLINE) cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); - if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ + if ((ch = singleton(cs, p->g->loc)) != OUT) { /* optimize singleton sets */ ordinary(p, ch); freeset(p, cs); } else @@ -737,8 +782,16 @@ p_b_term(struct parse *p, cset *cs) c = (MORE2()) ? 
PEEK2() : '\0'; break; case '-': +#if __DARWIN_UNIX03 + if (PEEK2() != '-') { /* Allow [---] */ + SETERROR(REG_ERANGE); + return; /* NOTE RETURN */ + } else + c = '-'; +#else /* !__DARWIN_UNIX03 */ SETERROR(REG_ERANGE); return; /* NOTE RETURN */ +#endif /* __DARWIN_UNIX03 */ break; default: c = '\0'; @@ -759,7 +812,11 @@ p_b_term(struct parse *p, cset *cs) NEXT2(); (void)REQUIRE(MORE(), REG_EBRACK); c = PEEK(); +#if __DARWIN_UNIX03 + REQUIRE(c != '-', REG_ECOLLATE); /* allow [=]=] */ +#else /* !__DARWIN_UNIX03 */ (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); +#endif /* __DARWIN_UNIX03 */ p_b_eclass(p, cs); (void)REQUIRE(MORE(), REG_EBRACK); (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); @@ -778,14 +835,14 @@ p_b_term(struct parse *p, cset *cs) if (start == finish) CHadd(p, cs, start); else { - if (__collate_load_error) { + if (p->g->loc->__collate_load_error) { (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); CHaddrange(p, cs, start, finish); } else { - (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); + (void)REQUIRE(__collate_range_cmp(start, finish, p->g->loc) <= 0, REG_ERANGE); for (i = 0; i <= UCHAR_MAX; i++) { - if ( __collate_range_cmp(start, i) <= 0 - && __collate_range_cmp(i, finish) <= 0 + if ( __collate_range_cmp(start, i, p->g->loc) <= 0 + && __collate_range_cmp(i, finish, p->g->loc) <= 0 ) CHadd(p, cs, i); } @@ -807,7 +864,7 @@ p_b_cclass(struct parse *p, cset *cs) wctype_t wct; char clname[16]; - while (MORE() && isalpha((uch)PEEK())) + while (MORE() && isalpha_l((uch)PEEK(), p->g->loc)) NEXT(); len = p->next - sp; if (len >= sizeof(clname) - 1) { @@ -816,7 +873,7 @@ p_b_cclass(struct parse *p, cset *cs) } memcpy(clname, sp, len); clname[len] = '\0'; - if ((wct = wctype(clname)) == 0) { + if ((wct = wctype_l(clname, p->g->loc)) == 0) { SETERROR(REG_ECTYPE); return; } @@ -826,14 +883,38 @@ p_b_cclass(struct parse *p, cset *cs) /* - p_b_eclass - parse an equivalence-class name and deal with it == static void p_b_eclass(struct 
parse *p, cset *cs); - * - * This implementation is incomplete. xxx */ static void p_b_eclass(struct parse *p, cset *cs) { - wint_t c; - + char *sp = p->next; + int len, ec; + mbstate_t mbs; + int *newequiv_classes; + wint_t c; + + while (MORE() && !SEETWO('=', ']')) + NEXT(); + if (!MORE()) { + SETERROR(REG_EBRACK); + return; + } + len = p->next - sp; + memset(&mbs, 0, sizeof(mbs)); + ec = __collate_equiv_class(sp, len, &mbs, p->g->loc); + if (ec > 0) { + newequiv_classes = realloc(cs->equiv_classes, + (cs->nequiv_classes + 1) * sizeof(*cs->equiv_classes)); + if (newequiv_classes == NULL) { + SETERROR(REG_ESPACE); + return; + } + cs->equiv_classes = newequiv_classes; + cs->equiv_classes[cs->nequiv_classes++] = ec; + return; + } + /* not an equivalence class, so fallback to a collating element */ + p->next = sp; c = p_b_coll_elem(p, '='); CHadd(p, cs, c); } @@ -866,10 +947,10 @@ p_b_coll_elem(struct parse *p, wint_t endc) /* name ended by endc,']' */ { char *sp = p->next; - struct cname *cp; + const struct cname *cp; int len; mbstate_t mbs; - wchar_t wc; + wchar_t wbuf[16]; size_t clen; while (MORE() && !SEETWO(endc, ']')) @@ -883,9 +964,10 @@ p_b_coll_elem(struct parse *p, if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') return(cp->code); /* known name */ memset(&mbs, 0, sizeof(mbs)); - if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) - return (wc); /* single character */ - else if (clen == (size_t)-1 || clen == (size_t)-2) + clen = __collate_collating_symbol(wbuf, 16, sp, len, &mbs, p->g->loc); + if (clen == 1) + return (*wbuf); /* single character */ + else if (clen == (size_t)-1) SETERROR(REG_ILLSEQ); else SETERROR(REG_ECOLLATE); /* neither */ @@ -894,16 +976,16 @@ p_b_coll_elem(struct parse *p, /* - othercase - return the case counterpart of an alphabetic - == static char othercase(int ch); + == static char othercase(wint_t ch, locale_t loc); */ static wint_t /* if no counterpart, return ch */ -othercase(wint_t ch) +othercase(wint_t ch, locale_t 
loc) { - assert(iswalpha(ch)); - if (iswupper(ch)) - return(towlower(ch)); - else if (iswlower(ch)) - return(towupper(ch)); + assert(iswalpha_l(ch, loc)); + if (iswupper_l(ch, loc)) + return(towlower_l(ch, loc)); + else if (iswlower_l(ch, loc)) + return(towupper_l(ch, loc)); else /* peculiar, but could happen */ return(ch); } @@ -923,10 +1005,10 @@ bothcases(struct parse *p, wint_t ch) size_t n; mbstate_t mbs; - assert(othercase(ch) != ch); /* p_bracket() would recurse */ + assert(othercase(ch, p->g->loc) != ch); /* p_bracket() would recurse */ p->next = bracket; memset(&mbs, 0, sizeof(mbs)); - n = wcrtomb(bracket, ch, &mbs); + n = wcrtomb_l(bracket, ch, &mbs, p->g->loc); assert(n != (size_t)-1); bracket[n] = ']'; bracket[n + 1] = '\0'; @@ -946,7 +1028,7 @@ ordinary(struct parse *p, wint_t ch) { cset *cs; - if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch) + if ((p->g->cflags&REG_ICASE) && iswalpha_l(ch, p->g->loc) && othercase(ch, p->g->loc) != ch) bothcases(p, ch); else if ((ch & OPDMASK) == ch) EMIT(OCHAR, ch); @@ -1012,10 +1094,22 @@ repeat(struct parse *p, switch (REP(MAP(from), MAP(to))) { case REP(0, 0): /* must be user doing this */ DROP(finish-start); /* drop the operand */ +#if __DARWIN_UNIX03 + p->zerorepeats++; +#endif /* __DARWIN_UNIX03 */ break; + case REP(0, INF): /* as x{1,}? */ +#if __DARWIN_UNIX03 + /* this case does not require the (y|) trick, noKLUDGE */ + /* Just like * =+? */ + INSERT(OPLUS_, start); + ASTERN(O_PLUS, start); + INSERT(OQUEST_, start); + ASTERN(O_QUEST, start); + break; +#endif /* __DARWIN_UNIX03 */ case REP(0, 1): /* as x{1,1}? */ case REP(0, N): /* as x{1,n}? */ - case REP(0, INF): /* as x{1,}? */ /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ INSERT(OCH_, start); /* offset is wrong... 
*/ repeat(p, start+1, 1, to); @@ -1029,6 +1123,10 @@ repeat(struct parse *p, /* done */ break; case REP(1, N): /* as x?x{1,n-1} */ +#if __DARWIN_UNIX03 + INSERT(OQUEST_, start); + ASTERN(O_QUEST, start); +#else /* !__DARWIN_UNIX03 */ /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ INSERT(OCH_, start); ASTERN(OOR1, start); @@ -1036,6 +1134,7 @@ repeat(struct parse *p, EMIT(OOR2, 0); /* offset very wrong... */ AHEAD(THERE()); /* ...so fix it */ ASTERN(O_CH, THERETHERE()); +#endif /* __DARWIN_UNIX03 */ copy = dupl(p, start+1, finish+1); assert(copy == finish+4); repeat(p, copy, 1, to-1); @@ -1071,7 +1170,7 @@ wgetnext(struct parse *p) size_t n; memset(&mbs, 0, sizeof(mbs)); - n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); + n = mbrtowc_l(&wc, p->next, p->end - p->next, &mbs, p->g->loc); if (n == (size_t)-1 || n == (size_t)-2) { SETERROR(REG_ILLSEQ); return (0); @@ -1139,12 +1238,12 @@ freeset(struct parse *p, cset *cs) - returning it if so, otherwise returning OUT. */ static wint_t -singleton(cset *cs) +singleton(cset *cs, locale_t loc) { wint_t i, s, n; for (i = n = 0; i < NC; i++) - if (CHIN(cs, i)) { + if (CHIN(cs, i, loc)) { n++; s = i; } @@ -1178,9 +1277,9 @@ CHadd(struct parse *p, cset *cs, wint_t cs->wides[cs->nwides++] = ch; } if (cs->icase) { - if ((nch = towlower(ch)) < NC) + if ((nch = towlower_l(ch, p->g->loc)) < NC) cs->bmp[nch >> 3] |= 1 << (nch & 7); - if ((nch = towupper(ch)) < NC) + if ((nch = towupper_l(ch, p->g->loc)) < NC) cs->bmp[nch >> 3] |= 1 << (nch & 7); } } @@ -1219,7 +1318,7 @@ CHaddtype(struct parse *p, cset *cs, wct wctype_t *newtypes; for (i = 0; i < NC; i++) - if (iswctype(i, wct)) + if (iswctype_l(i, wct, p->g->loc)) CHadd(p, cs, i); newtypes = realloc(cs->types, (cs->ntypes + 1) * sizeof(*cs->types)); @@ -1391,6 +1490,7 @@ findmust(struct parse *p, struct re_guts char buf[MB_LEN_MAX]; size_t clen; mbstate_t mbs; + struct __xlocale_st_runelocale *rl = p->g->loc->__lc_ctype; /* avoid making error situations worse */ if 
(p->error != 0) @@ -1401,8 +1501,8 @@ findmust(struct parse *p, struct re_guts * multibyte character strings, but it's safe for at least * UTF-8 (see RFC 3629). */ - if (MB_CUR_MAX > 1 && - strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) + if (rl->__mb_cur_max > 1 && + strcmp(rl->_CurrentRuneLocale.__encoding, "UTF-8") != 0) return; /* find the longest OCHAR sequence in strip */ @@ -1418,7 +1518,7 @@ findmust(struct parse *p, struct re_guts memset(&mbs, 0, sizeof(mbs)); newstart = scan - 1; } - clen = wcrtomb(buf, OPND(s), &mbs); + clen = wcrtomb_l(buf, OPND(s), &mbs, p->g->loc); if (clen == (size_t)-1) goto toohard; newlen += clen; @@ -1537,7 +1637,7 @@ findmust(struct parse *p, struct re_guts while (cp < g->must + g->mlen) { while (OP(s = *scan++) != OCHAR) continue; - clen = wcrtomb(cp, OPND(s), &mbs); + clen = wcrtomb_l(cp, OPND(s), &mbs, p->g->loc); assert(clen != (size_t)-1); cp += clen; }