#ifndef _GNU_SOURCE
# define _GNU_SOURCE 1
#endif
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <assert.h>
#include <sys/types.h>
#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
# define MBS_SUPPORT
# include <wchar.h>
# include <wctype.h>
#endif
#include "system.h"
#include "grep.h"
#include "regex.h"
#include "dfa.h"
#include "kwset.h"
#include "error.h"
#include "xalloc.h"
#ifdef HAVE_LIBPCRE
# include <pcre.h>
#endif
#ifdef HAVE_LANGINFO_CODESET
# include <langinfo.h>
#endif
/* Number of distinct values an unsigned char can take; used to size
   byte-indexed tables such as the case-folding table in kwsinit().  */
#define NCHAR (UCHAR_MAX + 1)
/* Nonzero if C counts as a "word" constituent: alphanumeric per ISALNUM,
   or an underscore.  Callers in this file cast to unsigned char first.  */
#define WCHAR(C) (ISALNUM(C) || (C) == '_')
/* DFA filled in by dfacomp() in Gcompile/Ecompile and run by dfaexec()
   in EGexecute.  */
static struct dfa dfa;
/* One compiled regex together with its match registers.  patterns0 has
   static storage and is therefore zero-initialized; it is copied into
   every freshly allocated slot of the `patterns' array.  */
static struct patterns
{
struct re_pattern_buffer regexbuf;
struct re_registers regs;
} patterns0;
/* Array of compiled regexes (one per newline-separated pattern), grown
   with realloc() by Gcompile/Ecompile, and the number of valid entries.  */
struct patterns *patterns;
size_t pcount;
/* Keyword set used by EGexecute and Fexecute; remains null unless
   kwsinit() or Fcompile() creates one.  */
static kwset_t kwset;
/* Count of "exact" must-strings added to kwset by kwsmusts().  They are
   added first, so a kwset hit whose index is below this count is accepted
   by EGexecute without further matching.  */
static int kwset_exact_matches;
/* Set to 1 by check_utf8() when the locale codeset is "UTF-8".  */
static int using_utf8;
/* Forward declarations of the file-local helpers and of the
   compile/execute entry points referenced by the `matchers' table.  */
static void kwsinit PARAMS ((void));
static void kwsmusts PARAMS ((void));
static void Gcompile PARAMS ((char const *, size_t));
static void Ecompile PARAMS ((char const *, size_t));
static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
static void Fcompile PARAMS ((char const *, size_t));
static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
static void Pcompile PARAMS ((char const *, size_t ));
static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
/* Record whether the current locale uses the UTF-8 codeset.

   When nl_langinfo (CODESET) is available and reports exactly "UTF-8",
   the file-scope flag using_utf8 is set to 1.  The flag is never cleared
   here, and nothing happens when HAVE_LANGINFO_CODESET is undefined.  */
void
check_utf8 (void)
{
#ifdef HAVE_LANGINFO_CODESET
  char const *codeset = nl_langinfo (CODESET);

  if (!strcmp (codeset, "UTF-8"))
    using_utf8 = 1;
#endif
}
/* Report MESG as a fatal error through error() with exit status 2.

   MESG is passed as an argument to a literal "%s" format rather than as
   the format itself, so any '%' characters it contains are printed
   verbatim instead of being interpreted as conversion specifications.  */
void
dfaerror (char const *mesg)
{
  error (2, 0, "%s", mesg);
}
/* Allocate the file-scope keyword set `kwset'.

   When match_icase is set, a static NCHAR-entry table mapping every byte
   value to TOLOWER of that value is built and handed to kwsalloc();
   otherwise kwsalloc() receives a null translation table.  Allocation
   failure is fatal (exit status 2).  */
static void
kwsinit (void)
{
  static char trans[NCHAR];
  char *table = (char *) 0;

  if (match_icase)
    {
      int c = 0;

      while (c < NCHAR)
        {
          trans[c] = TOLOWER (c);
          ++c;
        }
      table = trans;
    }

  kwset = kwsalloc (table);
  if (!kwset)
    error (2, 0, _("memory exhausted"));
}
static void
kwsmusts (void)
{
struct dfamust const *dm;
char const *err;
if (dfa.musts)
{
kwsinit ();
for (dm = dfa.musts; dm; dm = dm->next)
{
if (!dm->exact)
continue;
++kwset_exact_matches;
if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
error (2, 0, err);
}
for (dm = dfa.musts; dm; dm = dm->next)
{
if (dm->exact)
continue;
if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
error (2, 0, err);
}
if ((err = kwsprep (kwset)) != 0)
error (2, 0, err);
}
}
/* Compile PATTERN (SIZE bytes) using the RE_SYNTAX_GREP syntax with
   RE_HAT_LISTS_NOT_NEWLINE added.

   PATTERN may hold several patterns separated by newlines.  Each one is
   compiled with re_compile_pattern() into a new slot appended to the
   `patterns' array (pcount is incremented per slot).  The whole PATTERN,
   wrapped in line or word anchors when match_lines or match_words is
   set, is then compiled into the DFA, and kwsmusts() builds the keyword
   set from the DFA's must-strings.

   All failures are fatal (exit status 2).  Regex error text is printed
   through a literal "%s" format so it cannot be interpreted as a format
   string.  */
static void
Gcompile (char const *pattern, size_t size)
{
  const char *err;
  char const *sep;
  size_t total = size;
  char const *motif = pattern;

  check_utf8 ();
  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);

  /* Split on newlines and compile each piece separately for the regex
     matcher.  MOTIF is the current piece, TOTAL the bytes still left
     from MOTIF to the end of PATTERN.  */
  do
    {
      size_t len;

      sep = memchr (motif, '\n', total);
      if (sep)
        {
          len = sep - motif;
          sep++;
          total -= (len + 1);
        }
      else
        {
          len = total;
          total = 0;
        }

      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
      if (patterns == NULL)
        error (2, errno, _("memory exhausted"));
      /* Start every slot from the zero-initialized template.  */
      patterns[pcount] = patterns0;

      if ((err = re_compile_pattern (motif, len,
                                     &(patterns[pcount].regexbuf))) != 0)
        error (2, 0, "%s", err);
      pcount++;

      motif = sep;
    }
  while (sep && total != 0);

  if (match_words || match_lines)
    {
      /* Wrap the pattern for the DFA only; the regex copies above are
         left unwrapped.  The buffer is sized for the longer (word) pair
         of wrappers plus the terminating NUL.  */
      static char const line_beg[] = "^\\(";
      static char const line_end[] = "\\)$";
      static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
      static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
      size_t i;

      strcpy (n, match_lines ? line_beg : word_beg);
      i = strlen (n);
      memcpy (n + i, pattern, size);
      i += size;
      strcpy (n + i, match_lines ? line_end : word_end);
      i += strlen (n + i);
      /* NOTE(review): N is not freed in this function.  */
      pattern = n;
      size = i;
    }

  dfacomp (pattern, size, &dfa, 1);
  kwsmusts ();
}
/* Compile PATTERN (SIZE bytes) as an extended regular expression.

   The syntax is RE_SYNTAX_AWK when the global `matcher' equals "awk",
   and RE_SYNTAX_POSIX_EGREP otherwise.  PATTERN may hold several patterns
   separated by newlines.  Each one is compiled with re_compile_pattern()
   into a new slot appended to the `patterns' array.  The whole PATTERN,
   wrapped in line or word anchors when match_lines or match_words is
   set, is then compiled into the DFA, and kwsmusts() builds the keyword
   set from the DFA's must-strings.

   All failures are fatal (exit status 2).  Regex error text is printed
   through a literal "%s" format so it cannot be interpreted as a format
   string.  */
static void
Ecompile (char const *pattern, size_t size)
{
  const char *err;
  const char *sep;
  size_t total = size;
  char const *motif = pattern;

  check_utf8 ();
  if (strcmp (matcher, "awk") == 0)
    {
      re_set_syntax (RE_SYNTAX_AWK);
      dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
    }
  else
    {
      re_set_syntax (RE_SYNTAX_POSIX_EGREP);
      dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
    }

  /* Split on newlines and compile each piece separately for the regex
     matcher.  MOTIF is the current piece, TOTAL the bytes still left
     from MOTIF to the end of PATTERN.  */
  do
    {
      size_t len;

      sep = memchr (motif, '\n', total);
      if (sep)
        {
          len = sep - motif;
          sep++;
          total -= (len + 1);
        }
      else
        {
          len = total;
          total = 0;
        }

      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
      if (patterns == NULL)
        error (2, errno, _("memory exhausted"));
      /* Start every slot from the zero-initialized template.  */
      patterns[pcount] = patterns0;

      if ((err = re_compile_pattern (motif, len,
                                     &(patterns[pcount].regexbuf))) != 0)
        error (2, 0, "%s", err);
      pcount++;

      motif = sep;
    }
  while (sep && total != 0);

  if (match_words || match_lines)
    {
      /* Wrap the pattern for the DFA only; the regex copies above are
         left unwrapped.  The buffer is sized for the longer (word) pair
         of wrappers plus the terminating NUL.  */
      static char const line_beg[] = "^(";
      static char const line_end[] = ")$";
      static char const word_beg[] = "(^|[^[:alnum:]_])(";
      static char const word_end[] = ")([^[:alnum:]_]|$)";
      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
      size_t i;

      strcpy (n, match_lines ? line_beg : word_beg);
      i = strlen (n);
      memcpy (n + i, pattern, size);
      i += size;
      strcpy (n + i, match_lines ? line_end : word_end);
      i += strlen (n + i);
      /* NOTE(review): N is not freed in this function.  */
      pattern = n;
      size = i;
    }

  dfacomp (pattern, size, &dfa, 1);
  kwsmusts ();
}
/* Search BUF (SIZE bytes) with the patterns compiled by Gcompile or
   Ecompile.

   Returns the offset of the match from BUF and stores its length in
   *MATCH_SIZE, or returns (size_t) -1 when nothing matches.

   When EXACT is zero, the keyword set (if any) and the DFA are used to
   locate a candidate line, which is then confirmed with the regex
   matcher when required; the reported region runs from the start of that
   line through its EOL byte (or the end of the buffer).  When EXACT is
   nonzero, the keyword set and DFA are skipped and the regexes are run
   over BUF directly; without match_words the offset and length of the
   regex match itself are reported.

   NOTE(review): `ret_val' is declared but never used in this function.  */
static size_t
EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
register char const *buflim, *beg, *end;
char eol = eolbyte;
int backref, start, len;
struct kwsmatch kwsm;
size_t i, ret_val;
#ifdef MBS_SUPPORT
int mb_cur_max = MB_CUR_MAX;
mbstate_t mbs;
memset (&mbs, '\0', sizeof (mbstate_t));
#endif
buflim = buf + size;
/* Each iteration examines one candidate region [beg, end) and resumes
   at its end.  */
for (beg = end = buf; end < buflim; beg = end)
{
if (!exact)
{
if (kwset)
{
/* Keyword-set path: look for any must-string first.  */
#ifdef MBS_SUPPORT
size_t bytes_left = 0;
#endif
size_t offset;
#ifdef MBS_SUPPORT
/* With case folding in a multibyte locale the keyword search is not
   run; offset 0 leaves BEG where it is (and leaves KWSM unset).  */
if (match_icase && mb_cur_max > 1)
offset = 0;
else
#endif
offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
if (offset == (size_t) -1)
goto failure;
#ifdef MBS_SUPPORT
if (mb_cur_max > 1 && !using_utf8)
{
/* Advance BEG over the OFFSET bytes one multibyte character at a time.
   Invalid or null sequences are stepped over a byte at a time; an
   incomplete sequence (-2) stops the walk with bytes_left nonzero.  */
bytes_left = offset;
while (bytes_left)
{
size_t mlen = mbrlen (beg, bytes_left, &mbs);
if (mlen == (size_t) -1 || mlen == 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
beg++;
bytes_left--;
continue;
}
if (mlen == (size_t) -2)
break;
beg += mlen;
bytes_left -= mlen;
}
}
else
#endif
beg += offset;
/* END becomes one past the next EOL byte, or the end of the buffer.  */
end = memchr(beg, eol, buflim - beg);
if (end)
end++;
else
end = buflim;
#ifdef MBS_SUPPORT
/* The walk above did not reach the hit: skip this line.  */
if (mb_cur_max > 1 && bytes_left)
continue;
#endif
/* Back BEG up to the start of the line containing the hit.  */
while (beg > buf && beg[-1] != eol)
--beg;
/* A hit on one of the "exact" must-strings (the lowest indices in the
   set, see kwsmusts) is accepted as a match for the whole line.  */
if (
#ifdef MBS_SUPPORT
!(match_icase && mb_cur_max > 1) &&
#endif
(kwsm.index < kwset_exact_matches))
goto success_in_beg_and_end;
/* Otherwise run the DFA on just this line; no DFA match means the line
   is rejected.  */
if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
continue;
}
else
{
/* No keyword set: run the DFA over the rest of the buffer.  */
#ifdef MBS_SUPPORT
size_t bytes_left = 0;
#endif
size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
if (offset == (size_t) -1)
break;
#ifdef MBS_SUPPORT
if (mb_cur_max > 1 && !using_utf8)
{
/* Same character-wise advance as in the keyword-set path.  */
bytes_left = offset;
while (bytes_left)
{
size_t mlen = mbrlen (beg, bytes_left, &mbs);
if (mlen == (size_t) -1 || mlen == 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
beg++;
bytes_left--;
continue;
}
if (mlen == (size_t) -2)
break;
beg += mlen;
bytes_left -= mlen;
}
}
else
#endif
beg += offset;
/* Narrow [beg, end) to the line containing the DFA hit.  */
end = memchr (beg, eol, buflim - beg);
if (end)
end++;
else
end = buflim;
#ifdef MBS_SUPPORT
if (mb_cur_max > 1 && bytes_left)
continue;
#endif
while (beg > buf && beg[-1] != eol)
--beg;
}
/* dfaexec left BACKREF zero: the line is accepted without running the
   regex matcher.  (Presumably a nonzero value means the DFA result must
   be confirmed by regex -- confirm against dfa.h.)  */
if (!backref)
goto success_in_beg_and_end;
}
else
/* Exact mode: the regexes below see the whole buffer.  */
end = beg + size;
/* Try each compiled regex on [beg, end - 1); the length passed to
   re_search leaves out the final byte of the region.  */
for (i = 0; i < pcount; i++)
{
patterns[i].regexbuf.not_eol = 0;
if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
end - beg - 1, 0,
end - beg - 1, &(patterns[i].regs))))
{
len = patterns[i].regs.end[0] - start;
/* Exact mode without -w: report the regex match itself.  */
if (exact && !match_words)
goto success_in_start_and_len;
/* No -x/-w constraint, or -x and the match spans the entire region.  */
if ((!match_lines && !match_words)
|| (match_lines && len == end - beg - 1))
goto success_in_beg_and_end;
if (match_words)
while (start >= 0)
{
/* Accept when the bytes on either side of the match are not word
   constituents (or the match touches the region boundary).  */
if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
&& (len == end - beg - 1
|| !WCHAR ((unsigned char) beg[start + len])))
goto success_in_beg_and_end;
if (len > 0)
{
/* Try a shorter match anchored at the same START, with the text cut
   one byte short of the previous match end.  */
--len;
patterns[i].regexbuf.not_eol = 1;
len = re_match (&(patterns[i].regexbuf), beg,
start + len, start,
&(patterns[i].regs));
}
if (len <= 0)
{
/* No shorter match here: search again from the next start position,
   unless the end of the region has been reached.  */
if (start == end - beg - 1)
break;
++start;
patterns[i].regexbuf.not_eol = 0;
start = re_search (&(patterns[i].regexbuf), beg,
end - beg - 1,
start, end - beg - 1 - start,
&(patterns[i].regs));
len = patterns[i].regs.end[0] - start;
}
}
}
}
}
failure:
return (size_t) -1;
/* Report the region [beg, end) as offset/length relative to BUF.  */
success_in_beg_and_end:
len = end - beg;
start = beg - buf;
success_in_start_and_len:
*match_size = len;
return start;
}
#ifdef MBS_SUPPORT
/* Set to 1 by Fcompile once the case-insensitive multibyte pattern table
   below has been built; Fexecute then confirms each keyword hit with
   Fimbexec.  */
static int f_i_multibyte;
/* State for case-insensitive fixed-string matching in multibyte locales,
   filled in by Fcompile:
   patterns - array of lower-cased, NUL-terminated wide-character patterns
   count    - number of entries in `patterns'
   maxlen   - length of the longest pattern, in wide characters
   match    - one byte per pattern, used as scratch flags by Fimbexec  */
static struct
{
wchar_t **patterns;
size_t count, maxlen;
unsigned char *match;
} Fimb;
#endif
static void
Fcompile (char const *pattern, size_t size)
{
int mb_cur_max = MB_CUR_MAX;
char const *beg, *lim, *err;
check_utf8 ();
#ifdef MBS_SUPPORT
if (match_icase && mb_cur_max > 1)
{
mbstate_t mbs;
wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
const char *patternend = pattern;
size_t wcsize;
kwset_t fimb_kwset = NULL;
char *starts = NULL;
wchar_t *wcbeg, *wclim;
size_t allocated = 0;
memset (&mbs, '\0', sizeof (mbs));
# ifdef __GNU_LIBRARY__
wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
if (patternend != pattern + size)
wcsize = (size_t) -1;
# else
{
char *patterncopy = xmalloc (size + 1);
memcpy (patterncopy, pattern, size);
patterncopy[size] = '\0';
patternend = patterncopy;
wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
if (patternend != patterncopy + size)
wcsize = (size_t) -1;
free (patterncopy);
}
# endif
if (wcsize + 2 <= 2)
{
fimb_fail:
free (wcpattern);
free (starts);
if (fimb_kwset)
kwsfree (fimb_kwset);
free (Fimb.patterns);
Fimb.patterns = NULL;
}
else
{
if (!(fimb_kwset = kwsalloc (NULL)))
error (2, 0, _("memory exhausted"));
starts = xmalloc (mb_cur_max * 3);
wcbeg = wcpattern;
do
{
int i;
size_t wclen;
if (Fimb.count >= allocated)
{
if (allocated == 0)
allocated = 128;
else
allocated *= 2;
Fimb.patterns = xrealloc (Fimb.patterns,
sizeof (wchar_t *) * allocated);
}
Fimb.patterns[Fimb.count++] = wcbeg;
for (wclim = wcbeg;
wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
*wclim = towlower (*wclim);
*wclim = L'\0';
wclen = wclim - wcbeg;
if (wclen > Fimb.maxlen)
Fimb.maxlen = wclen;
if (wclen > 3)
wclen = 3;
if (wclen == 0)
{
if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
error (2, 0, err);
}
else
for (i = 0; i < (1 << wclen); i++)
{
char *p = starts;
int j, k;
for (j = 0; j < wclen; ++j)
{
wchar_t wc = wcbeg[j];
if (i & (1 << j))
{
wc = towupper (wc);
if (wc == wcbeg[j])
continue;
}
k = wctomb (p, wc);
if (k <= 0)
goto fimb_fail;
p += k;
}
if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
error (2, 0, err);
}
if (wclim < wcpattern + wcsize)
++wclim;
wcbeg = wclim;
}
while (wcbeg < wcpattern + wcsize);
f_i_multibyte = 1;
kwset = fimb_kwset;
free (starts);
Fimb.match = xmalloc (Fimb.count);
if ((err = kwsprep (kwset)) != 0)
error (2, 0, err);
return;
}
}
#endif
kwsinit ();
beg = pattern;
do
{
for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
;
if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
error (2, 0, err);
if (lim < pattern + size)
++lim;
beg = lim;
}
while (beg < pattern + size);
if ((err = kwsprep (kwset)) != 0)
error (2, 0, err);
}
#ifdef MBS_SUPPORT
/* Case-insensitively compare the text at BUF (SIZE bytes) against every
   pattern stored in Fimb.patterns, all anchored at BUF.

   The text is decoded one multibyte character at a time and lower-cased;
   Fimb.match[] tracks which patterns still agree with it.  When a
   pattern's terminating L'\0' is reached, the number of bytes consumed so
   far is stored in *PLEN.  If neither EXACT nor match_words is set the
   function returns 0 right away; otherwise it keeps going so that *PLEN
   ends up describing the last pattern to complete.

   Returns 0 if at least one pattern matched, -1 otherwise (also when
   decoding stops at an invalid, incomplete or null character before any
   pattern has completed).  */
static int
Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
{
  size_t consumed = 0;		/* Bytes of BUF decoded so far.  */
  size_t pos = 0;		/* Index of the current wide character.  */
  int status = -1;
  int alive = 1;		/* Some pattern may still match.  */
  mbstate_t state;

  assert (match_icase && f_i_multibyte == 1);
  assert (MB_CUR_MAX > 1);

  memset (&state, '\0', sizeof (state));
  /* Every pattern starts out as a candidate.  */
  memset (Fimb.match, '\1', Fimb.count);

  for (; alive && consumed <= size; pos++)
    {
      wchar_t cur;
      size_t step;
      size_t idx;

      alive = 0;
      if (consumed >= size)
        {
          /* End of text: compare against a terminator so that patterns
             ending exactly here can complete.  */
          step = 1;
          cur = L'\0';
        }
      else
        {
          step = mbrtowc (&cur, buf + consumed, size - consumed, &state);
          if (step == 0 || step == (size_t) -1 || step == (size_t) -2)
            return status;
          cur = towlower (cur);
        }

      for (idx = 0; idx < Fimb.count; idx++)
        {
          wchar_t want;

          if (!Fimb.match[idx])
            continue;

          want = Fimb.patterns[idx][pos];
          if (want == L'\0')
            {
              /* This pattern is fully matched by the bytes before CUR.  */
              *plen = consumed;
              if (!exact && !match_words)
                return 0;
              status = 0;
              Fimb.match[idx] = '\0';
            }
          else if (want == cur)
            alive = 1;
          else
            Fimb.match[idx] = '\0';
        }

      consumed += step;
    }

  return status;
}
#endif
/* Search BUF (SIZE bytes) for any of the fixed strings compiled by
   Fcompile.

   Returns the offset of the match from BUF and stores its length in
   *MATCH_SIZE, or returns -1 (converted to size_t) when nothing matches.

   With EXACT nonzero and match_words clear, the offset and length are
   those of the matched string itself.  With match_lines the string must
   be bounded on both sides by an EOL byte or a buffer boundary.  With
   match_words it must not be adjacent to a word constituent; shorter
   keyword matches inside the current one are tried before moving on.
   Unless the exact string is being reported, the result is widened to the
   whole line containing the match, including its EOL byte if present.

   NOTE(review): `ret_val' and `try' are declared but never used in this
   function.  */
static size_t
Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
register char const *beg, *try, *end;
register size_t len;
char eol = eolbyte;
struct kwsmatch kwsmatch;
size_t ret_val;
#ifdef MBS_SUPPORT
int mb_cur_max = MB_CUR_MAX;
mbstate_t mbs;
memset (&mbs, '\0', sizeof (mbstate_t));
/* Start of the most recent character visited by the mbrlen walks below;
   used to find the character preceding BEG in non-UTF-8 locales.  */
const char *last_char = NULL;
#endif
/* Each iteration looks for the next keyword hit at or after BEG.  */
for (beg = buf; beg <= buf + size; ++beg)
{
size_t offset;
offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
if (offset == (size_t) -1)
goto failure;
#ifdef MBS_SUPPORT
if (mb_cur_max > 1 && !using_utf8)
{
/* Advance BEG over the OFFSET bytes one multibyte character at a time.
   Invalid or null sequences are stepped over a byte at a time; an
   incomplete sequence (-2) stops the walk with bytes_left nonzero, in
   which case this hit is abandoned.  */
size_t bytes_left = offset;
while (bytes_left)
{
size_t mlen = mbrlen (beg, bytes_left, &mbs);
last_char = beg;
if (mlen == (size_t) -1 || mlen == 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
beg++;
bytes_left--;
continue;
}
if (mlen == (size_t) -2)
break;
beg += mlen;
bytes_left -= mlen;
}
if (bytes_left)
continue;
}
else
#endif
beg += offset;
#ifdef MBS_SUPPORT
/* Case-insensitive multibyte mode: the keyword set only holds pattern
   prefixes, so confirm the hit against the full patterns.  Fimbexec
   stores the matched length in kwsmatch.size[0] and returns nonzero on
   failure.  */
if (f_i_multibyte
&& Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
goto next_char;
#endif
len = kwsmatch.size[0];
if (exact && !match_words)
goto success_in_beg_and_len;
if (match_lines)
{
/* The match must start at a line start and end at a line end.  */
if (beg > buf && beg[-1] != eol)
goto next_char;
if (beg + len < buf + size && beg[len] != eol)
goto next_char;
goto success;
}
else if (match_words)
{
/* Loop while there is a non-empty candidate at BEG; each pass either
   succeeds, gives up on this position, or shrinks the candidate.  */
while (len)
{
int word_match = 0;
if (beg > buf)
{
/* Reject the position if the character before BEG is a word
   constituent.  */
#ifdef MBS_SUPPORT
if (mb_cur_max > 1)
{
const char *s;
int mr;
wchar_t pwc;
if (using_utf8)
{
/* Step back over bytes in the range 0x80..0xbf to reach the first byte
   of the preceding character.  */
s = beg - 1;
while (s > buf
&& (unsigned char) *s >= 0x80
&& (unsigned char) *s <= 0xbf)
--s;
}
else
s = last_char;
mr = mbtowc (&pwc, s, beg - s);
if (mr <= 0)
memset (&mbs, '\0', sizeof (mbstate_t));
else if ((iswalnum (pwc) || pwc == L'_')
&& mr == (int) (beg - s))
goto next_char;
}
else
#endif
if (WCHAR ((unsigned char) beg[-1]))
goto next_char;
}
/* The candidate is a word if the character after it is not a word
   constituent (or cannot be decoded, or lies past the buffer end).  */
#ifdef MBS_SUPPORT
if (mb_cur_max > 1)
{
wchar_t nwc;
int mr;
mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
if (mr <= 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
word_match = 1;
}
else if (!iswalnum (nwc) && nwc != L'_')
word_match = 1;
}
else
#endif
if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
word_match = 1;
if (word_match)
{
if (!exact)
goto success;
else
goto success_in_beg_and_len;
}
if (len > 0)
{
/* Retry with a window one byte shorter than the current match,
   looking for another keyword inside it.  */
--len;
offset = kwsexec (kwset, beg, len, &kwsmatch);
if (offset == -1)
goto next_char;
#ifdef MBS_SUPPORT
if (mb_cur_max > 1 && !using_utf8)
{
/* Same character-wise advance as at the top of the outer loop.  */
size_t bytes_left = offset;
while (bytes_left)
{
size_t mlen = mbrlen (beg, bytes_left, &mbs);
last_char = beg;
if (mlen == (size_t) -1 || mlen == 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
beg++;
bytes_left--;
continue;
}
if (mlen == (size_t) -2)
{
break;
}
beg += mlen;
bytes_left -= mlen;
}
if (bytes_left)
{
memset (&mbs, '\0', sizeof (mbstate_t));
goto next_char;
}
}
else
#endif
beg += offset;
#ifdef MBS_SUPPORT
if (f_i_multibyte
&& Fimbexec (beg, len - offset, &kwsmatch.size[0],
exact))
goto next_char;
#endif
len = kwsmatch.size[0];
}
}
}
else
goto success;
next_char:;
#ifdef MBS_SUPPORT
/* Move BEG to the last byte of the character it points at, so that the
   ++beg of the outer loop lands on the start of the next character.  */
if (mb_cur_max > 1)
{
if (using_utf8)
{
/* The lead byte value determines how many trailing bytes to skip.  */
unsigned char c = *beg;
if (c >= 0xc2)
{
if (c < 0xe0)
++beg;
else if (c < 0xf0)
beg += 2;
else if (c < 0xf8)
beg += 3;
else if (c < 0xfc)
beg += 4;
else if (c < 0xfe)
beg += 5;
}
}
else
{
size_t l = mbrlen (beg, buf + size - beg, &mbs);
last_char = beg;
/* l + 2 >= 2 excludes only the error returns (size_t) -1 and -2.  */
if (l + 2 >= 2)
beg += l - 1;
else
memset (&mbs, '\0', sizeof (mbstate_t));
}
}
#endif
}
failure:
return -1;
/* Widen the match at [beg, beg + len) to the whole line around it.  */
success:
#ifdef MBS_SUPPORT
if (mb_cur_max > 1 && !using_utf8)
{
/* Scan forward character by character so that an EOL byte value inside
   a multibyte character is not taken for a line end.  */
end = beg + len;
while (end < buf + size)
{
size_t mlen = mbrlen (end, buf + size - end, &mbs);
if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
mlen = 1;
}
if (mlen == 1 && *end == eol)
break;
end += mlen;
}
}
else
#endif
end = memchr (beg + len, eol, (buf + size) - (beg + len));
/* Include the EOL byte if one was found; otherwise stop at the end of
   the buffer.  */
if (end)
end++;
else
end = buf+size;
/* Back BEG up to the start of the line.  */
while (buf < beg && beg[-1] != eol)
--beg;
len = end - beg;
success_in_beg_and_len:
*match_size = len;
return beg - buf;
}
#if HAVE_LIBPCRE
/* Compiled Perl-compatible pattern, set by Pcompile via pcre_compile()
   and used by Pexecute.  */
static pcre *cre;
/* Result of pcre_study() on CRE, passed to pcre_exec() by Pexecute.  */
static pcre_extra *extra;
#endif
/* Compile PATTERN (SIZE bytes) as a Perl-compatible regular expression
   into the file-scope CRE / EXTRA pair.

   Without HAVE_LIBPCRE this only reports that -P is unsupported, which is
   fatal.  Otherwise:
   - a non-'\n' eolbyte is rejected (fatal);
   - the pattern is copied into a scratch buffer with every NUL byte
     rewritten as the four characters \000, since pcre_compile() takes a
     NUL-terminated string;
   - with match_lines the copy is wrapped as ^(...)$, otherwise with
     match_words as \b(...)\b.  match_lines takes precedence for both the
     prefix and the suffix, so the two always pair up;
   - the result is compiled with PCRE_MULTILINE (plus PCRE_CASELESS when
     match_icase is set) and then studied.

   PCRE error text is printed through a literal "%s" format so it cannot
   be interpreted as a format string.  */
static void
Pcompile (char const *pattern, size_t size)
{
#if !HAVE_LIBPCRE
  error (2, 0, _("The -P option is not supported"));
#else
  int e;
  char const *ep;
  /* Worst case: every byte is a NUL expanding to 4 characters, plus a
     3-character prefix, a 3-character suffix and the terminator.  */
  char *re = xmalloc (4 * size + 7);
  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
  char const *patlim = pattern + size;
  char *n = re;
  char const *p;
  char const *pnul;

  if (eolbyte != '\n')
    error (2, 0, _("The -P and -z options cannot be combined"));

  /* Later strcpy calls overwrite earlier ones, so the last applicable
     option wins; keep the order identical for prefix and suffix.  */
  *n = '\0';
  if (match_words)
    strcpy (n, "\\b(");
  if (match_lines)
    strcpy (n, "^(");
  n += strlen (n);

  /* Copy the pattern, replacing each NUL byte with \000.  */
  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
    {
      memcpy (n, p, pnul - p);
      n += pnul - p;
      /* Count the backslashes immediately before the NUL.  If the count
         is odd, the last one escapes the NUL itself: drop it so that the
         pair becomes \000 rather than a backslash followed by \000.  */
      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
        continue;
      n -= (pnul - p) & 1;
      strcpy (n, "\\000");
      n += 4;
    }
  memcpy (n, p, patlim - p);
  n += patlim - p;

  *n = '\0';
  if (match_words)
    strcpy (n, ")\\b");
  if (match_lines)
    strcpy (n, ")$");

  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
  if (!cre)
    error (2, 0, "%s", ep);

  extra = pcre_study (cre, 0, &ep);
  if (ep)
    error (2, 0, "%s", ep);

  free (re);
#endif
}
/* Search BUF (SIZE bytes) with the pattern compiled by Pcompile.

   Returns the offset of the match from BUF and stores its length in
   *MATCH_SIZE, or returns -1 (converted to size_t) when there is no
   match.  With EXACT zero the result is widened to the whole line
   containing the match, including its EOL byte if present; otherwise the
   matched text itself is reported.

   Without HAVE_LIBPCRE the function aborts.  PCRE_ERROR_NOMEMORY is
   fatal, and any other negative pcre_exec() result aborts.  A result of 0
   is treated as a match: it means the offsets vector was too small for
   all captured substrings, but the overall match offsets in SUB[0] and
   SUB[1], the only ones used here, are still filled in.  */
static size_t
Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
#if !HAVE_LIBPCRE
  abort ();
  return -1;
#else
  int sub[300];
  int e = pcre_exec (cre, extra, buf, size, 0, 0,
                     sub, sizeof sub / sizeof *sub);

  if (e < 0)
    {
      switch (e)
        {
        case PCRE_ERROR_NOMATCH:
          return -1;

        case PCRE_ERROR_NOMEMORY:
          error (2, 0, _("Memory exhausted"));
          /* error() with a nonzero status does not return; fall through
             to abort() in case it ever does.  */

        default:
          abort ();
        }
    }
  else
    {
      char const *beg = buf + sub[0];
      char const *end = buf + sub[1];
      char const *buflim = buf + size;
      char eol = eolbyte;

      if (!exact)
        {
          /* Extend END past the next EOL byte (or to the buffer end) and
             pull BEG back to the start of its line.  */
          end = memchr (end, eol, buflim - end);
          if (end)
            end++;
          else
            end = buflim;
          while (buf < beg && beg[-1] != eol)
            --beg;
        }

      *match_size = end - beg;
      return beg - buf;
    }
#endif
}
/* Table of available matchers: each entry pairs a matcher name with the
   function that compiles a pattern and the function that searches a
   buffer.  "default" and "grep" share Gcompile; "egrep" and "awk" share
   Ecompile (which selects the syntax from the global `matcher' name); all
   four use EGexecute.  The table ends with an entry whose name is empty
   and whose function pointers are null.  */
struct matcher const matchers[] = {
{ "default", Gcompile, EGexecute },
{ "grep", Gcompile, EGexecute },
{ "egrep", Ecompile, EGexecute },
{ "awk", Ecompile, EGexecute },
{ "fgrep", Fcompile, Fexecute },
{ "perl", Pcompile, Pexecute },
{ "", 0, 0 },
};