strmatch.c   [plain text]


/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*           Copyright (c) 1985-2007 AT&T Knowledge Ventures            *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                      by AT&T Knowledge Ventures                      *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                   Phong Vo <kpv@research.att.com>                    *
*                                                                      *
***********************************************************************/
#pragma prototyped

/*
 * D. G. Korn
 * G. S. Fowler
 * AT&T Research
 *
 * match shell file patterns -- derived from Bourne and Korn shell gmatch()
 *
 *	sh pattern	egrep RE	description
 *	----------	--------	-----------
 *	*		.*		0 or more chars
 *	?		.		any single char
 *	[.]		[.]		char class
 *	[!.]		[^.]		negated char class
 *	[[:.:]]		[[:.:]]		ctype class
 *	[[=.=]]		[[=.=]]		equivalence class
 *	[[...]]		[[...]]		collation element
 *	*(.)		(.)*		0 or more of
 *	+(.)		(.)+		1 or more of
 *	?(.)		(.)?		0 or 1 of
 *	(.)		(.)		1 of
 *	@(.)		(.)		1 of
 *	a|b		a|b		a or b
 *	\#				() subgroup back reference [1-9]
 *	a&b				a and b
 *	!(.)				none of
 *
 * \ used to escape metacharacters
 *
 *	*, ?, (, |, &, ), [, \ must be \'d outside of [...]
 *	only ] must be \'d inside [...]
 *
 * BUG: unbalanced ) terminates top level pattern
 */

#include <ast.h>
#include <ctype.h>
#include <hashkey.h>

#ifndef	isblank
#define	isblank(x)	((x)==' '||(x)=='\t')
#endif

#ifndef isgraph
#define	isgraph(x)	(isprint(x)&&!isblank(x))
#endif

#define MAXGROUP	10

typedef struct
{
	char*		beg[MAXGROUP];
	char*		end[MAXGROUP];
	char*		next_s;
	short		groups;
} Group_t;

typedef struct
{
	Group_t		current;
	Group_t		best;
	char*		last_s;
	char*		next_p;
} Match_t;

#define mbgetchar(p)	(*p++)

#ifndef isxdigit
#define isxdigit(c)	((c)>='0'&&(c)<='9'||(c)>='a'&&(c)<='f'||(c)>='A'&&(c)<='F')
#endif

#define getsource(s,e)	(((s)>=(e))?0:mbgetchar(s))

#define COLL_MAX	3

/*
 * gobble chars up to <sub> or ) keeping track of (...) and [...]
 * sub must be one of { '|', '&', 0 }
 * 0 returned if s runs out
 */

static char*
gobble(Match_t* mp, register char* s, register int sub, int* g, int clear)
{
	register int	p = 0;
	register char*	b = 0;
	int		c = 0;
	int		n;

	for (;;)
		switch (mbgetchar(s))
		{
		case '\\':
			if (mbgetchar(s))
				break;
			/*FALLTHROUGH*/
		case 0:
			return 0;
		case '[':
			if (!b)
			{
				if (*s == '!')
					mbgetchar(s);
				b = s;
			}
			else if (*s == '.' || *s == '=' || *s == ':')
				c = *s;
			break;
		case ']':
			if (b)
			{
				if (*(s - 2) == c)
					c = 0;
				else if (b != (s - 1))
					b = 0;
			}
			break;
		case '(':
			if (!b)
			{
				p++;
				n = (*g)++;
				if (clear)
				{
					if (!sub)
						n++;
					if (n < MAXGROUP)
						mp->current.beg[n] = mp->current.end[n] = 0;
				}
			}
			break;
		case ')':
			if (!b && p-- <= 0)
				return sub ? 0 : s;
			break;
		case '|':
			if (!b && !p && sub == '|')
				return s;
			break;
		}
}

static int	grpmatch(Match_t*, int, char*, register char*, char*, int);

/*
 * match a single pattern
 * e is the end (0) of the substring in s
 * r marks the start of a repeated subgroup pattern
 */

static int
onematch(Match_t* mp, int g, char* s, char* p, char* e, char* r, int flags)
{
	register int 	pc;
	register int 	sc;
	register int	n;
	register int	icase;
	char*		olds;
	char*		oldp;

	icase = flags & STR_ICASE;
	do
	{
		olds = s;
		sc = getsource(s, e);
		if (icase && isupper(sc))
			sc = tolower(sc);
		oldp = p;
		switch (pc = mbgetchar(p))
		{
		case '(':
		case '*':
		case '?':
		case '+':
		case '@':
		case '!':
			if (pc == '(' || *p == '(')
			{
				char*	subp;
				int	oldg;

				s = olds;
				subp = p + (pc != '(');
				oldg = g;
				n = ++g;
				if (g < MAXGROUP && (!r || g > mp->current.groups))
					mp->current.beg[g] = mp->current.end[g] = 0;
				if (!(p = gobble(mp, subp, 0, &g, !r)))
					return 0;
				if (pc == '*' || pc == '?' || pc == '+' && oldp == r)
				{
					if (onematch(mp, g, s, p, e, NiL, flags))
						return 1;
					if (!sc || !getsource(s, e))
					{
						mp->current.groups = oldg;
						return 0;
					}
				}
				if (pc == '*' || pc == '+')
				{
					p = oldp;
					sc = n - 1;
				}
				else
					sc = g;
				pc = (pc != '!');
				do
				{
					if (grpmatch(mp, n, olds, subp, s, flags) == pc)
					{
						if (n < MAXGROUP)
						{
							if (!mp->current.beg[n] || mp->current.beg[n] > olds)
								mp->current.beg[n] = olds;
							if (s > mp->current.end[n])
								mp->current.end[n] = s;
						}
						if (onematch(mp, sc, s, p, e, oldp, flags))
						{
							if (p == oldp && n < MAXGROUP)
							{
								if (!mp->current.beg[n] || mp->current.beg[n] > olds)
									mp->current.beg[n] = olds;
								if (s > mp->current.end[n])
									mp->current.end[n] = s;
							}
							return 1;
						}
					}
				} while (s < e && mbgetchar(s));
				mp->current.groups = oldg;
				return 0;
			}
			else if (pc == '*')
			{
				/*
				 * several stars are the same as one
				 */

				while (*p == '*' && *(p + 1) != '(')
					p++;
				oldp = p;
				switch (pc = mbgetchar(p))
				{
				case '@':
				case '!':
				case '+':
					n = *p == '(';
					break;
				case '(':
				case '[':
				case '?':
				case '*':
					n = 1;
					break;
				case 0:
				case '|':
				case '&':
				case ')':
					mp->current.next_s = (flags & STR_MAXIMAL) ? e : olds;
					mp->next_p = oldp;
					mp->current.groups = g;
					if (!pc && (!mp->best.next_s || (flags & STR_MAXIMAL) && mp->current.next_s > mp->best.next_s || !(flags & STR_MAXIMAL) && mp->current.next_s < mp->best.next_s))
						mp->best = mp->current;
					return 1;
				case '\\':
					if (!(pc = mbgetchar(p)))
						return 0;
					if (pc >= '0' && pc <= '9')
					{
						n = pc - '0';
						if (n <= g && mp->current.beg[n])
							pc = *mp->current.beg[n];
					}
					/*FALLTHROUGH*/
				default:
					if (icase && isupper(pc))
						pc = tolower(pc);
					n = 0;
					break;
				}
				p = oldp;
				for (;;)
				{
					if ((n || pc == sc) && onematch(mp, g, olds, p, e, NiL, flags))
						return 1;
					if (!sc)
						return 0;
					olds = s;
					sc = getsource(s, e);
					if ((flags & STR_ICASE) && isupper(sc))
						sc = tolower(sc);
				}
			}
			else if (pc != '?' && pc != sc)
				return 0;
			break;
		case 0:
			if (!(flags & STR_MAXIMAL))
				sc = 0;
			/*FALLTHROUGH*/
		case '|':
		case '&':
		case ')':
			if (!sc)
			{
				mp->current.next_s = olds;
				mp->next_p = oldp;
				mp->current.groups = g;
			}
			if (!pc && (!mp->best.next_s || (flags & STR_MAXIMAL) && olds > mp->best.next_s || !(flags & STR_MAXIMAL) && olds < mp->best.next_s))
			{
				mp->best = mp->current;
				mp->best.next_s = olds;
				mp->best.groups = g;
			}
			return !sc;
		case '[':
			{
				/*UNDENT...*/

	int	invert;
	int	x;
	int	ok = 0;
	char*	range;

	if (!sc)
		return 0;
	range = 0;
	n = 0;
	if (invert = *p == '!')
		p++;
	for (;;)
	{
		oldp = p;
		if (!(pc = mbgetchar(p)))
			return 0;
		else if (pc == '[' && (*p == ':' || *p == '=' || *p == '.'))
		{
			x = 0;
			n = mbgetchar(p);
			oldp = p;
			for (;;)
			{
				if (!(pc = mbgetchar(p)))
					return 0;
				if (pc == n && *p == ']')
					break;
				x++;
			}
			mbgetchar(p);
			if (ok)
				/*NOP*/;
			else if (n == ':')
			{
				switch (HASHNKEY5(x, oldp[0], oldp[1], oldp[2], oldp[3], oldp[4]))
				{
				case HASHNKEY5(5,'a','l','n','u','m'):
					if (isalnum(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'a','l','p','h','a'):
					if (isalpha(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'b','l','a','n','k'):
					if (isblank(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'c','n','t','r','l'):
					if (iscntrl(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'d','i','g','i','t'):
					if (isdigit(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'g','r','a','p','h'):
					if (isgraph(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'l','o','w','e','r'):
					if (islower(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'p','r','i','n','t'):
					if (isprint(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'p','u','n','c','t'):
					if (ispunct(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'s','p','a','c','e'):
					if (isspace(sc))
						ok = 1;
					break;
				case HASHNKEY5(5,'u','p','p','e','r'):
					if (icase ? islower(sc) : isupper(sc))
						ok = 1;
					break;
				case HASHNKEY5(6,'x','d','i','g','i'):
					if (oldp[5] == 't' && isxdigit(sc))
						ok = 1;
					break;
				}
			}
			else if (range)
				goto getrange;
			else if (*p == '-' && *(p + 1) != ']')
			{
				mbgetchar(p);
				range = oldp;
			}
			else if (isalpha(*oldp) && isalpha(*olds) && tolower(*oldp) == tolower(*olds) || sc == mbgetchar(oldp))
				ok = 1;
			n = 1;
		}
		else if (pc == ']' && n)
		{
			if (ok != invert)
				break;
			return 0;
		}
		else if (pc == '\\' && (oldp = p, !(pc = mbgetchar(p))))
			return 0;
		else if (ok)
			/*NOP*/;
		else if (range)
		{
		getrange:
			if (icase && isupper(pc))
				pc = tolower(pc);
			x = mbgetchar(range);
			if (icase && isupper(x))
				x = tolower(x);
			if (sc == x || sc == pc || sc > x && sc < pc)
				ok = 1;
			if (*p == '-' && *(p + 1) != ']')
			{
				mbgetchar(p);
				range = oldp;
			}
			else
				range = 0;
			n = 1;
		}
		else if (*p == '-' && *(p + 1) != ']')
		{
			mbgetchar(p);
			range = oldp;
			n = 1;
		}
		else
		{
			if (icase && isupper(pc))
				pc = tolower(pc);
			if (sc == pc)
				ok = 1;
			n = pc;
		}
	}

				/*...INDENT*/
			}
			break;
		case '\\':
			if (!(pc = mbgetchar(p)))
				return 0;
			if (pc >= '0' && pc <= '9')
			{
				n = pc - '0';
				if (n <= g && (oldp = mp->current.beg[n]))
				{
					while (oldp < mp->current.end[n])
						if (!*olds || *olds++ != *oldp++)
							return 0;
					s = olds;
					break;
				}
			}
			/*FALLTHROUGH*/
		default:
			if (icase && isupper(pc))
				pc = tolower(pc);
			if (pc != sc)
				return 0;
			break;
		}
	} while (sc);
	return 0;
}

/*
 * match any pattern in a group
 * | and & subgroups are parsed here
 */

static int
grpmatch(Match_t* mp, int g, char* s, register char* p, char* e, int flags)
{
	register char*	a;

	do
	{
		for (a = p; onematch(mp, g, s, a, e, NiL, flags); a++)
			if (*(a = mp->next_p) != '&')
				return 1;
	} while (p = gobble(mp, p, '|', &g, 1));
	return 0;
}

/*
 * subgroup match
 * 0 returned if no match
 * otherwise number of subgroups matched returned
 * match group begin offsets are even elements of sub
 * match group end offsets are odd elements of sub
 * the matched string is from s+sub[0] up to but not
 * including s+sub[1]
 */

int
strgrpmatch(const char* b, const char* p, int* sub, int n, int flags)
{
	register int	i;
	register char*	s;
	char*		e;
	Match_t		match;

	s = (char*)b;
	match.last_s = e = s + strlen(s);
	for (;;)
	{
		match.best.next_s = 0;
		match.current.groups = 0;
		if ((i = grpmatch(&match, 0, s, (char*)p, e, flags)) || match.best.next_s)
		{
			if (!i)
				match.current = match.best;
			match.current.groups++;
			match.current.end[0] = match.current.next_s;
			break;
		}
		if ((flags & STR_LEFT) || s >= e)
			return 0;
		s++;
	}
	if ((flags & STR_RIGHT) && match.current.next_s != e)
		return 0;
	if (!sub)
		return 1;
	match.current.beg[0] = s;
	s = (char*)b;
	if (n > match.current.groups)
		n = match.current.groups;
	for (i = 0; i < n; i++)
	{
		sub[i * 2] = match.current.end[i] ? match.current.beg[i] - s : 0;
		sub[i * 2 + 1] = match.current.end[i] ? match.current.end[i] - s : 0;
	}
	return n;
}

/*
 * compare the string s with the shell pattern p
 * returns 1 for match 0 otherwise
 */

int
strmatch(const char* s, const char* p)
{
	return strgrpmatch(s, p, NiL, 0, STR_MAXIMAL|STR_LEFT|STR_RIGHT);
}