/* regcmp.c */


/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*           Copyright (c) 1985-2007 AT&T Knowledge Ventures            *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                      by AT&T Knowledge Ventures                      *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                   Phong Vo <kpv@research.att.com>                    *
*                                                                      *
***********************************************************************/
#pragma prototyped
/*
 * regcmp implementation
 */

#include <ast.h>
#include <libgen.h>
#include <regex.h>
#include <align.h>

/*
 * INC	amount regcmp() adds to its allocation size on each attempt
 * TOT	regcmp() fails once the allocation size would exceed this
 * SUB	number of substring slots, one per digit in the $0..$9 notation
 */

#define INC		(2*1024)
#define TOT		(16*1024)
#define SUB		10

/*
 * compiled pattern handle returned by regcmp() and consumed by regex()
 *
 * buf must remain the last member: regcmp() allocates the struct with
 * extra room so that the arena that starts at buf continues past the
 * declared end of the struct
 */

typedef struct
{
	char*		cur;		/* next unused byte in the buf arena	*/
	regex_t		re;		/* regcomp() result			*/
	unsigned char	sub[SUB];	/* slot N: 1 + group number tagged by $N, 0 if unused */
	int		nsub;		/* number of sub[] slots in use		*/
	size_t		size;		/* number of bytes available at buf	*/
	char		buf[ALIGN_BOUND2]; /* start of the arena block() carves up */
} Regex_t;

/*
 * __loc1 is assigned by regex() on a successful match: it points at the
 * start of the matched text within the subject string
 */

__DEFINE__(char*, __loc1, 0);

/*
 * allocation callback handed to regalloc() by regcmp()
 *
 * handle is the Regex_t being compiled; memory is carved sequentially
 * out of its buf arena in multiples of ALIGN_BOUND2
 *
 * only fresh allocations (data == 0) are served: a request that passes
 * an existing block, or one that does not fit in what is left of the
 * arena, returns 0
 */

static void*
block(void* handle, void* data, size_t size)
{
	Regex_t*	arena = (Regex_t*)handle;
	char*		chunk;

	if (data)
		return 0;
	size = roundof(size, ALIGN_BOUND2);
	if (size > (arena->buf + arena->size - arena->cur))
		return 0;
	chunk = arena->cur;
	arena->cur += size;
	return (void*)chunk;
}

/*
 * compile the concatenation of pattern and the char* arguments that
 * follow it (the argument list ends with a null pointer)
 *
 * the pattern text is copied to a string stream and handed to regcomp();
 * during the copy the (...)$N notation is recognized: the $N suffix is
 * dropped from the copied text and slot N of the sub[] table is set to
 * 1 + the number of the parenthesized group it followed
 *
 * returns the compiled Regex_t cast to char*; it is a single allocated
 * block (the failure paths below release it with free())
 * returns 0 if pattern is null or empty, if memory runs out, or if
 * regcomp() reports an error
 */

char*
regcmp(const char* pattern, ...)
{
	register char*		s;
	register Regex_t*	re;
	register size_t		n;
	register int		c;
	register int		p;
	int			b;
	int			i;
	int			j;
	int			nsub;
	register Sfio_t*	sp;
	Regex_t*		t;
	unsigned char		paren[128];
	unsigned char		sub[SUB];
	va_list			ap;

	/*
	 * validate before va_start() so that every va_start()
	 * is paired with a va_end()
	 */

	if (!pattern || !*pattern || !(sp = sfstropen()))
		return 0;
	va_start(ap, pattern);
	memset(paren, 0, sizeof(paren));
	memset(sub, 0, sizeof(sub));
	n = 0;		/* number of '(' seen so far		*/
	p = -1;		/* top of the paren[] stack		*/
	b = 0;		/* non-zero inside a [...] expression	*/
	nsub = 0;	/* 1 + highest $N seen			*/
	s = (char*)pattern;
	do
	{
		while ((c = *s++))
		{
			if (c == '\\')
			{
				/*
				 * copy the escape and the character it
				 * protects without interpreting either
				 */

				sfputc(sp, c);
				if (!(c = *s++))
					break;
			}
			else if (b)
			{
				if (c == ']')
					b = 0;
			}
			else if (c == '[')
			{
				/*
				 * a ']' that appears first in the bracket
				 * expression (optionally after '^') is
				 * literal and does not close it
				 */

				b = 1;
				if (*s == '^')
				{
					sfputc(sp, c);
					c = *s++;
				}
				if (*s == ']')
				{
					sfputc(sp, c);
					c = *s++;
				}
			}
			else if (c == '(')
			{
				/*
				 * the (int) cast keeps the comparison signed:
				 * elementsof() is presumably sizeof based and
				 * hence unsigned, in which case p == -1 would
				 * be converted to a huge unsigned value and
				 * the test would fail
				 */

				if (p < (int)(elementsof(paren) - 1))
					p++;
				paren[p] = ++n;
			}
			else if (c == ')' && p >= 0)
			{
				/*
				 * locate the innermost group that is
				 * still open
				 */

				for (i = p; i > 0; i--)
					if (paren[i])
						break;
				if (*s == '$' && (j = *(s + 1)) >= '0' && j <= '9')
				{
					s += 2;
					j -= '0';
					if (nsub <= j)
						nsub = j + 1;
					sub[j] = paren[i] + 1;
				}
				paren[i] = 0;
			}
			sfputc(sp, c);
		}
	} while ((s = va_arg(ap, char*)));
	va_end(ap);
	if (!(s = sfstruse(sp)))
	{
		sfstrclose(sp);
		return 0;
	}

	/*
	 * compile into a single block that grows by INC on each
	 * REG_ESPACE until TOT is exceeded
	 */

	re = 0;
	n = 0;
	do
	{
		/*
		 * the result goes to a temporary so that the previous
		 * block is still reachable, and freed, if newof() fails
		 */

		if ((n += INC) > TOT || !(t = newof(re, Regex_t, 0, n)))
		{
			if (re)
				free(re);
			sfstrclose(sp);
			return 0;
		}
		re = t;
		re->cur = re->buf;
		re->size = n + ALIGN_BOUND2 - sizeof(Regex_t);
		regalloc(re, block, REG_NOFREE);
		c = regcomp(&re->re, s, REG_EXTENDED|REG_LENIENT|REG_NULL);
		regalloc(NiL, NiL, 0);
	} while (c == REG_ESPACE);
	sfstrclose(sp);
	if (c)
	{
		free(re);
		return 0;
	}

	/*
	 * nsub can be as large as SUB, so copy exactly nsub slots:
	 * one more would run past the end of both sub[] arrays
	 */

	if ((re->nsub = nsub))
		memcpy(re->sub, sub, nsub * sizeof(sub[0]));
	return (char*)re;
}

/*
 * match subject against handle, a value returned by regcmp()
 *
 * the variable arguments are char* output buffers, one per substring
 * slot recorded by regcmp() (re->nsub of them are read); the text
 * matched by the group tagged $N is copied, null terminated, into
 * buffer N -- the caller must supply buffers that are large enough
 *
 * a slot whose group did not take part in the match, or whose group
 * number is beyond what match[] can hold, receives the empty string;
 * a null buffer pointer is skipped
 *
 * returns a pointer just past the matched text in subject and sets
 * __loc1 to the start of the matched text
 * returns 0 if handle or subject is null or if regexec() does not match
 */

char*
regex(const char* handle, const char* subject, ...)
{
	register Regex_t*	re;
	register int		n;
	register int		i;
	register int		k;
	char*			sub[SUB + 1];
	regmatch_t		match[SUB + 1];
	va_list			ap;

	/*
	 * validate before va_start() so that every va_start()
	 * is paired with a va_end()
	 */

	if (!(re = (Regex_t*)handle) || !subject)
		return 0;
	va_start(ap, subject);
	for (n = 0; n < re->nsub; n++)
		sub[n] = va_arg(ap, char*);
	va_end(ap);
	if (regexec(&re->re, subject, SUB + 1, match, 0))
		return 0;
	for (n = 0; n < re->nsub; n++)
	{
		if (!(i = re->sub[n]) || !sub[n])
			continue;
		i--;

		/*
		 * i is a group number taken from an unsigned char and
		 * may exceed the SUB + 1 entries filled in by regexec();
		 * rm_so < 0 marks a group that did not participate
		 */

		if (i > SUB || match[i].rm_so < 0)
			k = 0;
		else
		{
			k = match[i].rm_eo - match[i].rm_so;
			memcpy(sub[n], subject + match[i].rm_so, k);
		}
		*(sub[n] + k) = 0;
	}
	__loc1 = (char*)subject + match[0].rm_so;
	return (char*)subject + match[0].rm_eo;
}