reo.c   [plain text]


/*
 * Copyright (c) 2002 by The XFree86 Project, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Except as contained in this notice, the name of the XFree86 Project shall
 * not be used in advertising or otherwise to promote the sale, use or other
 * dealings in this Software without prior written authorization from the
 * XFree86 Project.
 *
 * Author: Paulo César Pereira de Andrade
 */

/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.8 2002/09/29 02:55:01 paulo Exp $ */

#include "rep.h"

/*
 *  This file is a placeholder to add code to analyse and optimize the
 * intermediate data structure generated in rep.c.
 *  Character ranges are optimized while being generated.
 */

/*
 * Types
 */
typedef struct _orec_inf {
    rec_alt *alt;			/* Main alternatives list */
    rec_grp *grp;			/* Current group pointer */
    int flags;
    int ecode;
} orec_inf;

/*
 * Prototypes
 */
static int orec_alt(orec_inf*, rec_alt*);
static int orec_pat(orec_inf*, rec_pat*);
static int orec_grp(orec_inf*, rec_grp*);
static int orec_pat_bad_rpt(orec_inf*, rec_pat*);
static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*);
static int orec_pat_rng(orec_inf*, rec_pat*);
static int orec_pat_cse(orec_inf*, rec_pat*);
static int orec_pat_cse_can(orec_inf*, rec_pat*);
static int orec_str_list(orec_inf*, rec_alt*, int, int);

/*
 * Initialization
 */
extern unsigned char re__alnum[256];
extern unsigned char re__odigit[256];
extern unsigned char re__ddigit[256];
extern unsigned char re__xdigit[256];
extern unsigned char re__control[256];

/*
 * Implementation
 */
int
orec_comp(rec_alt *alt, int flags)
{
    orec_inf inf;

    inf.alt = alt;
    inf.grp = NULL;
    inf.flags = flags;
    inf.ecode = 0;

    orec_alt(&inf, alt);

    return (inf.ecode);
}

void
orec_free_stl(rec_stl *stl)
{
    int i;

    for (i = 0; i < stl->nstrs; i++) {
	if (stl->lens[i] > 2)
	    free(stl->strs[i]);
    }

    free(stl->lens);
    free(stl->strs);
    free(stl);
}


static int
orec_alt(orec_inf *inf, rec_alt *alt)
{
    if (alt) {
	rec_alt *ptr = alt;
	int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0;

	/* Check if can build a string list */
	if (ptr->next) {
	    /* If more than one alternative */
	    while (ptr && (str || cstr)) {
		if (ptr->pat == NULL || ptr->pat->rep != NULL) {
		    cstr = str = 0;
		    break;
		}
		if ((inf->flags & RE_ICASE)) {
		    if (!(ret = orec_pat_cse_can(inf, ptr->pat))) {
			cstr = str = 0;
			break;
		    }
		    if (ret == 1)
			++lits;
		    else if (ret == 2)
			++clits;
		}
		else if (ptr->pat->next == NULL) {
		    if (ptr->pat->type != Rep_String) {
			if (ptr->pat->type != Rep_Literal) {
			    str = 0;
			    if (ptr->pat->type != Rep_CaseString) {
				if (ptr->pat->type != Rep_CaseLiteral)
				    cstr = 0;
				else
				    ++clits;
			    }
			    else if (strlen((char*)ptr->pat->data.str) >= 255)
				str = cstr = 0;
			}
			else
			    ++lits;
		    }
		    else if (strlen((char*)ptr->pat->data.str) >= 255)
			str = cstr = 0;
		}
		else {
		    str = cstr = 0;
		    break;
		}
		if (++count >= 255)
		    str = cstr = 0;
		ptr = ptr->next;
	    }

	    if (str || cstr) {
		if (inf->flags & RE_ICASE) {
		    for (ptr = alt; ptr; ptr = ptr->next) {
			if (orec_pat_cse(inf, ptr->pat))
			    return (inf->ecode);
		    }
		    str = 0;
		}
		return (orec_str_list(inf, alt, str, count));
	    }
	}
	else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) {
	    /* If the toplevel single alternative */
	    switch (alt->pat->type) {
		/*  One of these will always be true for RE_NOSPEC,
		 * but can also be optimized for simple patterns */
		case Rep_Literal:
		    alt->pat->type = Rep_SearchLiteral;
		    break;
		case Rep_CaseLiteral:
		    alt->pat->type = Rep_SearchCaseLiteral;
		    break;
		case Rep_String:
		    alt->pat->type = Rep_SearchString;
		    break;
		case Rep_CaseString:
		    alt->pat->type = Rep_SearchCaseString;
		    break;
		default:
		    break;
	    }
	}

	while (alt) {
	    orec_pat(inf, alt->pat);
	    alt = alt->next;
	}
    }

    return (inf->ecode);
}

static int
orec_pat(orec_inf *inf, rec_pat *pat)
{
    rec_pat *next;

    while (pat) {
	switch (pat->type) {
	    case Rep_AnyAnyTimes:
		if (pat->next == NULL) {
		    rec_grp *grp = inf->grp;

		    next = NULL;
		    while (grp) {
			next = grp->parent->next;
			/* Cannot check if is .*$ as the input
			 * may be a substring */
			if (next)
			    break;
			grp = grp->pgrp;
		    }
		    if (next == NULL) {
			/*    <re>.*    */
			pat->type = Rep_AnyEatAnyTimes;
			grp = inf->grp;
			while (grp) {
			    --grp->comp;
			    next = grp->parent->next;
			    if (next)
				break;
			    grp = grp->pgrp;
			}
		    }
		    else if (orec_pat_bad_rpt(inf, next))
			return (inf->ecode);
		}
		else if (orec_pat_bad_rpt(inf, pat->next))
		    return (inf->ecode);
		break;
	    case Rep_AnyMaybe:
		if (pat->next == NULL) {
		    rec_grp *grp = inf->grp;

		    next = NULL;
		    while (grp) {
			next = grp->parent->next;
			if (next)
			    break;
			grp = grp->pgrp;
		    }
		    if (next == NULL) {
			/*    <re>.?    */
			pat->type = Rep_AnyEatMaybe;
			grp = inf->grp;
			while (grp) {
			    --grp->comp;
			    next = grp->parent->next;
			    if (next)
				break;
			    grp = grp->pgrp;
			}
		    }
		    else if (orec_pat_bad_rpt(inf, next))
			return (inf->ecode);
		}
		else if (orec_pat_bad_rpt(inf, pat->next))
		    return (inf->ecode);
		break;
	    case Rep_AnyAtLeast:
		if (pat->next == NULL) {
		    rec_grp *grp = inf->grp;

		    next = NULL;
		    while (grp) {
			next = grp->parent->next;
			if (next)
			    break;
			grp = grp->pgrp;
		    }
		    if (next == NULL) {
			/*    <re>.+    */
			pat->type = Rep_AnyEatAtLeast;
			grp = inf->grp;
			while (grp) {
			    --grp->comp;
			    next = grp->parent->next;
			    if (next)
				break;
			    grp = grp->pgrp;
			}
		    }
		    else if (orec_pat_bad_rpt(inf, next))
			return (inf->ecode);
		}
		else if (orec_pat_bad_rpt(inf, pat->next))
		    return (inf->ecode);
		break;
	    case Rep_Range:
	    case Rep_RangeNot:
		orec_pat_rng(inf, pat);
		break;
	    case Rep_Group:
		orec_grp(inf, pat->data.grp);
		break;
	    default:
		break;
	}
	pat = pat->next;
    }

    return (inf->ecode);
}

static int
orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat)
{
    switch (pat->type) {
	/* Not really an error, but aren't supported by the library.
	 * Includes:  .*.*, .+<re>?  .*<re>*, (.*)(<re>*), etc.
	 */

	/* Not a repetition, but mathes anything... */
	case Rep_Any:

	/* Zero length matches */
	case Rep_Eol:
	    if (!(inf->flags & RE_NEWLINE))
		break;
	case Rep_Bol:
	case Rep_Bow:
	case Rep_Eow:

	/* Repetitions */
	case Rep_AnyAnyTimes:
	case Rep_AnyMaybe:
	case Rep_AnyAtLeast:
	    inf->ecode = RE_BADRPT;
	    break;

	/* Check if the first group element is a complex pattern */
	case Rep_Group:
	    if (pat->rep == NULL) {
		if (pat->data.grp->alt) {
		    for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) {
			if (orec_pat_bad_rpt(inf, pat))
			    break;
		    }
		}
		break;
	    }
	    /*FALLTHROUGH*/
	default:
	    if (pat->rep)
		inf->ecode = RE_BADRPT;
	    break;
    }

    if (!inf->ecode && pat && pat->next)
	orec_pat_bad_forward_rpt(inf, pat->next);

    return (inf->ecode);
}

static int
orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat)
{
    if (pat->rep) {
	switch (pat->rep->type) {
	    case Rer_MinMax:
		if (pat->rep->mine > 0)
		    break;
	    case Rer_AnyTimes:
	    case Rer_Maybe:
	    case Rer_Max:
		inf->ecode = RE_BADRPT;
	    default:
		break;
	}
    }
    else if (pat->type == Rep_Group &&
	     pat->data.grp->alt &&
	     pat->data.grp->alt->pat)
	orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat);

    return (inf->ecode);
}

static int
orec_grp(orec_inf *inf, rec_grp *grp)
{
    rec_grp *prev = inf->grp;

    inf->grp = grp;
    orec_alt(inf, grp->alt);
    /* Could also just say: inf->grp = grp->gparent */
    inf->grp = prev;

    return (inf->ecode);
}

static int
orec_pat_rng(orec_inf *inf, rec_pat *pat)
{
    int i, j[2], count;
    rec_pat_t type = pat->type;
    unsigned char *range = pat->data.rng->range;

    for (i = count = j[0] = j[1] = 0; i < 256; i++) {
	if (range[i]) {
	    if (count == 2) {
		++count;
		break;
	    }
	    j[count++] = i;
	}
    }

    if (count == 1 ||
	(count == 2 &&
	 ((islower(j[0]) && toupper(j[0]) == j[1]) ||
	  (isupper(j[0]) && tolower(j[0]) == j[1])))) {
	free(pat->data.rng);
	if (count == 1) {
	    pat->data.chr = j[0];
	    pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot;
	}
	else {
	    pat->data.cse.upper = j[0];
	    pat->data.cse.lower = j[1];
	    pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot;
	}
    }
    else {
	if (memcmp(re__alnum, range, 256) == 0)
	    type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot;
	else if (memcmp(re__odigit, range, 256) == 0)
	    type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot;
	else if (memcmp(re__ddigit, range, 256) == 0)
	    type = type == Rep_Range ? Rep_Digit : Rep_DigitNot;
	else if (memcmp(re__xdigit, range, 256) == 0)
	    type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot;
	else if (memcmp(re__control, range, 256) == 0)
	    type = type == Rep_Range ? Rep_Control : Rep_ControlNot;

	if (type != pat->type) {
	    free(pat->data.rng);
	    pat->type = type;
	}
    }

    return (inf->ecode);
}

/*  Join patterns if required, will only fail on memory allocation failure:
 */
static int
orec_pat_cse(orec_inf *inf, rec_pat *pat)
{
    rec_pat_t type;
    int i, len, length;
    rec_pat *ptr, *next;
    unsigned char *str, *tofree;

    if (pat->next == NULL && pat->type == Rep_CaseString)
	return (inf->ecode);

    type = Rep_CaseString;

    /* First calculate how many bytes will be required */
    for (ptr = pat, length = 1; ptr; ptr = ptr->next) {
	switch (ptr->type) {
	    case Rep_Literal:
		length += 2;
		break;
	    case Rep_String:
		length += strlen((char*)ptr->data.str) << 1;
		break;
	    case Rep_CaseLiteral:
		length += 2;
		break;
	    case Rep_CaseString:
		length += strlen((char*)ptr->data.str);
		break;
	    default:
		break;
	}
    }

    if ((str = malloc(length)) == NULL)
	return (inf->ecode = RE_ESPACE);

    for (ptr = pat, length = 0; ptr; ptr = next) {
	tofree = NULL;
	next = ptr->next;
	switch (ptr->type) {
	    case Rep_Literal:
		str[length++] = ptr->data.chr;
		str[length++] = ptr->data.chr;
		break;
	    case Rep_String:
		tofree = ptr->data.str;
		len = strlen((char*)tofree);
		for (i = 0; i < len; i++) {
		    str[length++] = tofree[i];
		    str[length++] = tofree[i];
		}
		break;
	    case Rep_CaseLiteral:
		str[length++] = ptr->data.cse.lower;
		str[length++] = ptr->data.cse.upper;
		break;
	    case Rep_CaseString:
		tofree = ptr->data.str;
		len = strlen((char*)tofree);
		memcpy(str + length, tofree, len);
		length += len;
		break;
	    default:
		break;
	}
	if (tofree)
	    free(tofree);
	if (ptr != pat)
	    free(ptr);
    }
    str[length] = '\0';

    pat->type = type;
    pat->data.str = str;
    pat->next = NULL;

    return (inf->ecode);
}

/*  Return 0 if the patterns in the list cannot be merged, 1 if will
 * be a simple string, 2 if a case string.
 *  This is useful when building an alternative list that is composed
 * only of strings, but the regex is case insensitive, in wich case
 * the first pass may have splited some patterns, but if it is a member
 * of an alternatives list, the cost of using a string list is smaller */
static int
orec_pat_cse_can(orec_inf *inf, rec_pat *pat)
{
    int ret;

    if (pat == NULL)
	return (0);

    for (ret = 1; pat; pat = pat->next) {
	if (pat->rep)
	    return (0);
	switch (pat->type) {
	    case Rep_Literal:
	    case Rep_String:
		break;
	    case Rep_CaseLiteral:
	    case Rep_CaseString:
		ret = 2;
		break;
	    default:
		return (0);
	}
    }

    return (ret);
}


/*  XXX If everything is a (case) byte, the pattern should be
 * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE)
 * as a string list works fine, but as a character range
 * should be faster, and maybe could be converted here. But not
 * very important, if performance is required, it should have already
 * been done in the pattern.
 */
static int
orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count)
{
    rec_stl *stl;
    rec_pat *pat;
    rec_alt *ptr, *next;
    int i, j, tlen, len, is;

    if ((stl = calloc(1, sizeof(rec_stl))) == NULL)
	return (inf->ecode = RE_ESPACE);

    if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) {
	free(stl);
	return (inf->ecode = RE_ESPACE);
    }

    if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) {
	free(stl->lens);
	free(stl);
	return (inf->ecode = RE_ESPACE);
    }

    if ((pat = calloc(1, sizeof(rec_pat))) == NULL) {
	free(stl->strs);
	free(stl->lens);
	free(stl);
	return (inf->ecode = RE_ESPACE);
    }

    pat->data.stl = stl;
    pat->type = Rep_StringList;
    stl->type = str ? Resl_StringList : Resl_CaseStringList;
    for (i = tlen = 0, ptr = alt; i < count; i++) {
	next = ptr->next;
	switch (ptr->pat->type) {
	    case Rep_Literal:
		is = len = 1;
		break;
	    case Rep_CaseLiteral:
		is = len = 2;
		break;
	    default:
		is = 0;
		len = strlen((char*)ptr->pat->data.str);
		break;
	}
	tlen += len;
	stl->lens[i] = len;
	if (!is) {
	    if (len > 2)
		stl->strs[i] = ptr->pat->data.str;
	    else {
		if (len == 1)
		    stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]);
		else
		    stl->strs[i] = (void*)(long)
				   (ptr->pat->data.str[0] |
				    ((int)ptr->pat->data.str[1] << 8));
		free(ptr->pat->data.str);
	    }
	}
	else {
	    if (is == 1)
		stl->strs[i] = (void*)(long)ptr->pat->data.chr;
	    else
		stl->strs[i] = (void*)(long)
			       (ptr->pat->data.cse.lower |
				(ptr->pat->data.cse.upper << 8));
	}
	free(ptr->pat);
	if (i)
	    free(ptr);
	ptr = next;
    }
    stl->tlen = tlen;
    stl->nstrs = count;

    alt->pat = pat;
    alt->next = NULL;

    {
	int li, lj;
	unsigned char ci, cj, *str;

	/*  Don't need a stable sort, there shouldn't be duplicated strings,
	 * but don't check for it either. Only need to make sure that all
	 * strings that start with the same byte are together */
	for (i = 0; i < count; i++) {
	    li = stl->lens[i];
	    ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff;
	    for (j = i + 1; j < count; j++) {
		lj = stl->lens[j];
		cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff;
		if ((count >= LARGE_STL_COUNT && cj < ci) ||
		    (cj == ci && lj > li)) {
		    /* If both strings start with the same byte,
		     * put the longer first */
		    str = stl->strs[j];
		    stl->strs[j] = stl->strs[i];
		    stl->strs[i] = str;
		    stl->lens[j] = li;
		    stl->lens[i] = lj;
		    li ^= lj; lj ^= li; li ^= lj;
		    ci ^= cj; cj ^= ci; ci ^= cj;
		}
	    }
	}
    }

    return (inf->ecode);
}