rep.h   [plain text]


/*
 * Copyright (c) 2002 by The XFree86 Project, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Except as contained in this notice, the name of the XFree86 Project shall
 * not be used in advertising or otherwise to promote the sale, use or other
 * dealings in this Software without prior written authorization from the
 * XFree86 Project.
 *
 * Author: Paulo César Pereira de Andrade
 */

/* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */

#include "re.h"

#ifndef _rep_h
#define _rep_h

/*
 * Local defines
 */

#ifdef MIN
#undef MIN
#endif
#define MIN(a, b)	((a) < (b) ? (a) : (b))

#ifdef MAX
#undef MAX
#endif
#define MAX(a, b)	((a) > (b) ? (a) : (b))

/*  This value can not be larger than 255, a depth value is the nesting of
 * repetition operations and alternatives. The number of nested parenthesis
 * does not matter, but a repetition on the pattern inside the parenthesis
 * does. Note also that you cannot have more than 9 parenthesis pairs in
 * an expression.
 *  Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed
 * 7 complex repetitions. A complex repetition is a dot followed by an
 * repetition operator. It is called a complex repetition because dot
 * matches anything but the empty string, so the engine needs to test
 * all possible combinations until the end of the string is found.
 *  Repetitions like .* use one depth until the end of the string is found,
 * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2.
 */
#define MAX_DEPTH	8

/*  Minimum number of strings to generate a "large" string list, that is,
 * sort the strings and allocate 512 extra bytes to map the first string
 * with a given initial byte. */
#define LARGE_STL_COUNT	16

/*
 * Local types
 */
/* Intermediate compilation types declaration */
	/* (r)egular (e)xpression (c)ompile (c)a(se) */
typedef struct _rec_cse rec_cse;

	/* (r)egular (e)xpression (c)ompile (r)a(ng)e */
typedef struct _rec_rng rec_rng;

	/* (r)egular (e)xpression (c)ompile (pat)tern */
typedef struct _rec_pat rec_pat;

	/* (r)egular (e)xpression (c)ompile (rep)etition */
typedef struct _rec_rep rec_rep;

	/* (r)egular (e)xpression (c)ompile (gr)ou(p) */
typedef struct _rec_grp rec_grp;

	/* (r)egular (e)xpression (c)ompile (alt)ernatives */
typedef struct _rec_alt rec_alt;


/* Optimization types */
	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist */
typedef struct _rec_stl rec_stl;

/* Final compilation and execution types */
	/* (re)gular expression (inf)ormation */
typedef struct _re_inf re_inf;

	/* (re)gular expression (eng)ine */
typedef struct _re_eng re_eng;


/* Codes used by the engine */
typedef enum {
    /* Grouping */
    Re_Open,			/* ( */
    Re_Close,			/* ) */
    Re_Update,			/* Like Re_Close, but is inside a loop */

    /* Alternatives */
    Re_Alt,			/* Start alternative list, + next offset */
    Re_AltNext,			/* Next alternative, + next offset */
    Re_AltDone,			/* Finish alternative list */

    /* Repetition */
    Re_AnyTimes,		/* * */
    Re_Maybe,			/* ? */
    Re_AtLeast,			/* +, at least one */

    /* Repetition like */
    Re_AnyAnyTimes,		/* .*<re> */
    Re_AnyMaybe,		/* .?<re> */
    Re_AnyAtLeast,		/* .+<re> */

    Re_AnyEatAnyTimes,		/* Expression ends with .* */
    Re_AnyEatMaybe,		/* Expression ends with .? */
    Re_AnyEatAtLeast,		/* Expression ends with .+ */

    /* Repetition with arguments */
    Re_Exact,			/* {e} */
    Re_Min,			/* {n,} */
    Re_Max,			/* {,m} */
    Re_MinMax,			/* {n,m} */

    /* Repetition helper instruction */
    Re_RepJump,			/* Special code, go back to repetition */
    Re_RepLongJump,		/* Jump needs two bytes */
	/*  After the repetition data, all repetitions have an offset
	 * to the code after the repetition */

    /* Matching */
    Re_Any,			/* . */
    Re_Odigit,			/* \o */
    Re_OdigitNot,		/* \O */
    Re_Digit,			/* \d */
    Re_DigitNot,		/* \D */
    Re_Xdigit,			/* \x */
    Re_XdigitNot,		/* \x */
    Re_Space,			/* \s */
    Re_SpaceNot,		/* \S */
    Re_Tab,			/* \t */
    Re_Newline,			/* \n */
    Re_Lower,			/* \l */
    Re_Upper,			/* \u */
    Re_Alnum,			/* \w */
    Re_AlnumNot,		/* \W */
    Re_Control,			/* \c */
    Re_ControlNot,		/* \C */
    Re_Bol,			/* ^ */
    Re_Eol,			/* $ */
    Re_Bow,			/* \< */
    Re_Eow,			/* \> */

    /* Range matching information */
    Re_Range,			/* + 256 bytes */
    Re_RangeNot,		/* + 256 bytes */

    /* Matching with arguments */
    Re_Literal,			/* + character */
    Re_CaseLiteral,		/* + lower + upper */
    Re_LiteralNot,		/* + character */
    Re_CaseLiteralNot,		/* + lower + upper */
    Re_String,			/* + length + string */
    Re_CaseString,		/* + length + string in format lower-upper */

    /* These are useful to start matching, or when RE_NOSPEC is used. */
    Re_SearchLiteral,
    Re_SearchCaseLiteral,
    Re_SearchString,
    Re_SearchCaseString,

    Re_StringList,		/* + total-length + lengths + strings */
    Re_CaseStringList,		/* + total-length + lengths + strings */

    Re_LargeStringList,		/* + total-length + lengths + map + strings */
    Re_LargeCaseStringList,	/* + total-length + lengths + map + strings */

    /* Backreference */
    Re_Backref,			/* + reference number */

    /* The last codes */
    Re_DoneIf,			/* Done if at end of input */
    Re_MaybeDone,		/* Done */
    Re_Done			/* If this code found, finished execution */
} ReCode;


/* (r)egular (e)xpresssion (pat)rern (t)ype */
typedef enum _rec_pat_t {
    Rep_Literal			= Re_Literal,
    Rep_CaseLiteral		= Re_CaseLiteral,
    Rep_LiteralNot		= Re_LiteralNot,
    Rep_CaseLiteralNot		= Re_CaseLiteralNot,
    Rep_Range			= Re_Range,
    Rep_RangeNot		= Re_RangeNot,
    Rep_String			= Re_String,
    Rep_CaseString		= Re_CaseString,
    Rep_SearchLiteral		= Re_SearchLiteral,
    Rep_SearchCaseLiteral	= Re_SearchCaseLiteral,
    Rep_SearchString		= Re_SearchString,
    Rep_SearchCaseString	= Re_SearchCaseString,
    Rep_Any			= Re_Any,
    Rep_AnyAnyTimes		= Re_AnyAnyTimes,
    Rep_AnyEatAnyTimes		= Re_AnyEatAnyTimes,
    Rep_AnyMaybe		= Re_AnyMaybe,
    Rep_AnyEatMaybe		= Re_AnyEatMaybe,
    Rep_AnyAtLeast		= Re_AnyAtLeast,
    Rep_AnyEatAtLeast		= Re_AnyEatAtLeast,
    Rep_Odigit			= Re_Odigit,
    Rep_OdigitNot		= Re_OdigitNot,
    Rep_Digit			= Re_Digit,
    Rep_DigitNot		= Re_DigitNot,
    Rep_Xdigit			= Re_Xdigit,
    Rep_XdigitNot		= Re_XdigitNot,
    Rep_Space			= Re_Space,
    Rep_SpaceNot		= Re_SpaceNot,
    Rep_Tab			= Re_Tab,
    Rep_Newline			= Re_Newline,
    Rep_Lower			= Re_Lower,
    Rep_Upper			= Re_Upper,
    Rep_Alnum			= Re_Alnum,
    Rep_AlnumNot		= Re_AlnumNot,
    Rep_Control			= Re_Control,
    Rep_ControlNot		= Re_ControlNot,
    Rep_Bol			= Re_Bol,
    Rep_Eol			= Re_Eol,
    Rep_Bow			= Re_Bow,
    Rep_Eow			= Re_Eow,
    Rep_Backref			= Re_Backref,
    Rep_StringList		= Re_StringList,
    Rep_Group			= Re_Open
} rec_pat_t;


/* (r)egular (e)xpression (rep)etition (t)ype */
typedef enum _rec_rep_t {
    Rer_AnyTimes		= Re_AnyTimes,
    Rer_AtLeast			= Re_AtLeast,
    Rer_Maybe			= Re_Maybe,
    Rer_Exact			= Re_Exact,
    Rer_Min			= Re_Min,
    Rer_Max			= Re_Max,
    Rer_MinMax			= Re_MinMax
} rec_rep_t;


/*  Decide at re compilation time what is lowercase and what is uppercase */
struct _rec_cse {
    unsigned char lower;
    unsigned char upper;
};


/*  A rec_rng is used only during compilation, just a character map */
struct _rec_rng {
    unsigned char range[256];
};


/*  A rec_pat is used only during compilation, and can be viewed as
 * a regular expression element like a match to any character, a match
 * to the beginning or end of the line, etc.
 *  It is implemented as a linked list, and does not have nesting.
 *  The data field can contain:
 *	chr:	the value of a single character to match.
 *	cse:	the upper and lower case value of a character to match.
 *	rng:	a character map to match or not match.
 *	str:	a simple string or a string where every two bytes
 *		represents the character to match, in lower/upper
 *		case sequence.
 *  The rep field is not used for strings, strings are broken in the
 * last character in this case. That is, strings are just a concatenation
 * of several character matches.
 */
struct _rec_pat {
    rec_pat_t type;
    rec_pat *next, *prev;	/* Linked list information */
    union {
	unsigned char chr;
	rec_cse cse;
	rec_rng *rng;
	rec_grp *grp;
	unsigned char *str;
	rec_stl *stl;
    } data;
    rec_rep *rep;		/* Pattern repetition information */
};


/*  A rec_rep is used only during compilation, and can be viewed as:
 *
 *	? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>}
 *
 * where <e> is "exact", <m> is "minimum" and <M> is "maximum".
 *  In the compiled step it can also be just a NULL pointer, that
 * is actually equivalent to {1}.
 */
struct _rec_rep {
    rec_rep_t type;
    short mine;			/* minimum or exact number of matches */
    short maxc;			/* maximum number of matches */
};


/*  A rec_alt is used only during compilation, and can be viewed as:
 *
 *	<re>|<re>
 *
 * where <re> is any regular expression. The expressions are nested
 * using the grp field of the rec_pat structure.
 */
struct _rec_alt {
    rec_alt *next, *prev;	/* Linked list information */
    rec_pat *pat;
};


/*  A rec_grp is a place holder for expressions enclosed in parenthesis
 * and is linked to the compilation data by an rec_pat structure. */
struct _rec_grp {
    rec_pat *parent;		/* Reference to parent pattern */
    rec_alt *alt;		/* The pattern information */
    rec_alt *palt;		/* Parent alternative */
    rec_grp *pgrp;		/* Nested groups */
    int comp;			/* (comp)lex repetition pattern inside group */
};


/* Optimization compilation types definition */
	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */
typedef enum {
    Resl_StringList		= Re_StringList,
    Resl_CaseStringList		= Re_CaseStringList
} rec_stl_t;

struct _rec_stl {
    rec_stl_t type;
    int nstrs;			/* Number of strings in list */
    int tlen;			/* Total length of all strings */
    unsigned char *lens;	/* Vector of string lengths */
    unsigned char **strs;	/* The strings */
};


/*
 * Prototypes
 */
	/* rep.c */
rec_alt *irec_comp(const char*, const char*, int, int*);
void irec_free_alt(rec_alt*);

	/* reo.c */
int orec_comp(rec_alt*, int);
void orec_free_stl(rec_stl*);

#endif /* _rep_h */