ref: f48b71b1a985df4b87e9169c893ee79eae456ac5
dir: /sys/src/cmd/awk/re.c/
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that the copyright notice and this
permission notice and warranty disclaimer appear in supporting
documentation, and that the name Lucent Technologies or any of
its entities not be used in advertising or publicity pertaining
to distribution of the software without specific, written prior
permission.
LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
#include <u.h>
#include <libc.h>
#include <ctype.h>
#include <bio.h>
#include <regexp.h>
#include "awk.h"
#include "y.tab.h"
	/* This file provides the interface between the main body of
	 * awk and the pattern matching package.  It preprocesses
	 * patterns prior to compilation to provide awk-like semantics
	 * to character sequences not supported by the pattern package.
	 * The following conversions are performed:
	 *
	 *	"()"		->	"[]*"
	 *	"[-"		->	"[\-"
	 *	"[^-"		->	"[^\-"
	 *	"-]"		->	"\-]"
	 *	"[]"		->	"[]*"
	 *	"\xdddd"	->	"\z" where 'z' is the UTF sequence
	 *					for the hex value
	 *	"\ddd"		->	"\o" where 'o' is a char octal value
	 *	"\b"		->	"\B"	where 'B' is backspace
	 *	"\t"		->	"\T"	where 'T' is tab
	 *	"\f"		->	"\F"	where 'F' is form feed
	 *	"\n"		->	"\N"	where 'N' is newline
	 *	"\r"		->	"\C"	where 'C' is cr
	 */
#define	MAXRE	512
static char	re[MAXRE];	/* copy buffer */
char	*patbeg;
int	patlen;			/* number of chars in pattern */
#define	NPATS	20		/* number of slots in pattern cache */
static struct pat_list		/* dynamic pattern cache */
{
	char	*re;
	int	use;
	Reprog	*program;
} pattern[NPATS];
static int npats;		/* cache fill level */
	/* Compile a pattern */
void
*compre(char *pat)
{
	int i, j, inclass;
	char c, *p, *s;
	Reprog *program;
	if (!compile_time) {	/* search cache for dynamic pattern */
		for (i = 0; i < npats; i++)
			if (!strcmp(pat, pattern[i].re)) {
				pattern[i].use++;
				return((void *) pattern[i].program);
			}
	}
		/* Preprocess Pattern for compilation */
	p = re;
	s = pat;
	inclass = 0;
	while (c = *s++) {
		if (c == '\\') {
			quoted(&s, &p, re+MAXRE);
			continue;
		}
		else if (!inclass && c == '(' && *s == ')') {
			if (p < re+MAXRE-2) {	/* '()' -> '[]*' */
				*p++ = '[';
				*p++ = ']';
				c = '*';
				s++;
			}
			else overflow();
		}
		else if (c == '['){			/* '[-' -> '[\-' */
			inclass = 1;
			if (*s == '-') {
				if (p < re+MAXRE-2) {
					*p++ = '[';
					*p++ = '\\';
					c = *s++;
				}
				else overflow();
			}				/* '[^-' -> '[^\-'*/
			else if (*s == '^' && s[1] == '-'){
				if (p < re+MAXRE-3) {
					*p++ = '[';
					*p++ = *s++;
					*p++ = '\\';
					c = *s++;
				}
				else overflow();
			}
			else if (*s == '['){		/* skip '[[' */
				if (p < re+MAXRE-1)
					*p++ = c;
				else overflow();
				c = *s++;
			}
			else if (*s == '^' && s[1] == '[') {	/* skip '[^['*/
				if (p < re+MAXRE-2) {
					*p++ = c;
					*p++ = *s++;
					c = *s++;
				}
				else overflow();
			}
			else if (*s == ']') {		/* '[]' -> '[]*' */
				if (p < re+MAXRE-2) {
					*p++ = c;
					*p++ = *s++;
					c = '*';
					inclass = 0;
				}
				else overflow();
			}
		}
		else if (c == '-' && *s == ']') {	/* '-]' -> '\-]' */
			if (p < re+MAXRE-1)
				*p++ = '\\';
			else overflow();
		}
		else if (c == ']')
			inclass = 0;
		if (p < re+MAXRE-1)
			*p++ = c;
		else overflow();
	}
	*p = 0;
	program = regcomp(re);		/* compile pattern */
	if (!compile_time) {
		if (npats < NPATS)	/* Room in cache */
			i = npats++;
		else {			/* Throw out least used */
			int use = pattern[0].use;
			i = 0;
			for (j = 1; j < NPATS; j++) {
				if (pattern[j].use < use) {
					use = pattern[j].use;
					i = j;
				}
			}
			xfree(pattern[i].program);
			xfree(pattern[i].re);
		}
		pattern[i].re = tostring(pat);
		pattern[i].program = program;
		pattern[i].use = 1;
	}
	return((void *) program);
}
	/* T/F match indication - matched string not exported */
int
match(void *p, char *s, char *)
{
	return regexec((Reprog *) p, (char *) s, 0, 0);
}
	/* match and delimit the matched string */
int
pmatch(void *p, char *s, char *start)
{
	Resub m;
	m.sp = start;
	m.ep = 0;
	if (regexec((Reprog *) p, (char *) s, &m, 1)) {
		patbeg = m.sp;
		patlen = m.ep-m.sp;
		return 1;
	}
	patlen = -1;
	patbeg = start;
	return 0;
}
	/* perform a non-empty match */
int
nematch(void *p, char *s, char *start)
{
	if (pmatch(p, s, start) == 1 && patlen > 0)
		return 1;
	patlen = -1;
	patbeg = start; 
	return 0;
}
/* in the parsing of regular expressions, metacharacters like . have */
/* to be seen literally;  \056 is not a metacharacter. */
hexstr(char **pp)	/* find and eval hex string at pp, return new p */
{
	char c;
	int n = 0;
	int i;
	for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
		if (isdigit(c))
			n = 16 * n + c - '0';
		else if ('a' <= c && c <= 'f')
			n = 16 * n + c - 'a' + 10;
		else if ('A' <= c && c <= 'F')
			n = 16 * n + c - 'A' + 10;
	}
	*pp += i;
	return n;
}
	/* look for awk-specific escape sequences */
#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
void
quoted(char **s, char **to, char *end)	/* handle escaped sequence */
{
	char *p = *s;
	char *t = *to;
	Rune c;
	switch(c = *p++) {
	case 't':
		c = '\t';
		break;
	case 'n':
		c = '\n';
		break;
	case 'f':
		c = '\f';
		break;
	case 'r':
		c = '\r';
		break;
	case 'b':
		c = '\b';
		break;
	default:
		if (t < end-1)		/* all else must be escaped */
			*t++ = '\\';
		if (c == 'x') {		/* hexadecimal goo follows */
			c = hexstr(&p);
			if (t < end-UTFmax)
				t += runelen(c);
			else overflow();
			*to = t;
			*s = p;
			return;
		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
			c -= '0';
			if (isoctdigit(*p)) {
				c = 8 * c + *p++ - '0';
				if (isoctdigit(*p))
					c = 8 * c + *p++ - '0';
			}
		}
		break;
	}
	if (t < end-1)
		*t++ = c;
	*s = p;
	*to = t;
}
	/*
	 * pattern package error handler.
	 * s is the error message text; it is reported through FATAL().
	 * FATAL is defined elsewhere -- presumably it does not return.
	 */
void
regerror(char *s)
{
	FATAL("%s", s);
}
/*
 * Report that a preprocessed regular expression does not fit in the
 * copy buffer.  The message is passed to FATAL(), which is defined
 * elsewhere -- presumably it does not return.
 */
void
overflow(void)
{
	FATAL("%s", "regular expression too big");
}