ref: d63acbc0752d218326e3372f4ee78a9be954f9b1
dir: /sys/src/cmd/upas/scanmail/common.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include "spam.h"
/* Sizes used while reading and scanning a message */
enum {
	Quanta	= 8192,		/* bytes asked of Bread per pass while readmsg hunts for the end of the header */
	Minbody = 6000,		/* number of body bytes readmsg tries to have in the buffer */
	HdrMax	= 15,		/* size of the scratch buffer endofhdr uses to test for a header keyword */
};
typedef struct keyword Keyword;
typedef struct word Word;
/* A literal prefix to recognize; tables of these end with a nil string */
struct word{
	char	*string;	/* text compared by isword */
	int	n;		/* number of bytes of string that isword compares */
};
/* Maps an action name in the pattern file to its action code */
struct	keyword{
	char	*string;	/* name compared by findkey */
	int	value;		/* code findkey returns for that name */
};
Word	htmlcmds[] =
{
	"html",		4,
	"!doctype html", 13,
	0,
};
Word	hrefs[] =
{
	"a href=",	7,
	"a title=",	8,
	"a target=",	9,
	"base href=",	10,
	"img src=",	8,
	"img border=",	11,
	"form action=", 12,
	"!--",		3,
	0,
};
/*
 *	RFC822 header keywords to look for when checking for a fractured header.
 *	All lengths must be less than HdrMax defined above.
 */
Word	hdrwords[] =
{
	"cc:",			3,
	"bcc:", 		4,
	"to:",			3,
	0,			0,
};
Keyword	keywords[] =
{
	"header",	HoldHeader,
	"line",		SaveLine,
	"hold",		Hold,
	"dump",		Dump,
	"loff",		Lineoff,
	0,		Nactions,
};
/*
 * Pattern sets, one per action, indexed by action code.  parsepats
 * fills in the regexps and strings members of each entry; they start
 * out nil.  The first member is a per-action label; its use is not
 * visible in this file.
 */
Patterns patterns[] = {
[Dump]		{ "DUMP:", 0, 0 },
[HoldHeader]	{ "HEADER:", 0, 0 },
[Hold]		{ "HOLD:", 0, 0 },
[SaveLine]	{ "LINE:", 0, 0 },
[Lineoff]	{ "LINEOFF:", 0, 0 },
[Nactions]	{ 0, 0, 0 },
};
/* Forward declarations of helpers defined later in this file */
static char*	endofhdr(char*, char*);
static	int	escape(char**);
static	int	extract(char*);
static	int	findkey(char*);
static	int	hash(int);
static	int	isword(Word*, char*, int);
static	void	parsealt(Biobuf*, char*, Spat**);
/*
 *	The canonicalizer: convert input to canonical representation
 */
/*
 * Read a message from bp into an allocated, NUL-terminated buffer.
 *
 * If hsize is non-nil, input is read Quanta bytes at a time until
 * endofhdr finds the end of the header, and *hsize is set to the
 * offset of the first byte after it.  If input runs out first, the
 * whole message counts as header.  If no end of header has been seen
 * after Maxread bytes, what has been read is returned with *hsize and
 * *bufsize both set to its length.  If hsize is nil, only one chunk is
 * read and the header is not looked for.
 *
 * One further read then tries to bring the text after the header up
 * to Minbody bytes.
 *
 * *bufsize, if non-nil, receives the number of bytes in the buffer.
 * Returns the buffer, which the caller must free, or 0 on a read
 * error or when there is no input at all.
 */
char*
readmsg(Biobuf *bp, int *hsize, int *bufsize)
{
	char *p, *buf;
	int n, offset, eoh, bsize, delta;

	buf = 0;
	offset = 0;
	if(bufsize)
		*bufsize = 0;
	if(hsize)
		*hsize = 0;
	for(;;) {
		buf = Realloc(buf, offset+Quanta+1);
		n = Bread(bp, buf+offset, Quanta);
		if(n < 0){
			free(buf);
			return 0;
		}
		p = buf+offset;			/* start of this chunk */
		offset += n;			/* end of this chunk */
		buf[offset] = 0;
		if(n == 0){
			if(offset == 0){
				free(buf);	/* empty input: don't leak the buffer */
				return 0;
			}
			break;
		}
		if(hsize == 0)			/* don't process header */
			break;
		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
			p--;
		p = endofhdr(p, buf+offset);
		if(p)
			break;
		if(offset >= Maxread)		/* gargantuan header - just punt */
		{
			if(hsize)
				*hsize = offset;
			if(bufsize)
				*bufsize = offset;
			return buf;
		}
	}
	eoh = p-buf;				/* End of header */
	bsize = offset - eoh;			/* amount of body already read */

	/* Read at least Minbody bytes of the body */
	if (bsize < Minbody){
		delta = Minbody-bsize;
		buf = Realloc(buf, offset+delta+1);
		n = Bread(bp, buf+offset, delta);
		if(n > 0) {			/* an error or EOF here just leaves a short body */
			offset += n;
			buf[offset] = 0;
		}
	}
	if(hsize)
		*hsize = eoh;
	if(bufsize)
		*bufsize = offset;
	return buf;
}
/*
 * Report whether text, of which len bytes are valid, starts with any
 * word in the nil-terminated table wp.  A word longer than len is
 * never compared.  Returns 1 on a match, 0 otherwise.
 */
static	int
isword(Word *wp, char *text, int len)
{
	Word *w;

	w = wp;
	while(w->string != 0){
		if(w->n <= len && strncmp(text, w->string, w->n) == 0)
			return 1;
		w++;
	}
	return 0;
}
/*
 * Look in [raw, end) for the blank line that ends the header.
 *
 * A blank line whose following line starts with one of hdrwords is
 * taken to be a break inside a fractured header and is skipped.
 * Returns a pointer to the first byte after the blank line, or 0 if
 * no end of header was found.
 *
 * The scan peeks one byte past a newline and copies until a NUL, so
 * it relies on the NUL that readmsg stores at end.
 */
static char*
endofhdr(char *raw, char *end)
{
	char word[HdrMax];
	char *scan, *next;
	int len;

	/*
	 * strchr can't be used to find the newlines because
	 * the text may contain embedded NULs.
	 */
	for(scan = raw; scan < end; scan++){
		if(scan[0] == '\n' && scan[1] == '\n'){
			scan++;			/* now at the second newline */

			/* lower-case copy of the start of the next line, through ':' or newline */
			len = 0;
			next = scan+1;
			while(len < sizeof(word) && *next != 0){
				word[len++] = tolower(*next);
				if(*next == ':' || *next == '\n')
					break;
				next++;
			}
			if(isword(hdrwords, word, len) == 0)
				return scan+1;
		}
	}
	return 0;
}
/*
 * Canonicalize the tag starting at text (just after its '<') and test
 * it against the word table wp.
 *
 * The tag text is copied up to a '>', to end, or until the local
 * buffer is full.  '=' escapes are decoded by escape, NULs and
 * carriage returns are dropped, runs of blanks, tabs and newlines
 * become one blank, and everything else is lower-cased.
 *
 * If n is non-nil, *n receives the number of input bytes consumed,
 * counting the '>' when one was found.  Returns the result of isword
 * on the canonical text.
 */
static	int
htmlmatch(Word *wp, char *text, char *end, int *n)
{
	char tag[MaxHtml];
	char *scan;
	int len, ch, prev;

	len = 0;
	prev = 0;
	scan = text;
	while(scan < end && len < sizeof(tag)-1){
		ch = *scan++;
		if(ch == '=')
			ch = escape(&scan);
		if(ch == '>')
			break;
		if(ch == 0 || ch == '\r')
			continue;
		if(ch == '\n' || ch == ' ' || ch == '\t'){
			if(prev == ' ')
				continue;	/* collapse a run of white space */
			ch = ' ';
		}else
			ch = tolower(ch);
		tag[len] = ch;
		len++;
		prev = ch;
	}
	tag[len] = 0;
	if(n != 0)
		*n = scan-text;
	return isword(wp, tag, len);
}
/*
 * Decode the text following an '='.  *msg points just past the '='.
 *
 *	=\n	the newline is dropped; the character after it is
 *		consumed and returned
 *	=2E	returns '.', =2F returns '/', =20 returns ' '
 *	=3D	returns '='
 *
 * The hex letters may be in either case.  For anything else '=' is
 * returned and nothing is consumed.  *msg is advanced past whatever
 * was consumed.
 */
static int
escape(char **msg)
{
	char *cur;
	int decoded;

	cur = *msg;
	switch(cur[0]){
	case '\n':
		decoded = cur[1];
		*msg = cur+2;
		return decoded;
	case '2':
		switch(tolower(cur[1])){
		case 'e':
			decoded = '.';
			break;
		case 'f':
			decoded = '/';
			break;
		case '0':
			decoded = ' ';
			break;
		default:
			return '=';	/* not one we decode; consume nothing */
		}
		*msg = cur+2;
		return decoded;
	case '3':
		if(tolower(cur[1]) == 'd')
			*msg = cur+2;
		return '=';
	default:
		return '=';
	}
}
/*
 * Decide what to do with the HTML tag at *msg, which points just
 * past its '<'.
 *
 * Returns '<' and leaves *msg alone if the tag is to be copied into
 * the output.  Otherwise the tag is skipped: the character following
 * it is returned and *msg is set just past that character.
 *
 * Until a tag from htmlcmds has been seen, only tags starting with
 * '!' are skipped.  After that every tag is skipped unless it matches
 * hrefs.  ishtml is static, so the setting carries over from one call
 * to the next.
 */
static int
htmlchk(char **msg, char *end)
{
	int n;
	char *p;
	static int ishtml;

	p = *msg;
	if(ishtml == 0){
		ishtml = htmlmatch(htmlcmds, p, end, &n);

		/* If not an HTML keyword, check if it's
		 * an HTML comment (<!comment>).  if so,
		 * skip over it; otherwise copy it in.
		 */
		if(ishtml == 0 && *p != '!')	/* not comment */
			return '<';		/* copy it */
	} else if(htmlmatch(hrefs, p, end, &n))	/* if special HTML string  */
		return '<';			/* copy it */

	/*
	 * this is an uninteresting HTML command; skip over it.
	 */
	p += n;
	/*
	 * htmlmatch can consume past end when escape swallows a
	 * trailing "=\n"; don't read the byte beyond the terminator.
	 */
	if(p > end)
		p = end;
	*msg = p+1;
	return *p;
}
/*
 * Decode the base64-encoded text in [msg, end) and canonicalize the
 * result into buf as body text (see convert for the meaning of
 * bufsize).
 *
 * The decoded text is NUL-terminated before it is handed to convert,
 * because the body scanners peek at the byte at its end.
 */
void
conv64(char *msg, char *end, char *buf, int bufsize)
{
	int len, size;
	char *cp;

	len = end - msg;
	if(len < 0)
		len = 0;
	size = (len*3)/4+1;	/* room for max chars + null */
	cp = Malloc(size);
	len = dec64((uchar*)cp, size, msg, len);
	/*
	 * dec64's return contract is not visible from here; keep the
	 * length inside the allocation whatever it reports.
	 */
	if(len < 0)
		len = 0;
	if(len >= size)
		len = size-1;
	cp[len] = 0;
	convert(cp, cp+len, buf, bufsize, 1);
	free(cp);
}
/*
 * Copy [msg, end) into buf in canonical form: NULs and carriage
 * returns dropped, each run of blanks, tabs and newlines reduced to a
 * single blank, and letters lower-cased.
 *
 * When isbody is non-zero, '<' is first passed to htmlchk and '='
 * to escape, repeatedly, until neither consumes any more input.
 * When isbody is zero, the text is also checked for
 * "Content-Transfer-Encoding: base64".
 *
 * At most bufsize characters are stored, followed by a NUL, so buf
 * must have room for bufsize+1 bytes.  Returns 1 if the base64
 * encoding header was seen, 0 otherwise.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{
	char *before;
	int ch, prev, sawbase64;

	prev = 0;
	sawbase64 = 0;
	while(msg < end && bufsize > 0){
		ch = *msg++;

		/*
		 * In the body only, try to strip most HTML and
		 * replace certain MIME escape sequences with the character
		 */
		if(isbody){
			do{
				before = msg;
				if(ch == '<')
					ch = htmlchk(&msg, end);
				if(ch == '=')
					ch = escape(&msg);
			}while(before != msg && before < end);
		}

		if(ch == 0 || ch == '\r')
			continue;
		if(ch == '\t' || ch == ' ' || ch == '\n'){
			if(prev == ' ')
				continue;
			ch = ' ';
		}else if(ch == 'C' || ch == 'c'){
			/* check for MIME base 64 encoding in header */
			if(!isbody && msg < end-32 && msg[0] == 'o' && msg[1] == 'n'
			&& cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
				sawbase64 = 1;
			ch = 'c';
		}else
			ch = tolower(ch);

		*buf++ = ch;
		prev = ch;
		bufsize--;
	}
	*buf = 0;
	return sawbase64;
}
/*
 *	The pattern parser: build data structures from the pattern file
 */
/*
 * Bucket number for a string pattern: the low seven bits of c,
 * which parsepats and matchpat take from the first character.
 * The result is always in the range 0 to 127.
 */
static int
hash(int c)
{
	enum { Bucketmask = 0x7f };

	return c & Bucketmask;
}
/*
 * Look up the action name val in keywords, comparing exactly.
 * Returns the action's code, or the code in the terminating row
 * (Nactions) if the name is not in the table.
 */
static	int
findkey(char *val)
{
	Keyword *entry;

	for(entry = keywords; entry->string != 0; entry++)
		if(strcmp(val, entry->string) == 0)
			return entry->value;
	return entry->value;	/* the end-of-table row */
}
/* True for a blank or a tab.  The argument is evaluated twice. */
#define	whitespace(c)	((c) == ' ' || (c) == '\t')
/*
 * Read the pattern file on bp and add every pattern in it to the
 * patterns[] entry for its action.
 *
 * A pattern line looks like
 *	[*]action: pattern[~~alternative...]
 * Leading blanks and tabs are skipped.  Empty lines, lines starting
 * with '#', lines without a ':' and lines whose action name findkey
 * does not know are ignored.  The action name is compared exactly as
 * written.  A leading '*' makes the pattern a literal string;
 * otherwise it is compiled as a regular expression.  The text after
 * the ':' is canonicalized in place by extract, and anything after a
 * "~~" is handed to parsealt as the pattern's alternatives.
 */
void
parsepats(Biobuf *bp)
{
	Pattern *p, *new;
	char *cp, *qp;
	int type, action, n, h;
	Spat *spat;
	for(;;){
		cp = Brdline(bp, '\n');
		if(cp == 0)
			break;
		cp[Blinelen(bp)-1] = 0;		/* replace the newline with a NUL */
		while(*cp == ' ' || *cp == '\t')
			cp++;
		if(*cp == '#' || *cp == 0)	/* comment or empty line */
			continue;
		type = regexp;
		if(*cp == '*'){
			type = string;
			cp++;
		}
		qp = strchr(cp, ':');
		if(qp == 0)
			continue;
		*qp = 0;			/* cp is now just the action name */
		if(debug)
			fprint(2, "action = %s\n", cp);
		action = findkey(cp);
		if(action >= Nactions)		/* unknown action */
			continue;
		cp = qp+1;
		n = extract(cp);		/* n is the length of the canonical text */
		if(n <= 0 || *cp == 0)
			continue;
		qp = strstr(cp, "~~");
		if(qp){				/* cut the pattern off at its alternatives */
			*qp = 0;
			n = strlen(cp);
		}
		if(debug)
			fprint(2, " Pattern: `%s'\n", cp);
			/*
			 * Regular expression: push it on the front of the action's chain.
			 * NOTE(review): parsealt may read further lines; bio(2) says a
			 * line from Brdline is only good until the next read, which is
			 * presumably why the pattern is compiled before parsealt runs.
			 */
		if(type == regexp) {
			new = Malloc(sizeof(Pattern));
			new->action = action;
			new->pat = regcomp(cp);
			if(new->pat == 0){	/* did not compile: drop it */
				free(new);
				continue;
			}
			new->type = regexp;
			new->alt = 0;
			new->next = 0;
			if(qp)
				parsealt(bp, qp+2, &new->alt);
			new->next = patterns[action].regexps;
			patterns[action].regexps = new;
			continue;
		}
			/*
			 * Literal string: copy it, then push it on the hash chain
			 * chosen by its first character.
			 */
		spat = Malloc(sizeof(*spat));
		spat->next = 0;
		spat->alt = 0;
		spat->len = n;
		spat->string = Malloc(n+1);
		spat->c1 = cp[1];		/* second character, compared by matchpat before the full compare */
		strcpy(spat->string, cp);
		if(qp)
			parsealt(bp, qp+2, &spat->alt);
		p = patterns[action].strings;
		if(p == 0) {			/* first string for this action: make the table */
			p = Malloc(sizeof(Pattern));
			memset(p, 0, sizeof(*p));
			p->action = action;
			p->type = string;
			patterns[action].strings = p;
		}
		h = hash(*spat->string);
		spat->next = p->spat[h];
		p->spat[h] = spat;
	}
}
/*
 * Parse the alternatives of a pattern and push each one on the front
 * of the list at *head.
 *
 * cp points at the canonicalized text following the pattern's first
 * "~~".  Alternatives are separated by "~~"; empty ones are ignored.
 * When the text ends with "~~", the list continues on the next line
 * of bp that is non-empty after extract.  Parsing stops at the last
 * alternative on a line or at end of file.
 */
static void
parsealt(Biobuf *bp, char *cp, Spat** head)
{
	char *p;
	Spat *alt;
	int n;

	while(cp){
		if(*cp == 0){		/* escaped newline: fetch the next usable line */
			do{
				cp = Brdline(bp, '\n');
				if(cp == 0)
					return;
				cp[Blinelen(bp)-1] = 0;
			} while(extract(cp) <= 0 || *cp == 0);
		}

		/* split off one alternative; cp becomes nil after the last one */
		p = cp;
		cp = strstr(p, "~~");
		if(cp){
			*cp = 0;
			cp += 2;
		}

		n = strlen(p);
		if(n > 0){
			/*
			 * Copy with Malloc, as parsepats does, rather than an
			 * unchecked strdup.  Malloc is used unchecked throughout
			 * this file, so it is presumably the allocator that does
			 * not return on failure.  Every field is set so that no
			 * member of the new entry is left undefined.
			 */
			alt = Malloc(sizeof(*alt));
			alt->alt = 0;
			alt->len = n;
			alt->c1 = p[1];
			alt->string = Malloc(n+1);
			strcpy(alt->string, p);
			alt->next = *head;
			*head = alt;
		}
	}
}
/*
 * Canonicalize the pattern text at cp in place and return its new
 * length.
 *
 * Leading blanks and tabs are dropped, upper-case ASCII letters are
 * lower-cased, and an unquoted '#' ends the text.  Double quotes are
 * removed and what they enclose is kept, with \" standing for a
 * quote character.  Trailing blanks and tabs are trimmed, but never
 * back into quoted text.
 */
static int
extract(char *cp)
{
	char *src, *dst, *floor;
	int ch;

	src = cp;
	dst = cp;
	floor = cp;		/* trimming may not go below this point */
	while(whitespace(*src))
		src++;
	for(ch = *src++; ch != 0 && ch != '#'; ch = *src++){
		if(ch != '"'){
			if(ch >= 'A' && ch <= 'Z')
				ch += 'a'-'A';
			*dst++ = ch;
			continue;
		}

		/* quoted string: copy through to the closing quote */
		for(; *src != 0 && *src != '"'; src++){
			if(src[0] == '\\' && src[1] == '"')
				src++;
			if(*src >= 'A' && *src <= 'Z')
				*dst++ = *src + ('a'-'A');
			else
				*dst++ = *src;
		}
		if(*src != 0)
			src++;		/* step over the closing quote */
		floor = dst;		/* never back up over a quoted string */
	}
	while(dst > floor && whitespace(dst[-1]))
		dst--;
	*dst = 0;
	return dst-cp;
}
/*
 *	The matching engine: compare canonical input to pattern structures
 */
/*
 * Look for any alternative on the list alt in the text being scanned.
 *
 * Each alternative's string is searched for in cmd (when cmd is not
 * empty), in header+1, and in message, skipping cmd or header+1 when
 * message is that same buffer.  Returns the first alternative found,
 * or nil if none occurs.
 */
static Spat*
isalt(char *message, Spat *alt)
{
	for(; alt != 0; alt = alt->next){
		if(*cmd != 0 && message != cmd && strstr(cmd, alt->string) != 0)
			return alt;
		if(message != header+1 && strstr(header+1, alt->string) != 0)
			return alt;
		if(strstr(message, alt->string) != 0)
			return alt;
	}
	return 0;
}
/*
 * Match the pattern set p against the canonical text in message.
 *
 * For a string set, each position in message is hashed on its
 * character and compared with the literals in that bucket.  A
 * literal counts only if none of its alternatives is found by isalt.
 * On success m->sp and m->ep are set to the ends of the matched text;
 * on failure m is left untouched.
 *
 * For a regular expression, m->sp and m->ep are cleared, regexec
 * fills in m, and the match is rejected if isalt finds one of the
 * pattern's alternatives.
 *
 * Returns 1 on a match, 0 otherwise.
 *
 * NOTE(review): parsepats stores the pattern's second character in
 * c1, which is NUL for a one-character literal, so such a literal
 * can only match the last character of message.
 */
int
matchpat(Pattern *p, char *message, Resub *m)
{
	Spat *spat;
	char *s;
	int c, c1;

	if(p->type == string){
		c1 = *message;
		for(s = message; (c = c1) != 0; s++){
			c1 = s[1];
			for(spat = p->spat[hash(c)]; spat; spat = spat->next){
				if(c1 != spat->c1)	/* cheap test on the second character */
					continue;
				/*
				 * strncmp rather than memcmp: near the end of
				 * message fewer than spat->len bytes remain, and
				 * strncmp stops at message's NUL.  The literal has
				 * no NUL in its first len bytes, so the result is
				 * the same.
				 */
				if(strncmp(s, spat->string, spat->len) != 0)
					continue;
				if(isalt(message, spat->alt))
					continue;
				m->sp = s;
				m->ep = s + spat->len;
				return 1;
			}
		}
		return 0;
	}
	m->sp = m->ep = 0;
	if(regexec(p->pat, message, m, 1) == 0)
		return 0;
	if(isalt(message, p->alt))
		return 0;
	return 1;
}
/*
 * Print the match m on fd with some context, in the form
 *	type before~match~after
 *
 * Nothing is printed if either end of the match is nil.  The context
 * reaches back about 30 characters from the match and then on to the
 * preceding blank, and forward about 30 characters and then on to the
 * next blank.
 *
 * NOTE(review): the backward scan stops only at a NUL or a blank, so
 * it assumes one lies in front of the text - confirm against callers.
 */
void
xprint(int fd, char *type, Resub *m)
{
	char *left, *right;
	int steps;

	if(m->sp == 0 || m->ep == 0)
		return;

	/* back up approx 30 characters to whitespace */
	left = m->sp;
	steps = 0;
	while(*left != 0 && steps < 30){
		steps++;
		left--;
	}
	while(*left != 0 && *left != ' ')
		left--;
	left++;

	/* grab about 30 more chars beyond the end of the match */
	right = m->ep;
	steps = 0;
	while(*right != 0 && steps < 30){
		steps++;
		right++;
	}
	while(*right != 0 && *right != ' ')
		right++;

	fprint(fd, "%s %.*s~%.*s~%.*s\n", type,
		utfnlen(left, m->sp-left), left,
		utfnlen(m->sp, m->ep-m->sp), m->sp,
		utfnlen(m->ep, right-m->ep), m->ep);
}
/* Entry in t64d for a character that has no value in the table */
enum {
	INVAL=	255
};
/*
 * Value of each character code as a base64 digit: 'A'-'Z' are 0-25,
 * 'a'-'z' are 26-51, '0'-'9' are 52-61, '+' is 62 and '/' is 63.
 * Every other code is INVAL.  The comment at the start of each pair
 * of rows is the hex code of its first entry.  The code that uses
 * the table is not visible in this part of the file.
 */
static uchar t64d[256] = {
/*00 */	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*10*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*20*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL,    62, INVAL, INVAL, INVAL,    63,
/*30*/	   52,	  53,	 54,	55,    56,    57,    58,    59,
	   60,	  61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*40*/	INVAL,    0,      1,     2,     3,     4,     5,     6,
	    7,    8,      9,    10,    11,    12,    13,    14,
/*50*/	   15,   16,     17,    18,    19,    20,    21,    22,
	   23,   24,     25, INVAL, INVAL, INVAL, INVAL, INVAL,
/*60*/	INVAL,   26,     27,    28,    29,    30,    31,    32,
	   33,   34,     35,    36,    37,    38,    39,    40,
/*70*/	   41,   42,     43,    44,    45,    46,    47,    48,
	   49,   50,     51, INVAL, INVAL, INVAL, INVAL, INVAL,
/*80*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*90*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*A0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*B0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*C0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*D0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*E0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*F0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
};