ref: 3b2d76319d53a02b1336609e7f1311d927595ce4
dir: /sys/src/cmd/upas/bayes/dfa.c/
#include <u.h>
#include <libc.h>
#include <bin.h>
#include <bio.h>
#include "regexp.h"
#include "regcomp.h"
#include "dfa.h"
/* debugging printers; both are defined below only when DUMP is defined */
void rdump(Reprog*);
void dump(Dreprog*);
/*
 * Standard NFA determinization and DFA minimization.
 */
typedef struct Deter Deter;
typedef struct Reiset Reiset;
/* debugging printer for the determinizer state; defined below only when DUMP is defined */
void ddump(Deter*);
/* state of determinization */
struct Deter
{
	jmp_buf kaboom;	/* jmp on error */
	Bin *bin;		/* bin for temporary allocations */
	Reprog *p;	/* program being determinized */
	uint ninst;		/* number of instructions in program */
	Reiset *alloc;	/* chain of all Reisets */
	Reiset **last;	/* link to store the next new Reiset in (tail of the alloc chain) */
	Reiset **hash;	/* hash of all Reisets */
	uint nhash;		/* number of buckets in hash */
	Reiset *tmp;	/* temporaries for walk */
	uchar *bits;	/* scratch: one byte per instruction, nonzero if in the set */
	Rune *c;		/* ``interesting'' characters */
	uint nc;		/* number of runes in c; c[nc] holds a terminating 0 */
};
/*
 * set of Reinsts: perhaps we should use a bit list instead of the indices?
 * Each distinct set is one state of the DFA under construction.
 */
struct Reiset
{
	uint *inst;		/* indices of instructions in set */
	uint ninst;		/* size of set */
	Reiset *next;	/* d.alloc chain */
	Reiset *hash;	/* d.hash chain */
	/*
	 * delta has 2*d.nc entries: entry n is the successor on rune
	 * d.c[n%d.nc], with eol false for n < d.nc and true otherwise.
	 */
	Reiset **delta;	/* where to go on each interesting char */
	uint id;		/* assigned id during minimization */
	uint isfinal;	/* is an accepting (final) state */
};
/*
 * Allocate a Reiset from d's bin with room for ninst instruction
 * indices.  The delta table (2*d->nc pointers) and the index array
 * are carved out of the same allocation, directly after the struct.
 * Jumps to d->kaboom if the allocation fails.  Only the delta and
 * inst pointers are initialized here.
 */
static Reiset*
ralloc(Deter *d, int ninst)
{
	Reiset *r;

	r = binalloc(&d->bin, sizeof(Reiset) + 2*d->nc*sizeof(Reiset*) + ninst*sizeof(uint), 0);
	if(r == nil)
		longjmp(d->kaboom, 1);

	/* trailing storage: delta table first, then the instruction indices */
	r->delta = (Reiset**)(r + 1);
	r->inst = (uint*)(r->delta + 2*d->nc);
	return r;
}
/*
 * Find the canonical form of a given Reiset.
 * Looks s up in d->hash by its instruction list; if an equal set is
 * already known, that one is returned.  Otherwise a copy of s is
 * allocated, entered in the hash table, appended to the d->alloc
 * chain, and returned.  The copy's delta table is filled in later.
 */
static Reiset*
findreiset(Deter *d, Reiset *s)
{
	int k, nbytes;
	uint bucket;
	Reiset *r;

	bucket = 0;
	for(k=0; k<s->ninst; k++)
		bucket = bucket*1000003 + s->inst[k];
	bucket %= d->nhash;

	nbytes = s->ninst*sizeof(s->inst[0]);
	for(r=d->hash[bucket]; r != nil; r=r->hash){
		if(r->ninst != s->ninst)
			continue;
		if(memcmp(r->inst, s->inst, nbytes) == 0)
			return r;
	}

	/* not seen before: copy it and link the copy into both chains */
	r = ralloc(d, s->ninst);
	r->ninst = s->ninst;
	memmove(r->inst, s->inst, nbytes);
	r->next = 0;
	r->hash = d->hash[bucket];
	d->hash[bucket] = r;
	*d->last = r;
	d->last = &r->next;
	return r;
}
/*
 * Convert a byte-per-instruction membership array into a real Reiset.
 * The indices of the nonzero bytes are collected, in increasing order,
 * into the scratch set d->tmp, and the canonical Reiset for that set
 * is returned.
 */
static Reiset*
bits2reiset(Deter *d, uchar *bits)
{
	int k;
	uint n;
	Reiset *tmp;

	tmp = d->tmp;
	n = 0;
	for(k=0; k<d->ninst; k++){
		if(bits[k] == 0)
			continue;
		tmp->inst[n] = k;
		n++;
	}
	tmp->ninst = n;
	return findreiset(d, tmp);
}
/*
 * Add instruction n to the state set bits.
 * Returns 1 if n was newly added and n < k, meaning the caller's
 * forward scan (currently at index k) has already passed n and
 * needs to go around again; returns 0 otherwise.
 */
static int
add(int n, uchar *bits, int k)
{
	if(bits[n] != 0)
		return 0;
	bits[n] = 1;
	if(n < k)
		return 1;
	return 0;
}
/*
 * Update bits to follow all the empty (non-character-related)
 * transitions possible.  BOL instructions are followed only if bol
 * is set, EOL instructions only if eol is set.  On return the set
 * holds no RBRA, LBRA, OR, BOL or EOL instructions.
 */
static void
followempty(Deter *d, uchar *bits, int bol, int eol)
{
	int changed, k;
	Reinst *base, *ip;

	base = d->p->firstinst;

	/* sweep forward until a whole pass adds nothing behind the scan point */
	for(;;){
		changed = 0;
		for(k=0; k < d->ninst; k++){
			if(bits[k] == 0)
				continue;
			ip = base + k;
			switch(ip->type){
			case LBRA:
			case RBRA:
				changed |= add(ip->next - base, bits, k);
				break;
			case OR:
				changed |= add(ip->left - base, bits, k);
				changed |= add(ip->right - base, bits, k);
				break;
			case BOL:
				if(bol)
					changed |= add(ip->next - base, bits, k);
				break;
			case EOL:
				if(eol)
					changed |= add(ip->next - base, bits, k);
				break;
			}
		}
		if(!changed)
			break;
	}

	/*
	 * Clear bits for useless transitions.  We could do this during
	 * the switch above, but then we have no guarantee of termination
	 * if we get a loop in the regexp.
	 */
	for(k=0; k < d->ninst; k++){
		if(bits[k] == 0)
			continue;
		switch(base[k].type){
		case LBRA:
		case RBRA:
		case OR:
		case BOL:
		case EOL:
			bits[k] = 0;
			break;
		}
	}
}
/*
 * Where does s go if it sees rune r?
 * Eol is true if a $ matches the string at the position just after r.
 * Returns the canonical Reiset of the successor state.  Jumps to
 * d->kaboom if s contains an empty-transition instruction (those are
 * removed by followempty) or an instruction of unknown type.
 */
static Reiset*
transition(Deter *d, Reiset *s, Rune r, uint eol)
{
	int k, hit;
	uchar *bits;
	Reinst *ip, *base;
	Rune *sp, *end;

	base = d->p->firstinst;
	bits = d->bits;
	memset(bits, 0, d->ninst);

	for(k=0; k < s->ninst; k++){
		ip = &base[s->inst[k]];
		switch(ip->type){
		case RUNE:
			if(ip->r == r)
				bits[ip->next - base] = 1;
			break;
		case ANY:
			if(r != L'\n')
				bits[ip->next - base] = 1;
			break;
		case ANYNL:
			bits[ip->next - base] = 1;
			break;
		case NCCLASS:
			/* a negated class never matches newline */
			if(r == L'\n')
				break;
			/* fall through */
		case CCLASS:
			end = ip->cp->end;
			hit = 0;
			for(sp = ip->cp->spans; sp < end; sp += 2){
				if(sp[0] <= r && r <= sp[1]){
					hit = 1;
					break;
				}
			}
			/* CCLASS advances when r is in a span, NCCLASS when it is not */
			if(ip->type == CCLASS ? hit : !hit)
				bits[ip->next - base] = 1;
			break;
		case END:
			break;
		case RBRA:
		case LBRA:
		case OR:
		case BOL:
		case EOL:
			werrstr("internal error: got type %d", ip->type);
			longjmp(d->kaboom, 1);
		default:
			werrstr("bad reprog: got type %d", ip->type);
			longjmp(d->kaboom, 1);
		}
	}

	/* a newline just consumed means the next position is at beginning of line */
	followempty(d, bits, r=='\n', eol);
	return bits2reiset(d, bits);
}
/*
 * Count the instructions in pp, starting at firstinst and stopping
 * at (not counting) the first instruction whose type is zero.
 */
static int
countinst(Reprog *pp)
{
	int count;
	Reinst *ip;

	count = 0;
	for(ip = pp->firstinst; ip->type != 0; ip++)
		count++;
	return count;
}
/*
 * Mark rune r in the two-level bitmap tab.  Each tab entry covers
 * 4096 runes and is allocated (zeroed) from d's bin on first use;
 * jumps to d->kaboom if that allocation fails.
 *
 * The caller's table has 65536/4096 entries, so only values up to
 * 0xFFFF can be represented.  Callers pass r-1 and r+1 of runes
 * found in the program, which can wrap or exceed 0xFFFF when Rune
 * (defined outside this file) is wider than 16 bits; such values
 * are clamped to 0xFFFF rather than indexing past the end of tab.
 */
static void
set(Deter *d, u32int **tab, Rune r)
{
	u32int *u;
	ulong x;

	x = r;
	if(x > 0xFFFF)
		x = 0xFFFF;
	if((u = tab[x/4096]) == nil){
		u = binalloc(&d->bin, 4096/8, 1);
		if(u == nil)
			longjmp(d->kaboom, 1);
		tab[x/4096] = u;
	}
	/* unsigned one: shifting a signed 1 into bit 31 overflows */
	u[(x%4096)/32] |= (u32int)1<<(x%32);
}
/*
 * Compute the list of important characters. 
 * Other characters behave like the ones that surround them.
 */
static void
findchars(Deter *d, Reprog *p)
{
	u32int *tab[65536/4096], *u, x;
	Reinst *i;
	Rune *rp, *ep;
	int k, m, n, a;
	memset(tab, 0, sizeof tab);
	set(d, tab, 0);
	set(d, tab, 0xFFFF);
	for(i=p->firstinst; i->type; i++){
		switch(i->type){
		case ANY:
			set(d, tab, L'\n'-1);
			set(d, tab, L'\n');
			set(d, tab, L'\n'+1);
			break;
		case RUNE:
			set(d, tab, i->r-1);
			set(d, tab, i->r);
			set(d, tab, i->r+1);
			break;
		case NCCLASS:
			set(d, tab, L'\n'-1);
			set(d, tab, L'\n');
			set(d, tab, L'\n'+1);
			/* fall through */
		case CCLASS:
			ep = i->cp->end;
			for(rp = i->cp->spans; rp < ep; rp += 2){
				set(d, tab, rp[0]-1);
				set(d, tab, rp[0]);
				set(d, tab, rp[1]);
				set(d, tab, rp[1]+1);
			}
			break;
		}
	}
	n = 0;
	for(k=0; k<nelem(tab); k++){
		if((u = tab[k]) == nil)
			continue;
		for(m=0; m<4096/32; m++){
			if((x = u[m]) == 0)
				continue;
			for(a=0; a<32; a++)
				if(x&(1<<a))
					n++;
		}
	}
	d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);
	if(d->c == 0)
		longjmp(d->kaboom, 1);
	d->nc = n;
	n = 0;
	for(k=0; k<nelem(tab); k++){
		if((u = tab[k]) == nil)
			continue;
		for(m=0; m<4096/32; m++){
			if((x = u[m]) == 0)
				continue;
			for(a=0; a<32; a++)
				if(x&(1<<a))
					d->c[n++] = k*4096+m*32+a;
		}
	}
	d->c[n] = 0;
	if(n != d->nc)
		abort();
}
/*
 * Convert the Deter and Reisets into a Dreprog.
 * If dp and c are nil, just return the count of Drecases needed.
 *
 * id2set[i] is a representative Reiset for minimized state i.  For
 * each state, runs of consecutive delta entries leading to the same
 * id are collapsed into one Drecase whose start encodes the eol flag
 * in bit 16 and the first rune of the run in the low bits.  A state
 * whose single case leads back to itself is marked isloop.
 */
static int
buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c)
{
	int state, col, prev, ncase, total;
	Dreinst *di;
	Reiset *s;

	total = 0;
	di = 0;
	for(state=0; state<nid; state++){
		s = id2set[state];
		if(c){
			di = &dp->inst[state];
			di->isfinal = s->isfinal;
		}

		ncase = 0;
		prev = -1;
		for(col=0; col<2*d->nc; col++){
			if(s->delta[col]->id == prev)
				continue;
			/* target changed: a new case starts at this column */
			prev = s->delta[col]->id;
			if(c){
				c[ncase].start = ((col/d->nc)<<16) | d->c[col%d->nc];
				c[ncase].next = &dp->inst[prev];
			}
			ncase++;
		}

		if(c){
			if(ncase == 1 && c[0].next == di)
				di->isloop = 1;
			di->c = c;
			di->nc = ncase;
			c += ncase;
		}
		total += ncase;
	}
	return total;
}
/*
 * Convert the NFA program p into an equivalent minimized DFA.
 * Returns a Dreprog allocated with mallocz, or nil on failure
 * (allocation failure, or a program transition() cannot handle).
 * All intermediate storage lives in a bin that is freed on return.
 */
Dreprog*
dregcvt(Reprog *p)
{
	uchar *bits;
	uint again, n, nid, id, nfinal, nstate;
	Deter d;
	Reiset **id2set, *s, *t, *start[4];
	Dreprog *dp;
	Drecase *c;

	memset(&d, 0, sizeof d);
	if(setjmp(d.kaboom)){
		binfree(&d.bin);
		return nil;
	}
	d.p = p;
	d.ninst = countinst(p);
	d.last = &d.alloc;
	n = d.ninst;
	/* round up to power of two; this loop is the least of our efficiency problems */
	while(n&(n-1))
		n++;
	d.nhash = n;
	d.hash = binalloc(&d.bin, d.nhash*sizeof(Reiset*), 1);
	if(d.hash == nil)
		longjmp(d.kaboom, 1);
	/* get list of important runes */
	findchars(&d, p);
#ifdef DUMP
	print("relevant chars are: «%S»\n", d.c+1);
#endif
	d.bits = bits = binalloc(&d.bin, d.ninst, 0);
	if(bits == nil)
		longjmp(d.kaboom, 1);
	d.tmp = ralloc(&d, d.ninst);
	/*
	 * Convert to DFA
	 */
	/* 4 start states, depending on initial bol, eol */
	for(n=0; n<4; n++){
		memset(bits, 0, d.ninst);
		bits[p->startinst - p->firstinst] = 1;
		followempty(&d, bits, n&1, n&2);
		start[n] = bits2reiset(&d, bits);
	}
	/* explore the reiset space; transition() appends new states to d.alloc as found */
	for(s=d.alloc; s; s=s->next)
		for(n=0; n<2*d.nc; n++)
			s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);
#ifdef DUMP
	nid = 0;
	for(s=d.alloc; s; s=s->next)
		s->id = nid++;
	ddump(&d);
#endif
	/*
	 * Minimize.
	 */
	/* first class division is final or not */
	nfinal = 0;
	nstate = 0;
	for(s=d.alloc; s; s=s->next){
		s->isfinal = 0;
		for(n=0; n<s->ninst; n++)
			if(p->firstinst[s->inst[n]].type == END)
				s->isfinal = 1;
		s->id = s->isfinal;
		nfinal += s->isfinal;
		nstate++;
	}
	nid = 2;
	if(nfinal == 0 || nfinal == nstate){
		/*
		 * Only one of the two classes is populated.  Start from a
		 * single class so that every id below nid has a state;
		 * otherwise id2set would keep a nil slot for the empty
		 * class and buildprog would dereference it.
		 */
		for(s=d.alloc; s; s=s->next)
			s->id = 0;
		nid = 1;
	}
	/* divide states with different transition tables in id space */
	do{
		again = 0;
		for(s=d.alloc; s; s=s->next){
			id = -1;
			for(t=s->next; t; t=t->next){
				if(s->id != t->id)
					continue;
				for(n=0; n<2*d.nc; n++){
					/* until we finish the for(t) loop, s->id and id are same */
					if((s->delta[n]->id == t->delta[n]->id)
					|| (s->delta[n]->id == s->id && t->delta[n]->id == id)
					|| (s->delta[n]->id == id && t->delta[n]->id == s->id))
						continue;
					break;
				}
				if(n == 2*d.nc)
					continue;
				if(id == -1)
					id = nid++;
				t->id = id;
				again = 1;
			}
		}
	}while(again);
#ifdef DUMP
	ddump(&d);
#endif
	/* build dreprog */
	id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);
	if(id2set == nil)
		longjmp(d.kaboom, 1);
	for(s=d.alloc; s; s=s->next)
		id2set[s->id] = s;
	/* first call only counts the Drecases so the result can be one allocation */
	n = buildprog(&d, id2set, nid, nil, nil);
	dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);
	if(dp == nil)
		longjmp(d.kaboom, 1);
	c = (Drecase*)&dp->inst[nid];
	buildprog(&d, id2set, nid, dp, c);
	for(n=0; n<4; n++)
		dp->start[n] = &dp->inst[start[n]->id];
	dp->ninst = nid;
	binfree(&d.bin);
	return dp;
}
/*
 * Run the DFA p over the NUL-terminated string s, starting at s[0].
 * Bol is nonzero if s starts at the beginning of a line.
 * Returns the length in bytes of the longest prefix of s that leaves
 * the DFA in a final state, or -1 if there is none.
 */
int
dregexec(Dreprog *p, char *s, int bol)
{
	Rune r;
	ulong rr;
	Dreinst *i;
	Drecase *c, *ec;
	int best, n;
	char *os;

	/*
	 * Pick the start state.  The eol bit must say whether $ matches
	 * at the first position, i.e. whether s[0] is a newline or the
	 * end of the string -- the same test the loop below applies to
	 * the position after each rune.  (Looking at s[1] instead reads
	 * past the terminator of an empty string.)
	 */
	i = p->start[(bol ? 1 : 0) | ((s[0]=='\n' || s[0]=='\0') ? 2 : 0)];
	best = -1;
	os = s;
	for(; *s; s+=n){
		if(i->isfinal)
			best = s - os;
		if(i->isloop){
			/* state can never be left: the answer is already known */
			if(i->isfinal)
				return strlen(os);
			else
				return best;
		}
		if((*s&0xFF) < Runeself){
			r = *s;
			n = 1;
		}else
			n = chartorune(&r, s);
		c = i->c;
		ec = c+i->nc;
		rr = r;
		/* bit 16 selects the cases for "$ matches after this rune" */
		if(s[n] == '\n' || s[n] == '\0')
			rr |= 0x10000;
		/* cases are sorted by start; take the last one with start <= rr */
		for(; c<ec; c++){
			if(c->start > rr){
				i = c[-1].next;
				goto Out;
			}
		}
		i = ec[-1].next;
	Out:;
	}
	if(i->isfinal)
		best = s - os;
	return best;
}
#ifdef DUMP
/*
 * Print every Reiset on d's alloc chain: its id, then one
 * " [rune target]" entry for each point in the delta table where the
 * target id changes.  Entries in the eol half of the table are
 * marked with a trailing $ after the rune.
 */
void
ddump(Deter *d)
{
	int col, last;
	Reiset *s;

	for(s=d->alloc; s != nil; s=s->next){
		print("%d ", s->id);
		last = -1;
		for(col=0; col<2*d->nc; col++){
			if(s->delta[col]->id == last)
				continue;
			if(col == 0)
				print(" [");
			else if(col/d->nc)
				print(" [%C$", d->c[col%d->nc]);
			else
				print(" [%C", d->c[col%d->nc]);
			print(" %d]", s->delta[col]->id);
			last = s->delta[col]->id;
		}
		print("\n");
	}
}
/*
 * Print the instructions of pp, one per line: index, type in octal,
 * and the indices of the left and right fields, followed by the rune
 * for RUNE instructions or the spans for character classes.  The
 * instruction with type zero is printed too and ends the listing.
 */
void
rdump(Reprog *pp)
{
	Reinst *ip;
	Rune *sp;

	for(ip = pp->firstinst; ; ip++){
		print("%ld:\t0%o\t%ld\t%ld", ip-pp->firstinst, ip->type,
			ip->left-pp->firstinst, ip->right-pp->firstinst);
		switch(ip->type){
		case RUNE:
			print("\t%C\n", ip->r);
			break;
		case CCLASS:
		case NCCLASS:
			print("\t[");
			if(ip->type == NCCLASS)
				print("^");
			for(sp = ip->cp->spans; sp < ip->cp->end; sp += 2){
				if(sp[0] == sp[1])
					print("%C", sp[0]);
				else
					print("%C-%C", sp[0], sp[1]);
			}
			print("]\n");
			break;
		default:
			print("\n");
			break;
		}
		if(ip->type == 0)
			break;
	}
}
/*
 * Print the DFA pp: the four start state indices, then for each
 * state its cases as " [first-last] target", where a $ after a rune
 * marks the eol half of the table, followed by " final" and " loop"
 * flags.
 */
void
dump(Dreprog *pp)
{
	int i, j;
	Dreinst *l;

	print("start %ld %ld %ld %ld\n",
		pp->start[0]-pp->inst,
		pp->start[1]-pp->inst,
		pp->start[2]-pp->inst,
		pp->start[3]-pp->inst);
	for(i=0; i<pp->ninst; i++){
		l = &pp->inst[i];
		print("%d:", i);
		for(j=0; j<l->nc; j++){
			print(" [");
			/*
			 * Sanity check: the first case must start at 0.
			 * findchars always marks rune 0 and buildprog always
			 * opens a case at the first column, so a program built
			 * by dregcvt has start 0 here.  (The check used to
			 * compare against 1, which aborted on every such
			 * program.)
			 */
			if(j == 0)
				if(l->c[j].start != 0)
					abort();
			if(j != 0)
				print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");
			print("-");
			if(j != l->nc-1)
				print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");
			print("] %ld", l->c[j].next - pp->inst);
		}
		if(l->isfinal)
			print(" final");
		if(l->isloop)
			print(" loop");
		print("\n");
	}
}
void
main(int argc, char **argv)
{
	int i;
	Reprog *p;
	Dreprog *dp;
	i = 1;
		p = regcomp(argv[i]);
		if(p == 0){
			print("=== %s: bad regexp\n", argv[i]);
		}
	//	print("=== %s\n", argv[i]);
	//	rdump(p);
		dp = dregcvt(p);
		print("=== dfa\n");
		dump(dp);
	
	for(i=2; i<argc; i++)
		print("match %d\n", dregexec(dp, argv[i], 0));
	exits(0);
}
#endif
/*
 * Write the DFA p to b in the text form read back by Breaddfa:
 * a "# dreprog" line; a line with the number of states, the total
 * number of cases and the four start state indices; then one line
 * per state giving isfinal, isloop, the case count, and a
 * "start next-index" pair for each case.
 */
void
Bprintdfa(Biobuf *b, Dreprog *p)
{
	int i, j, total;
	Dreinst *di;
	Drecase *dc;

	Bprint(b, "# dreprog\n");

	total = 0;
	for(i=0; i<p->ninst; i++)
		total += p->inst[i].nc;
	Bprint(b, "%d %d %zd %zd %zd %zd\n", p->ninst, total,
		p->start[0]-p->inst, p->start[1]-p->inst,
		p->start[2]-p->inst, p->start[3]-p->inst);

	for(i=0; i<p->ninst; i++){
		di = &p->inst[i];
		Bprint(b, "%d %d %d", di->isfinal, di->isloop, di->nc);
		for(j=0; j<di->nc; j++){
			dc = &di->c[j];
			Bprint(b, " %d %zd", dc->start, dc->next-p->inst);
		}
		Bprint(b, "\n");
	}
}
/*
 * Read a line ending in c from b with Brdline and replace its last
 * byte with a NUL.  Jumps to jb if Brdline returns nil.
 */
static char*
egetline(Biobuf *b, int c, jmp_buf jb)
{
	char *line;

	line = Brdline(b, c);
	if(line != nil){
		line[Blinelen(b)-1] = '\0';
		return line;
	}
	longjmp(jb, 1);
	return nil;
}
/*
 * Read one byte from b and require it to be c; jumps to jb otherwise.
 */
static void
egetc(Biobuf *b, int c, jmp_buf jb)
{
	int got;

	got = Bgetc(b);
	if(got == c)
		return;
	longjmp(jb, 1);
}
/*
 * Read a non-negative decimal number from b.
 * If want is nonzero, the number must be followed by the byte want,
 * which is consumed.  If want is zero, the byte following the digits
 * is pushed back and left for the caller.
 * Jumps to jb, with the error string set, if there are no digits,
 * the terminator is wrong, the value does not fit, or input ends.
 */
static int
egetnum(Biobuf *b, int want, jmp_buf jb)
{
	int c;
	int n, first;

	n = 0;
	first = 1;
	while((c = Bgetc(b)) != Beof){
		if(c < '0' || c > '9'){
			if(want == 0){
				Bungetc(b);
				c = 0;
			}
			if(first || c != want){
				werrstr("format error");
				longjmp(jb, 1);
			}
			return n;
		}
		/* reject values that would overflow n (the input is not trusted) */
		if(n > (0x7FFFFFFF - (c - '0'))/10){
			werrstr("number too large");
			longjmp(jb, 1);
		}
		n = n*10 + c - '0';
		first = 0;
	}
	werrstr("unexpected eof");
	longjmp(jb, 1);
	return -1;
}
/*
 * Read a DFA from b in the text form written by Bprintdfa.
 * Returns a Dreprog allocated with mallocz, or nil (with the error
 * string set for format errors) if the input is malformed, ends
 * early, or memory runs out.
 *
 * The input is not trusted: every state index must lie inside the
 * instruction array and the per-state case counts may not add up to
 * more than the total announced in the header, so nothing is written
 * or pointed outside the allocation.
 */
Dreprog*
Breaddfa(Biobuf *b)
{
	char *s;
	int ninst, nc, left, cnt, idx;
	ulong max;
	jmp_buf jb;
	/* volatile: assigned after setjmp and read in the error path */
	Dreprog *volatile p;
	Drecase *c;
	Dreinst *l;
	int j, k;

	p = nil;
	if(setjmp(jb)){
		free(p);
		return nil;
	}
	s = egetline(b, '\n', jb);
	if(strcmp(s, "# dreprog") != 0)
		goto Bad;
	ninst = egetnum(b, ' ', jb);
	nc = egetnum(b, ' ', jb);
	if(ninst <= 0 || nc < 0)
		goto Bad;
	/* keep the size computation below from wrapping around */
	max = ~(ulong)0 - sizeof(Dreprog);
	if(ninst > max/2/sizeof(Dreinst) || nc > max/2/sizeof(Drecase))
		goto Bad;
	p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);
	if(p == nil)
		longjmp(jb, 1);
	/* the cases are stored right after the instruction array */
	c = (Drecase*)&p->inst[ninst];
	for(k=0; k<4; k++){
		idx = egetnum(b, k==3 ? '\n' : ' ', jb);
		if(idx < 0 || idx >= ninst)
			goto Bad;
		p->start[k] = &p->inst[idx];
	}
	left = nc;
	for(j=0; j<ninst; j++){
		l = &p->inst[j];
		l->isfinal = egetnum(b, ' ', jb);
		l->isloop = egetnum(b, ' ', jb);
		cnt = egetnum(b, 0, jb);
		if(cnt < 0 || cnt > left)
			goto Bad;
		left -= cnt;
		l->nc = cnt;
		l->c = c;
		for(k=0; k<cnt; k++){
			egetc(b, ' ', jb);
			c->start = egetnum(b, ' ', jb);
			idx = egetnum(b, 0, jb);
			if(idx < 0 || idx >= ninst)
				goto Bad;
			c->next = &p->inst[idx];
			c++;
		}
		egetc(b, '\n', jb);
	}
	return p;

Bad:
	werrstr("format error");
	longjmp(jb, 1);
	return nil;
}