ref: a890ad992a9949d6c5f095b85aa1ef568566dc7b
dir: /sys/src/cmd/upas/bayes/msgclass.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "msgdb.h"
/*
 * Print the command synopsis on file descriptor 2 and exit
 * with status "usage".
 */
void
usage(void)
{
	fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... "
		"[-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n");
	exits("usage");
}
/* Fixed table sizes. */
enum
{
	MAXBEST = 32,	/* capacity of best[] */
	MAXLEN = 64,	/* bytes reserved for a word's text in Word.s */
	MAXTAB = 256,	/* limit on -d classes; sizes db[] and the per-word tables */
};
/* One message class, as named by a -d option. */
typedef struct Ndb Ndb;
struct Ndb
{
	char *name;	/* class name: first argument of -d */
	char *file;	/* database file: second argument of -d */
	Msgdb *db;	/* handle returned by mdopen(file, 0) */
	double p;	/* class probability computed in main, normalized over all classes */
	long nmsg;	/* count stored under the "*From*" key; presumably the number of messages */
};
/* Per-class statistics for one input word; arrays are indexed like db[]. */
typedef struct Word Word;
struct Word
{
	char s[MAXLEN];	/* word text, filled in by noteword (truncated to fit) */
	int count[MAXTAB];	/* weighted count of the word in each class database */
	double p[MAXTAB];	/* probability of each class given the word, clamped to [0.001, 0.999] */
	double mp;	/* largest entry of p[] */
	int mi; /* w.p[w.mi] = w.mp */
	int nmsg;	/* not referenced by the code visible in this file */
};
Ndb db[MAXTAB];	/* classes, in the order their -d options appeared */
int ndb;	/* number of entries used in db[] */
int add;	/* set by -a: add the input counts to the winning class */
int mul;	/* set by -m: multiplier for counts added under -a */
Msgdb *indb;	/* token counts of the input, filled by process */
Word best[MAXBEST];	/* words kept by noteword, largest mp first */
int mbest = 15;	/* how many words noteword keeps */
int nbest;	/* number of entries used in best[] */
void process(Biobuf*, char*);
void lockfile(char*);
/*
 * Offer word w (spelled s) to the best[] list.
 *
 * best[0..nbest) is kept in decreasing order of mp and never grows
 * beyond mbest entries.  The new word is placed ahead of every entry
 * whose mp it is not less than; if the list is already full the last
 * entry is dropped to make room, and a word that would land at or past
 * position mbest is ignored.  The text s is copied into the stored
 * entry, truncated to MAXLEN-1 bytes.
 */
void
noteword(Word *w, char *s)
{
	int pos;

	/* scan up from the tail to find the insertion slot */
	pos = nbest;
	while(pos > 0 && !(w->mp < best[pos-1].mp))
		pos--;

	if(pos >= mbest)
		return;

	/* full list: forget the last entry */
	if(nbest == mbest)
		nbest--;

	/* open a gap at pos */
	if(pos < nbest)
		memmove(&best[pos+1], &best[pos], (nbest-pos)*sizeof(best[0]));

	best[pos] = *w;
	strecpy(best[pos].s, best[pos].s+MAXLEN, s);
	nbest++;
}
/*
 * Classify a tokenized message against the -d class databases.
 *
 *	1. Parse options, then tally the tokens from the named token
 *	   files (or standard input) into indb via process.
 *	2. Take the lock named by -l and open every class database.
 *	3. For each input token compute, per class, count/nmsg (capped
 *	   at 1), normalize across classes, clamp to [0.001, 0.999] and
 *	   hand the word to noteword, which keeps the mbest words with
 *	   the largest single-class probability.
 *	4. Multiply the kept words' probabilities per class, normalize,
 *	   and pick the largest; if it is below -t thresh the verdict
 *	   is "inconclusive".
 *	5. Print the verdict line followed by one line per kept word.
 *	6. With -a and a conclusive verdict, add the input counts
 *	   (times mul) to the winning database.
 */
void
main(int argc, char **argv)
{
	int i, bad, m, tot, nn, j;
	Biobuf bin, *b, bout;
	char *s, *lf;
	double totp, p, thresh;
	long n;
	Word w;

	lf = nil;
	thresh = 0;
	ARGBEGIN{
	case 'a':
		add = 1;
		break;
	case 'd':
		if(ndb >= MAXTAB)
			sysfatal("too many db classes");
		db[ndb].name = EARGF(usage());
		db[ndb].file = EARGF(usage());
		ndb++;
		break;
	case 'l':
		lf = EARGF(usage());
		break;
	case 'm':
		mul = atoi(EARGF(usage()));
		break;
	case 't':
		thresh = atof(EARGF(usage()));
		break;
	default:
		usage();
	}ARGEND

	if(ndb == 0){
		fprint(2, "must have at least one -d option\n");
		usage();
	}

	/* mdopen reports failure with nil (see the class databases below) */
	indb = mdopen(nil, 1);
	if(indb == nil)
		sysfatal("mdopen: %r");

	if(argc == 0){
		Binit(&bin, 0, OREAD);
		process(&bin, "<stdin>");
		Bterm(&bin);
	}else{
		/* read what can be read, then fail if any file was missing */
		bad = 0;
		for(i=0; i<argc; i++){
			if((b = Bopen(argv[i], OREAD)) == nil){
				fprint(2, "opening %s: %r\n", argv[i]);
				bad = 1;
				continue;
			}
			process(b, argv[i]);
			Bterm(b);
		}
		if(bad)
			exits("open inputs");
	}

	lockfile(lf);
	bad = 0;
	for(i=0; i<ndb; i++){
		if((db[i].db = mdopen(db[i].file, 0)) == nil){
			fprint(2, "opendb %s: %r\n", db[i].file);
			bad = 1;
			continue;	/* no handle to ask for "*From*" */
		}
		db[i].nmsg = mdget(db[i].db, "*From*");
	}
	if(bad)
		exits("open databases");

	/* run conditional probabilities of input words, getting 15 most specific */
	mdenum(indb);
	nbest = 0;
	while(mdnext(indb, &s, &n) >= 0){
		tot = 0;
		totp = 0.0;
		for(i=0; i<ndb; i++){
			/* counts from the first -d database weigh three times */
			nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
			tot += nn;
			w.count[i] = nn;
			/*
			 * An empty database has nmsg 0; dividing would give
			 * 0/0 = NaN and spoil every later sum.  Treat a seen
			 * word as certain and an unseen one as impossible.
			 */
			if(db[i].nmsg > 0)
				p = nn/(double)db[i].nmsg;
			else
				p = nn > 0 ? 1.0 : 0.0;
			if(p >= 1.0)
				p = 1.0;
			w.p[i] = p;
			totp += p;
		}
		/* too rare in the databases to say anything */
		if(tot < 2)
			continue;

		/* tot >= 2 means some count was positive, so totp > 0 here */
		w.mp = 0.0;
		for(i=0; i<ndb; i++){
			p = w.p[i];
			p /= totp;
			if(p < 0.001)
				p = 0.001;
			else if(p > 0.999)
				p = 0.999;
			if(p > w.mp){
				w.mp = p;
				w.mi = i;
			}
			w.p[i] = p;
		}
		noteword(&w, s);
	}

	/* compute conditional probabilities of message classes using 15 most specific */
	totp = 0.0;
	for(i=0; i<ndb; i++){
		p = 1.0;
		for(j=0; j<nbest; j++)
			p *= best[j].p[i];
		db[i].p = p;
		totp += p;
	}
	for(i=0; i<ndb; i++)
		db[i].p /= totp;

	/* most probable class; the earliest wins a tie */
	m = 0;
	for(i=1; i<ndb; i++)
		if(db[i].p > db[m].p)
			m = i;

	Binit(&bout, 1, OWRITE);
	if(db[m].p < thresh)
		m = -1;
	if(m >= 0)
		Bprint(&bout, "%s", db[m].name);
	else
		Bprint(&bout, "inconclusive");
	for(j=0; j<ndb; j++)
		Bprint(&bout, " %s=%g", db[j].name, db[j].p);
	Bprint(&bout, "\n");

	/*
	 * One line per kept word.  Nothing is printed after the loop:
	 * best[nbest] is not a kept word, and m may be -1 here.
	 */
	for(i=0; i<nbest; i++){
		Bprint(&bout, "%s", best[i].s);
		for(j=0; j<ndb; j++)
			Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
		Bprint(&bout, "\n");
	}
	Bterm(&bout);

	if(m >= 0 && add){
		/*
		 * NOTE(review): mul is a zero-initialized global that only
		 * -m sets, so -a without -m adds n*0 to every count.
		 * Confirm whether a default of 1 was intended.
		 */
		mdenum(indb);
		while(mdnext(indb, &s, &n) >= 0)
			mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
		mdclose(db[m].db);
	}
	exits(nil);
}
/*
 * Tally the token lines of b into indb.
 *
 * Each newline-terminated line is either "token" or "token count",
 * the count being whatever follows the last space.  A line with no
 * space counts once.  The count is added to the value indb already
 * holds for the token.  Reading stops when Brdline returns nil.
 * The second (unnamed) parameter is not used.
 */
void
process(Biobuf *b, char*)
{
	char *line, *sep;
	long cnt;

	for(;;){
		line = Brdline(b, '\n');
		if(line == nil)
			break;
		/* replace the trailing newline with a terminator */
		line[Blinelen(b)-1] = 0;

		cnt = 1;
		sep = strrchr(line, ' ');
		if(sep != nil){
			*sep = 0;
			cnt = atoi(sep+1);
		}
		mdput(indb, line, mdget(indb, line)+cnt);
	}
}
/* pid of the child forked by lockfile */
int tpid;

/*
 * Exit handler registered by lockfile: post the note "die"
 * to the child whose pid is in tpid.
 */
void
killtickle(void)
{
	int child;

	child = tpid;
	postnote(PNPROC, child, "die");
}
/*
 * Acquire the lock file s and arrange for it to stay open until exit.
 * A nil s means no locking is wanted.
 *
 * The file is opened for reading.  While the open fails with an error
 * string mentioning "file is locked" or "exclusive lock", the open is
 * retried after a sleep that starts at 50 and grows by half each round
 * until it reaches 1000; the wait is abandoned once the sleeps total
 * more than 120*1000.  Any other open error ends the loop at once.
 * Failure to open is fatal.
 *
 * On success a child is forked that calls dirfstat on the descriptor
 * every 30*1000 (presumably so the lock is not considered stale --
 * confirm against the file server's lock timeout).  The parent closes
 * its own copy of the descriptor, records the child's pid in tpid and
 * registers killtickle to stop the child at exit.
 */
void
lockfile(char *s)
{
	int fd, t, w;
	char err[ERRMAX];

	if(s == nil)
		return;

	w = 50;
	t = 0;
	for(;;){
		fd = open(s, OREAD);
		if(fd >= 0)
			break;
		rerrstr(err, sizeof err);
		/* only contention is worth waiting out */
		if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)
			break;
		sleep(w);
		t += w;
		if(w < 1000)
			w = (w*3)/2;
		if(t > 120*1000)
			break;
	}
	if(fd < 0)
		sysfatal("could not lock %s", s);

	switch(tpid = fork()){
	case -1:
		sysfatal("fork: %r");
	case 0:
		/* child: touch the descriptor periodically until killed */
		for(;;){
			sleep(30*1000);
			free(dirfstat(fd));
		}
		_exits(nil);
	default:
		break;
	}
	/* the child keeps the descriptor open */
	close(fd);
	atexit(killtickle);
}