code: purgatorio

ref: 82b046f36f8084a22bbb5d71edd0edd9179561eb
dir: /appl/cmd/diff.b/

View raw version
implement Diff;

#	diff - differential file comparison
#
#	Uses an algorithm due to Harold Stone, which finds
#	a pair of longest identical subsequences in the two
#	files.
#
#	The major goal is to generate the match vector J.
#	J[i] is the index of the line in file1 corresponding
#	to line i file0. J[i] = 0 if there is no
#	such line in file1.
#
#	Lines are hashed so as to work in core. All potential
#	matches are located by sorting the lines of each file
#	on the hash (called value). In particular, this
#	collects the equivalence classes in file1 together.
#	Subroutine equiv replaces the value of each line in
#	file0 by the index of the first element of its 
#	matching equivalence in (the reordered) file1.
#	To save space equiv squeezes file1 into a single
#	array member in which the equivalence classes
#	are simply concatenated, except that their first
#	members are flagged by changing sign.
#
#	Next the indices that point into member are unsorted into
#	array class according to the original order of file0.
#
#	The cleverness lies in routine stone. This marches
#	through the lines of file0, developing a vector klist
#	of "k-candidates". At step i a k-candidate is a matched
#	pair of lines x,y (x in file0 y in file1) such that
#	there is a common subsequence of lenght k
#	between the first i lines of file0 and the first y 
#	lines of file1, but there is no such subsequence for
#	any smaller y. x is the earliest possible mate to y
#	that occurs in such a subsequence.
#
#	Whenever any of the members of the equivalence class of
#	lines in file1 matable to a line in file0 has serial number 
#	less than the y of some k-candidate, that k-candidate 
#	with the smallest such y is replaced. The new 
#	k-candidate is chained (via pred) to the current
#	k-1 candidate so that the actual subsequence can
#	be recovered. When a member has serial number greater
#	that the y of all k-candidates, the klist is extended.
#	At the end, the longest subsequence is pulled out
#	and placed in the array J by unravel.
#
#	With J in hand, the matches there recorded are
#	check'ed against reality to assure that no spurious
#	matches have crept in due to hashing. If they have,
#	they are broken, and "jackpot " is recorded--a harmless
#	matter except that a true match for a spuriously
#	mated line may now be unnecessarily reported as a change.
#
#	Much of the complexity of the program comes simply
#	from trying to minimize core utilization and
#	maximize the range of doable problems by dynamically
#	allocating what is needed and reusing what is not.
#	The core requirements for problems larger than somewhat
#	are (in words) 2*length(file0) + length(file1) +
#	3*(number of k-candidates installed),  typically about
#	6n words for files of length n. 
#
# 

include "sys.m";
	sys: Sys;

include "bufio.m";
	bufio : Bufio;
Iobuf : import bufio;

include "draw.m";
	draw: Draw;
include "readdir.m";
	readdir : Readdir;
include "string.m";
	str : String;
include "arg.m";

Diff : module  
{
	init: fn(ctxt: ref Draw->Context, argv: list of string);
};

stderr: ref Sys->FD;

mode : int;			# '\0', 'e', 'f', 'h' 
bflag : int;			# ignore multiple and trailing blanks 
rflag : int;			# recurse down directory trees 
mflag : int;			# pseudo flag: doing multiple files, one dir

REG,
BIN: con iota;

HALFINT : con 16;
Usage : con  "usage: diff [ -efbwr ] file1 ... file2";

cand : adt {
	x : int;
	y : int;
	pred : int;
};

line : adt {
	serial : int;
	value : int;
};

out : ref Iobuf;
file := array[2] of array of line;
sfile := array[2] of array of line;	# shortened by pruning common prefix and suffix
slen := array[2] of int;
ilen := array[2] of int;
pref, suff, clen : int;			# length of prefix and suffix
firstchange : int;
clist : array of cand;			# merely a free storage pot for candidates
J : array of int;			# will be overlaid on class
ixold, ixnew : array of int;
input := array[2] of ref Iobuf ;
file1, file2 : string;
tmpname := array[] of {"/tmp/diff1", "/tmp/diff2"};
whichtmp : int;
anychange := 0;

init(nil: ref Draw->Context, args: list of string)
{
	sys = load Sys Sys->PATH;
	stderr = sys->fildes(2);
	draw = load Draw Draw->PATH;
	bufio = load Bufio Bufio->PATH;
	readdir = load Readdir Readdir->PATH;	
	str = load String String->PATH;
	if (bufio==nil)
		fatal(sys->sprint("cannot load %s: %r", Bufio->PATH));
	if (readdir==nil)
		fatal(sys->sprint("cannot load %s: %r", Readdir->PATH));
	if (str==nil)
		fatal(sys->sprint("cannot load %s: %r", String->PATH));
	arg := load Arg Arg->PATH;
	if (arg==nil)
		fatal(sys->sprint("cannot load %s: %r", Arg->PATH));
	fsb, tsb : Sys->Dir;
	arg->init(args);
	while((o := arg->opt()) != 0)
		case o {
		'e' or 'f' =>
			mode = o;
		'w' =>
			bflag = 2;
		'b' =>
			bflag = 1;
		'r' =>
			rflag = 1;
		'm' =>
			mflag = 1;
		* =>
			fatal(Usage);
		}
	tmp := arg->argv();
	arg = nil;
	j := len tmp;
	if (j < 2)
		fatal(Usage);
	arr := array[j] of string;
	for(i:=0;i<j;i++){
		arr[i]= hd tmp;
		tmp = tl tmp;
	}

	(i,tsb)=sys->stat(arr[j-1]);
	if (i == -1)
		fatal(sys->sprint("can't stat %s: %r", arr[j-1]));
	if (j > 2) {
		if (!(tsb.qid.qtype&Sys->QTDIR))
			fatal(Usage);
		mflag = 1;
	}
	else {
		(i,fsb)=sys->stat(arr[0]);
		if (i == -1)
			fatal(sys->sprint("can't stat %s: %r", arr[0]));
		if ((fsb.qid.qtype&Sys->QTDIR) && (tsb.qid.qtype&Sys->QTDIR))
			mflag = 1;
	}
	out=bufio->fopen(sys->fildes(1),Bufio->OWRITE);
	for (i = 0; i < j-1; i++) {
		diff(arr[i], arr[j-1], 0);
		rmtmpfiles();
	}
	rmtmpfiles();
	out.flush();
	if (anychange)
		raise "fail:some";
}

############################# diffreg from here ....

# shellsort CACM #201

sort(a : array of line, n : int)
{
	w : line;
	m := 0;
	for (i := 1; i <= n; i *= 2)
		m = 2*i - 1;
	for (m /= 2; m != 0; m /= 2) {
		for (j := 1; j <= n-m ; j++) {
			ai:=j;
			aim:=j+m;
			do {
				if (a[aim].value > a[ai].value ||
				   a[aim].value == a[ai].value &&
				   a[aim].serial > a[ai].serial)
					break;
				w = a[ai];
				a[ai] = a[aim];
				a[aim] = w;
				aim=ai;
				ai-=m;
			} while (ai > 0 && aim >= ai);
		}
	}
}

unsort(f : array of line, l : int) : array of int 
{
	i : int;
	a := array[l+1] of int;
	for(i=1;i<=l;i++)
		a[f[i].serial] = f[i].value;
	return a;
}

prune() 
{
	for(pref=0;pref< ilen[0]&&pref< ilen[1]&&
		file[0][pref+1].value==file[1][pref+1].value;
		pref++ ) ;
	for(suff=0;suff< ilen[0]-pref&&suff< ilen[1]-pref&&
		file[0][ilen[0]-suff].value==file[1][ilen[1]-suff].value;
		suff++) ;
	for(j:=0;j<2;j++) {
		sfile[j] = file[j][pref:];
		slen[j]= ilen[j]-pref-suff;
		for(i:=0;i<=slen[j];i++)
			sfile[j][i].serial = i;
	}
}

equiv(a: array of line, n:int , b: array of line, m: int, c : array of int) 
{
	i := 1;
	j := 1;
	while(i<=n && j<=m) {
		if(a[i].value < b[j].value)
			a[i++].value = 0;
		else if(a[i].value == b[j].value)
			a[i++].value = j;
		else
			j++;
	}
	while(i <= n)
		a[i++].value = 0;
	b[m+1].value = 0; # huh ?
	j = 1;
	while(j <= m) {
		c[j] = -b[j].serial;
		while(b[j+1].value == b[j].value) {
			j++;
			c[j] = b[j].serial;
		}
		j++;
	}
	c[j] = -1;
}

newcand(x, y, pred : int) : int 
{
	if (clen==len clist){
		q := array[clen*2] of cand;
		q[0:]=clist;
		clist= array[clen*2] of cand;
		clist[0:]=q;
		q=nil;
	}
	clist[clen].x=x;
	clist[clen].y=y;
	clist[clen].pred=pred;
	return clen++;
}

search(c : array of int, k,y : int) : int 
{
	if(clist[c[k]].y < y)	# quick look for typical case
		return k+1;
	i := 0;
	j := k+1;
	while((l:=(i+j)/2) > i) {
		t := clist[c[l]].y;
		if(t > y)
			j = l;
		else if(t < y)
			i = l;
		else
			return l;
	}
	return l+1;
}

stone(a : array of int ,n : int, b: array of int , c : array of int) : int 
{
	oldc, oldl, tc, l ,y : int;
	k := 0;
	c[0] = newcand(0,0,0);
	for(i:=1; i<=n; i++) {
		j := a[i];
		if(j==0)
			continue;
		y = -b[j];
		oldl = 0;
		oldc = c[0];
		do {
			if(y <= clist[oldc].y)
				continue;
			l = search(c, k, y);
			if(l!=oldl+1)
				oldc = c[l-1];
			if(l<=k) {
				if(clist[c[l]].y <= y)
					continue;
				tc = c[l];
				c[l] = newcand(i,y,oldc);
				oldc = tc;
				oldl = l;
			} else {
				c[l] = newcand(i,y,oldc);
				k++;
				break;
			}
		} while((y=b[j+=1]) > 0);
	}
	return k;
}

unravel(p : int) 
{
	for(i:=0; i<=ilen[0]; i++) {
		if (i <= pref)
			J[i] = i;
		else if (i > ilen[0]-suff)
			J[i] = i+ ilen[1]-ilen[0];
		else
			J[i] = 0;
	}
	for(q:=clist[p];q.y!=0;q=clist[q.pred])
		J[q.x+pref] = q.y+pref;
}

output() 
{
	i1: int;
	m := ilen[0];
	J[0] = 0;
	J[m+1] = ilen[1]+1;
	if (mode != 'e') {
		for (i0 := 1; i0 <= m; i0 = i1+1) {
			while (i0 <= m && J[i0] == J[i0-1]+1)
				i0++;
			j0 := J[i0-1]+1;
			i1 = i0-1;
			while (i1 < m && J[i1+1] == 0)
				i1++;
			j1 := J[i1+1]-1;
			J[i1] = j1;
			change(i0, i1, j0, j1);
		}
	}
	else {
		for (i0 := m; i0 >= 1; i0 = i1-1) {
			while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0])
				i0--;
			j0 := J[i0+1]-1;
			i1 = i0+1;
			while (i1 > 1 && J[i1-1] == 0)
				i1--;
			j1 := J[i1-1]+1;
			J[i1] = j1;
			change(i1 , i0, j1, j0);
		}
	}
	if (m == 0)
		change(1, 0, 1, ilen[1]);
	out.flush();
}

diffreg(f,t : string) 
{
	k : int;

	(b0, b0type) := prepare(0, f);
	if (b0==nil)
		return;
	(b1, b1type) := prepare(1, t);
	if (b1==nil) {
		b0=nil;
		return;
	}
	if (b0type == BIN || b1type == BIN) {
		if (cmp(b0, b1)) {
			out.puts(sys->sprint("Binary files %s %s differ\n", f, t));
			anychange = 1;
		}
		b0 = nil;
		b1 = nil;
		return;
	}
	clen=0;
	prune();
	file[0]=nil;
	file[1]=nil;
	sort(sfile[0],slen[0]);
	sort(sfile[1],slen[1]);
	member := array[slen[1]+2] of int;
	equiv(sfile[0], slen[0],sfile[1],slen[1], member);
	class:=unsort(sfile[0],slen[0]);
	sfile[0]=nil;
	sfile[1]=nil;
	klist := array[slen[0]+2] of int;
	clist = array[1] of cand;
	k = stone(class, slen[0], member, klist);
	J = array[ilen[0]+2] of int;
	unravel(klist[k]);
	clist=nil;
	klist=nil;
	class=nil;
	member=nil;
	ixold = array[ilen[0]+2] of int;
	ixnew = array[ilen[1]+2] of int;

	b0.seek(big 0, 0); 
	b1.seek(big 0, 0);
	check(b0, b1);
	output();
	ixold=nil;
	ixnew=nil;
	b0=nil; 
	b1=nil;			
}

######################## diffio starts here...


# hashing has the effect of
# arranging line in 7-bit bytes and then
# summing 1-s complement in 16-bit hunks 

readhash(bp : ref Iobuf) : int 
{
	sum := 1;
	shift := 0;
	buf := bp.gets('\n');
	if (buf == nil)
		return 0;
	buf = buf[0:len buf -1];
	p := 0;
	case bflag {
		# various types of white space handling 
		0 =>
			while (p< len buf) {
				sum += (buf[p] << (shift &= (HALFINT-1)));
				p++;
				shift += 7;
			}
		1 =>
			
			 # coalesce multiple white-space
			 
			for (space := 0; p< len buf; p++) {
				if (buf[p]==' ' || buf[p]=='\t') {
					space++;
					continue;
				}
				if (space) {
					shift += 7;
					space = 0;
				}
				sum +=  (buf[p] << (shift &= (HALFINT-1)));
				p++;
				shift += 7;
			}
		* =>
			
			 # strip all white-space
			 
			while (p< len buf) {
				if (buf[p]==' ' || buf[p]=='\t') {
					p++;
					continue;
				}
				sum +=  (buf[p] << (shift &= (HALFINT-1)));
				p++;
				shift += 7;
			}
	}
	return sum;
}

prepare(i : int, arg : string) : (ref Iobuf, int)
{
	h : int;
	bp := bufio->open(arg,Bufio->OREAD);
	if (bp==nil) {
		error(sys->sprint("cannot open %s: %r", arg));
		return (nil, 0);
	}
	buf := array[1024] of byte;
	n :=bp.read(buf, len buf);
	str1 := string buf[0:n];
	for (j:=0;j<len str1 -2;j++)
		if (str1[j] == Sys->UTFerror)
			return (bp, BIN);
	bp.seek(big 0, Sys->SEEKSTART);
	p := array[4] of line;
	for (j = 0; h = readhash(bp); p[j].value = h){
		j++;
		if (j+3>=len p){
			newp:=array[len p*2] of line;
			newp[0:]=p[0:];
			p=array[len p*2] of line;
			p=newp;
			newp=nil;
		}
	}
	ilen[i]=j;
	file[i] = p;
	input[i] = bp;			
	if (i == 0) {			
		file1 = arg;
		firstchange = 0;
	}
	else
		file2 = arg;
	return (bp, REG);
}

squishspace(buf : string) : string 
{
	q:=0;
	p:=0;
	for (space := 0; q<len buf; q++) {
		if (buf[q]==' ' || buf[q]=='\t') {
			space++;
			continue;
		}
		if (space && bflag == 1) {
			buf[p] = ' ';
			p++;
			space = 0;
		}
		buf[p]=buf[q];
		p++;
	}
	buf=buf[0:p];
	return buf;
}


# need to fix up for unexpected EOF's

ftell(b: ref Iobuf): int
{
	return int b.offset();
}

check(bf, bt : ref Iobuf) 
{
	fbuf, tbuf : string;
	f:=1;
	t:=1;
	ixold[0] = ixnew[0] = 0;
	for (; f < ilen[0]; f++) {
		fbuf = bf.gets('\n');
		if (fbuf!=nil)
			fbuf=fbuf[0:len fbuf -1];
		ixold[f] = ftell(bf);
		if (J[f] == 0)
			continue;
		tbuflen: int;
		do {
			tbuf = bt.gets('\n');
			if (tbuf!=nil)
				tbuf=tbuf[0:len tbuf -1];
			tbuflen = len array of byte tbuf;
			ixnew[t] = ftell(bt);
		} while (t++ < J[f]);
		if (bflag) {
			fbuf = squishspace(fbuf);
			tbuf = squishspace(tbuf);
		}
		if (len fbuf != len tbuf || fbuf!=tbuf)
			J[f] = 0;
	}
	while (t < ilen[1]) {
		tbuf = bt.gets('\n');
		if (tbuf!=nil)
			tbuf=tbuf[0:len tbuf -1];
		ixnew[t] = ftell(bt);
		t++;
	}
}

range(a, b : int, separator : string) 
{
	if (a>b)
		out.puts(sys->sprint("%d", b));
	else
		out.puts(sys->sprint("%d", a));
	if (a < b)
		out.puts(sys->sprint("%s%d", separator, b));
}

fetch(f : array of int, a,b : int , bp : ref Iobuf, s : string) 
{
	buf : string;
	bp.seek(big f[a-1], 0);
	while (a++ <= b) {
		buf=bp.gets('\n');
		out.puts(s);
		out.puts(buf);
	}
}

change(a, b, c, d : int) 
{
	if (a > b && c > d)
		return;
	anychange = 1;
	if (mflag && firstchange == 0) {
		out.puts(sys->sprint( "diff %s %s\n", file1, file2));
		firstchange = 1;
	}
	if (mode != 'f') {
		range(a, b, ",");
		if (a>b)
			out.putc('a');
		else if (c>d)
			out.putc('d');
		else
			out.putc('c');
		if (mode != 'e')
			range(c, d, ",");
	}
	else {
		if (a>b)
			out.putc('a');
		else if (c>d)
			out.putc('d');
		else
			out.putc('c');
		range(a, b, " ");
	}
	out.putc('\n');
	if (mode == 0) {
		fetch(ixold, a, b, input[0], "< ");
		if (a <= b && c <= d)
			out.puts("---\n");
	}
	if (mode==0)
		fetch(ixnew, c, d, input[1], "> ");
	else
		fetch(ixnew, c, d, input[1], "");

	if (mode != 0 && c <= d)
		out.puts(".\n");
}


######################### diffdir starts here ......

scandir(name : string) : array of string 
{
	(db,nitems):= readdir->init(name,Readdir->NAME);
	cp := array[nitems] of string;
	for(i:=0;i<nitems;i++)
		cp[i]=db[i].name;
	return cp;
}


diffdir(f, t : string, level : int) 
{
	df, dt : array of string;
	fb, tb : string;
	i:=0;
	j:=0;
	df = scandir(f);
	dt = scandir(t);
	while ((i<len df) || (j<len dt)) {
		if ((j==len dt) || (i<len df && df[i] < dt[j])) {
			if (mode == 0)
				out.puts(sys->sprint("Only in %s: %s\n", f, df[i]));
			i++;
			continue;
		}
		if ((i==len df) || (j<len dt && df[i] > dt[j])) {
			if (mode == 0)
				out.puts(sys->sprint("Only in %s: %s\n", t, dt[j]));
			j++;
			continue;
		}
		fb=sys->sprint("%s/%s", f, df[i]);
		tb=sys->sprint("%s/%s", t, dt[j]);		
		diff(fb, tb, level+1);
		i++; j++;
	}
}

cmp(b0, b1: ref Iobuf): int
{
	b0.seek(big 0, Sys->SEEKSTART);
	b1.seek(big 0, Sys->SEEKSTART);
	buf0 := array[1024] of byte;
	buf1 := array[1024] of byte;
	for (;;) {
		n0 := b0.read(buf0, len buf0);
		n1 := b1.read(buf1, len buf1);

		if (n0 != n1)
			return 1;

		if (n0 == 0)
			return 0;

		for (i := 0; i < n0; i++) 
			if (buf0[i] != buf1[i])
				return 1;
	}
}

################## main from here.....

REGULAR_FILE(s : Sys->Dir) : int 
{
	# both pipes and networks contain non-zero-length files
	# which are not seekable.
	return (s.qid.qtype&Sys->QTDIR) == 0 &&
		s.dtype != '|' &&
		s.dtype != 'I';
#		&& s.length > 0;	device files have zero length. 
}

rmtmpfiles() 
{
	while (whichtmp > 0) {
		whichtmp--;
		sys->remove(tmpname[whichtmp]);
	}
}

mktmpfile(inputf : ref Sys->FD) : (string, Sys->Dir) 
{
	i, j : int;
	sb : Sys->Dir;
	p : string;
	buf := array[8192] of byte;

	p = tmpname[whichtmp++];
	fd := sys->create(p, Sys->OWRITE, 8r600);
	if (fd == nil) {
		error(sys->sprint("cannot create %s: %r", p));
		return (nil, sb);
	}
	while ((i = sys->read(inputf, buf, len buf)) > 0) {
		if ((i = sys->write(fd, buf, i)) < 0)
			break;
	}
	(j,sb)=sys->fstat(fd);
	if (i < 0 || j < 0) {
		error(sys->sprint("cannot read/write %s: %r", p));
		return (nil, sb);
	}
	return (p, sb);
}


statfile(file : string) : (string,Sys->Dir) 
{
	(ret,sb):=sys->stat(file);
	if (ret==-1) {
		if (file != "-" || sys->fstat(sys->fildes(0)).t0 == -1){
			error(sys->sprint("cannot stat %s: %r", file));
			return (nil,sb);
		}
		(file, sb) = mktmpfile(sys->fildes(0));
	}
	else if (!REGULAR_FILE(sb) && !(sb.qid.qtype&Sys->QTDIR)) {
		if ((i := sys->open(file, Sys->OREAD)) == nil) {
			error(sys->sprint("cannot open %s: %r", file));
			return (nil, sb);
		}
		(file, sb) = mktmpfile(i);
	}
	return (file,sb);
}

diff(f, t : string, level : int) 
{
	fp,tp,p,rest,fb,tb : string;
	fsb, tsb : Sys->Dir;
	(fp,fsb) = statfile(f);
	if (fp == nil)
		return;
	(tp,tsb) = statfile(t);
	if (tp == nil)
		return;
	if ((fsb.qid.qtype&Sys->QTDIR) && (tsb.qid.qtype&Sys->QTDIR)) {
		if (rflag || level == 0)
			diffdir(fp, tp, level);
		else
			out.puts(sys->sprint("Common subdirectories: %s and %s\n", fp, tp));
	}
	else if (REGULAR_FILE(fsb) && REGULAR_FILE(tsb)){
		diffreg(fp, tp);
	} else {
		if (!(fsb.qid.qtype&Sys->QTDIR)) {
			(p,rest)=str->splitr(f,"/");
			if (rest!=nil)
				p = rest;
			tb=sys->sprint("%s/%s", tp, p);			
			diffreg(fp, tb);
		}
		else {
			(p,rest)=str->splitr(t,"/");
			if (rest!=nil)
				p = rest;
			fb=sys->sprint("%s/%s", fp, p);			
			diffreg(fb, tp);
		}
	}
}

fatal(s: string)
{
	sys->fprint(stderr, "diff: %s\n", s);
	raise "fail:error";
}

error(s: string)
{
	sys->fprint(stderr, "diff: %s\n", s);
}