ref: babf901b4a508c3ec5d1f89655f10377bbdf9637
dir: /appl/cmd/diff.b/
implement Diff;
# diff - differential file comparison
#
# Uses an algorithm due to Harold Stone, which finds
# a pair of longest identical subsequences in the two
# files.
#
# The major goal is to generate the match vector J.
# J[i] is the index of the line in file1 corresponding
# to line i file0. J[i] = 0 if there is no
# such line in file1.
#
# Lines are hashed so as to work in core. All potential
# matches are located by sorting the lines of each file
# on the hash (called value). In particular, this
# collects the equivalence classes in file1 together.
# Subroutine equiv replaces the value of each line in
# file0 by the index of the first element of its
# matching equivalence in (the reordered) file1.
# To save space equiv squeezes file1 into a single
# array member in which the equivalence classes
# are simply concatenated, except that their first
# members are flagged by changing sign.
#
# Next the indices that point into member are unsorted into
# array class according to the original order of file0.
#
# The cleverness lies in routine stone. This marches
# through the lines of file0, developing a vector klist
# of "k-candidates". At step i a k-candidate is a matched
# pair of lines x,y (x in file0 y in file1) such that
# there is a common subsequence of lenght k
# between the first i lines of file0 and the first y
# lines of file1, but there is no such subsequence for
# any smaller y. x is the earliest possible mate to y
# that occurs in such a subsequence.
#
# Whenever any of the members of the equivalence class of
# lines in file1 matable to a line in file0 has serial number
# less than the y of some k-candidate, that k-candidate
# with the smallest such y is replaced. The new
# k-candidate is chained (via pred) to the current
# k-1 candidate so that the actual subsequence can
# be recovered. When a member has serial number greater
# that the y of all k-candidates, the klist is extended.
# At the end, the longest subsequence is pulled out
# and placed in the array J by unravel.
#
# With J in hand, the matches there recorded are
# check'ed against reality to assure that no spurious
# matches have crept in due to hashing. If they have,
# they are broken, and "jackpot " is recorded--a harmless
# matter except that a true match for a spuriously
# mated line may now be unnecessarily reported as a change.
#
# Much of the complexity of the program comes simply
# from trying to minimize core utilization and
# maximize the range of doable problems by dynamically
# allocating what is needed and reusing what is not.
# The core requirements for problems larger than somewhat
# are (in words) 2*length(file0) + length(file1) +
# 3*(number of k-candidates installed), typically about
# 6n words for files of length n.
#
#
include "sys.m";
sys: Sys;
include "bufio.m";
bufio : Bufio;
Iobuf : import bufio;
include "draw.m";
draw: Draw;
include "readdir.m";
readdir : Readdir;
include "string.m";
str : String;
include "arg.m";
Diff : module
{
init: fn(ctxt: ref Draw->Context, argv: list of string);
};
stderr: ref Sys->FD;
mode : int; # '\0', 'e', 'f', 'h'
bflag : int; # ignore multiple and trailing blanks
rflag : int; # recurse down directory trees
mflag : int; # pseudo flag: doing multiple files, one dir
REG,
BIN: con iota;
HALFINT : con 16;
Usage : con "usage: diff [ -efbwr ] file1 ... file2";
cand : adt {
x : int;
y : int;
pred : int;
};
line : adt {
serial : int;
value : int;
};
out : ref Iobuf;
file := array[2] of array of line;
sfile := array[2] of array of line; # shortened by pruning common prefix and suffix
slen := array[2] of int;
ilen := array[2] of int;
pref, suff, clen : int; # length of prefix and suffix
firstchange : int;
clist : array of cand; # merely a free storage pot for candidates
J : array of int; # will be overlaid on class
ixold, ixnew : array of int;
input := array[2] of ref Iobuf ;
file1, file2 : string;
tmpname := array[] of {"/tmp/diff1", "/tmp/diff2"};
whichtmp : int;
anychange := 0;
init(nil: ref Draw->Context, args: list of string)
{
sys = load Sys Sys->PATH;
stderr = sys->fildes(2);
draw = load Draw Draw->PATH;
bufio = load Bufio Bufio->PATH;
readdir = load Readdir Readdir->PATH;
str = load String String->PATH;
if (bufio==nil)
fatal(sys->sprint("cannot load %s: %r", Bufio->PATH));
if (readdir==nil)
fatal(sys->sprint("cannot load %s: %r", Readdir->PATH));
if (str==nil)
fatal(sys->sprint("cannot load %s: %r", String->PATH));
arg := load Arg Arg->PATH;
if (arg==nil)
fatal(sys->sprint("cannot load %s: %r", Arg->PATH));
fsb, tsb : Sys->Dir;
arg->init(args);
while((o := arg->opt()) != 0)
case o {
'e' or 'f' =>
mode = o;
'w' =>
bflag = 2;
'b' =>
bflag = 1;
'r' =>
rflag = 1;
'm' =>
mflag = 1;
* =>
fatal(Usage);
}
tmp := arg->argv();
arg = nil;
j := len tmp;
if (j < 2)
fatal(Usage);
arr := array[j] of string;
for(i:=0;i<j;i++){
arr[i]= hd tmp;
tmp = tl tmp;
}
(i,tsb)=sys->stat(arr[j-1]);
if (i == -1)
fatal(sys->sprint("can't stat %s: %r", arr[j-1]));
if (j > 2) {
if (!(tsb.qid.qtype&Sys->QTDIR))
fatal(Usage);
mflag = 1;
}
else {
(i,fsb)=sys->stat(arr[0]);
if (i == -1)
fatal(sys->sprint("can't stat %s: %r", arr[0]));
if ((fsb.qid.qtype&Sys->QTDIR) && (tsb.qid.qtype&Sys->QTDIR))
mflag = 1;
}
out=bufio->fopen(sys->fildes(1),Bufio->OWRITE);
for (i = 0; i < j-1; i++) {
diff(arr[i], arr[j-1], 0);
rmtmpfiles();
}
rmtmpfiles();
out.flush();
if (anychange)
raise "fail:some";
}
############################# diffreg from here ....
# shellsort CACM #201
sort(a : array of line, n : int)
{
w : line;
m := 0;
for (i := 1; i <= n; i *= 2)
m = 2*i - 1;
for (m /= 2; m != 0; m /= 2) {
for (j := 1; j <= n-m ; j++) {
ai:=j;
aim:=j+m;
do {
if (a[aim].value > a[ai].value ||
a[aim].value == a[ai].value &&
a[aim].serial > a[ai].serial)
break;
w = a[ai];
a[ai] = a[aim];
a[aim] = w;
aim=ai;
ai-=m;
} while (ai > 0 && aim >= ai);
}
}
}
unsort(f : array of line, l : int) : array of int
{
i : int;
a := array[l+1] of int;
for(i=1;i<=l;i++)
a[f[i].serial] = f[i].value;
return a;
}
prune()
{
for(pref=0;pref< ilen[0]&&pref< ilen[1]&&
file[0][pref+1].value==file[1][pref+1].value;
pref++ ) ;
for(suff=0;suff< ilen[0]-pref&&suff< ilen[1]-pref&&
file[0][ilen[0]-suff].value==file[1][ilen[1]-suff].value;
suff++) ;
for(j:=0;j<2;j++) {
sfile[j] = file[j][pref:];
slen[j]= ilen[j]-pref-suff;
for(i:=0;i<=slen[j];i++)
sfile[j][i].serial = i;
}
}
equiv(a: array of line, n:int , b: array of line, m: int, c : array of int)
{
i := 1;
j := 1;
while(i<=n && j<=m) {
if(a[i].value < b[j].value)
a[i++].value = 0;
else if(a[i].value == b[j].value)
a[i++].value = j;
else
j++;
}
while(i <= n)
a[i++].value = 0;
b[m+1].value = 0; # huh ?
j = 1;
while(j <= m) {
c[j] = -b[j].serial;
while(b[j+1].value == b[j].value) {
j++;
c[j] = b[j].serial;
}
j++;
}
c[j] = -1;
}
newcand(x, y, pred : int) : int
{
if (clen==len clist){
q := array[clen*2] of cand;
q[0:]=clist;
clist= array[clen*2] of cand;
clist[0:]=q;
q=nil;
}
clist[clen].x=x;
clist[clen].y=y;
clist[clen].pred=pred;
return clen++;
}
search(c : array of int, k,y : int) : int
{
if(clist[c[k]].y < y) # quick look for typical case
return k+1;
i := 0;
j := k+1;
while((l:=(i+j)/2) > i) {
t := clist[c[l]].y;
if(t > y)
j = l;
else if(t < y)
i = l;
else
return l;
}
return l+1;
}
stone(a : array of int ,n : int, b: array of int , c : array of int) : int
{
oldc, oldl, tc, l ,y : int;
k := 0;
c[0] = newcand(0,0,0);
for(i:=1; i<=n; i++) {
j := a[i];
if(j==0)
continue;
y = -b[j];
oldl = 0;
oldc = c[0];
do {
if(y <= clist[oldc].y)
continue;
l = search(c, k, y);
if(l!=oldl+1)
oldc = c[l-1];
if(l<=k) {
if(clist[c[l]].y <= y)
continue;
tc = c[l];
c[l] = newcand(i,y,oldc);
oldc = tc;
oldl = l;
} else {
c[l] = newcand(i,y,oldc);
k++;
break;
}
} while((y=b[j+=1]) > 0);
}
return k;
}
unravel(p : int)
{
for(i:=0; i<=ilen[0]; i++) {
if (i <= pref)
J[i] = i;
else if (i > ilen[0]-suff)
J[i] = i+ ilen[1]-ilen[0];
else
J[i] = 0;
}
for(q:=clist[p];q.y!=0;q=clist[q.pred])
J[q.x+pref] = q.y+pref;
}
output()
{
i1: int;
m := ilen[0];
J[0] = 0;
J[m+1] = ilen[1]+1;
if (mode != 'e') {
for (i0 := 1; i0 <= m; i0 = i1+1) {
while (i0 <= m && J[i0] == J[i0-1]+1)
i0++;
j0 := J[i0-1]+1;
i1 = i0-1;
while (i1 < m && J[i1+1] == 0)
i1++;
j1 := J[i1+1]-1;
J[i1] = j1;
change(i0, i1, j0, j1);
}
}
else {
for (i0 := m; i0 >= 1; i0 = i1-1) {
while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0])
i0--;
j0 := J[i0+1]-1;
i1 = i0+1;
while (i1 > 1 && J[i1-1] == 0)
i1--;
j1 := J[i1-1]+1;
J[i1] = j1;
change(i1 , i0, j1, j0);
}
}
if (m == 0)
change(1, 0, 1, ilen[1]);
out.flush();
}
diffreg(f,t : string)
{
k : int;
(b0, b0type) := prepare(0, f);
if (b0==nil)
return;
(b1, b1type) := prepare(1, t);
if (b1==nil) {
b0=nil;
return;
}
if (b0type == BIN || b1type == BIN) {
if (cmp(b0, b1)) {
out.puts(sys->sprint("Binary files %s %s differ\n", f, t));
anychange = 1;
}
b0 = nil;
b1 = nil;
return;
}
clen=0;
prune();
file[0]=nil;
file[1]=nil;
sort(sfile[0],slen[0]);
sort(sfile[1],slen[1]);
member := array[slen[1]+2] of int;
equiv(sfile[0], slen[0],sfile[1],slen[1], member);
class:=unsort(sfile[0],slen[0]);
sfile[0]=nil;
sfile[1]=nil;
klist := array[slen[0]+2] of int;
clist = array[1] of cand;
k = stone(class, slen[0], member, klist);
J = array[ilen[0]+2] of int;
unravel(klist[k]);
clist=nil;
klist=nil;
class=nil;
member=nil;
ixold = array[ilen[0]+2] of int;
ixnew = array[ilen[1]+2] of int;
b0.seek(big 0, 0);
b1.seek(big 0, 0);
check(b0, b1);
output();
ixold=nil;
ixnew=nil;
b0=nil;
b1=nil;
}
######################## diffio starts here...
# hashing has the effect of
# arranging line in 7-bit bytes and then
# summing 1-s complement in 16-bit hunks
readhash(bp : ref Iobuf) : int
{
sum := 1;
shift := 0;
buf := bp.gets('\n');
if (buf == nil)
return 0;
buf = buf[0:len buf -1];
p := 0;
case bflag {
# various types of white space handling
0 =>
while (p< len buf) {
sum += (buf[p] << (shift &= (HALFINT-1)));
p++;
shift += 7;
}
1 =>
# coalesce multiple white-space
for (space := 0; p< len buf; p++) {
if (buf[p]==' ' || buf[p]=='\t') {
space++;
continue;
}
if (space) {
shift += 7;
space = 0;
}
sum += (buf[p] << (shift &= (HALFINT-1)));
p++;
shift += 7;
}
* =>
# strip all white-space
while (p< len buf) {
if (buf[p]==' ' || buf[p]=='\t') {
p++;
continue;
}
sum += (buf[p] << (shift &= (HALFINT-1)));
p++;
shift += 7;
}
}
return sum;
}
prepare(i : int, arg : string) : (ref Iobuf, int)
{
h : int;
bp := bufio->open(arg,Bufio->OREAD);
if (bp==nil) {
error(sys->sprint("cannot open %s: %r", arg));
return (nil, 0);
}
buf := array[1024] of byte;
n :=bp.read(buf, len buf);
str1 := string buf[0:n];
for (j:=0;j<len str1 -2;j++)
if (str1[j] == Sys->UTFerror)
return (bp, BIN);
bp.seek(big 0, Sys->SEEKSTART);
p := array[4] of line;
for (j = 0; h = readhash(bp); p[j].value = h){
j++;
if (j+3>=len p){
newp:=array[len p*2] of line;
newp[0:]=p[0:];
p=array[len p*2] of line;
p=newp;
newp=nil;
}
}
ilen[i]=j;
file[i] = p;
input[i] = bp;
if (i == 0) {
file1 = arg;
firstchange = 0;
}
else
file2 = arg;
return (bp, REG);
}
squishspace(buf : string) : string
{
q:=0;
p:=0;
for (space := 0; q<len buf; q++) {
if (buf[q]==' ' || buf[q]=='\t') {
space++;
continue;
}
if (space && bflag == 1) {
buf[p] = ' ';
p++;
space = 0;
}
buf[p]=buf[q];
p++;
}
buf=buf[0:p];
return buf;
}
# need to fix up for unexpected EOF's
ftell(b: ref Iobuf): int
{
return int b.offset();
}
check(bf, bt : ref Iobuf)
{
fbuf, tbuf : string;
f:=1;
t:=1;
ixold[0] = ixnew[0] = 0;
for (; f < ilen[0]; f++) {
fbuf = bf.gets('\n');
if (fbuf!=nil)
fbuf=fbuf[0:len fbuf -1];
ixold[f] = ftell(bf);
if (J[f] == 0)
continue;
tbuflen: int;
do {
tbuf = bt.gets('\n');
if (tbuf!=nil)
tbuf=tbuf[0:len tbuf -1];
tbuflen = len array of byte tbuf;
ixnew[t] = ftell(bt);
} while (t++ < J[f]);
if (bflag) {
fbuf = squishspace(fbuf);
tbuf = squishspace(tbuf);
}
if (len fbuf != len tbuf || fbuf!=tbuf)
J[f] = 0;
}
while (t < ilen[1]) {
tbuf = bt.gets('\n');
if (tbuf!=nil)
tbuf=tbuf[0:len tbuf -1];
ixnew[t] = ftell(bt);
t++;
}
}
range(a, b : int, separator : string)
{
if (a>b)
out.puts(sys->sprint("%d", b));
else
out.puts(sys->sprint("%d", a));
if (a < b)
out.puts(sys->sprint("%s%d", separator, b));
}
fetch(f : array of int, a,b : int , bp : ref Iobuf, s : string)
{
buf : string;
bp.seek(big f[a-1], 0);
while (a++ <= b) {
buf=bp.gets('\n');
out.puts(s);
out.puts(buf);
}
}
change(a, b, c, d : int)
{
if (a > b && c > d)
return;
anychange = 1;
if (mflag && firstchange == 0) {
out.puts(sys->sprint( "diff %s %s\n", file1, file2));
firstchange = 1;
}
if (mode != 'f') {
range(a, b, ",");
if (a>b)
out.putc('a');
else if (c>d)
out.putc('d');
else
out.putc('c');
if (mode != 'e')
range(c, d, ",");
}
else {
if (a>b)
out.putc('a');
else if (c>d)
out.putc('d');
else
out.putc('c');
range(a, b, " ");
}
out.putc('\n');
if (mode == 0) {
fetch(ixold, a, b, input[0], "< ");
if (a <= b && c <= d)
out.puts("---\n");
}
if (mode==0)
fetch(ixnew, c, d, input[1], "> ");
else
fetch(ixnew, c, d, input[1], "");
if (mode != 0 && c <= d)
out.puts(".\n");
}
######################### diffdir starts here ......
scandir(name : string) : array of string
{
(db,nitems):= readdir->init(name,Readdir->NAME);
cp := array[nitems] of string;
for(i:=0;i<nitems;i++)
cp[i]=db[i].name;
return cp;
}
diffdir(f, t : string, level : int)
{
df, dt : array of string;
fb, tb : string;
i:=0;
j:=0;
df = scandir(f);
dt = scandir(t);
while ((i<len df) || (j<len dt)) {
if ((j==len dt) || (i<len df && df[i] < dt[j])) {
if (mode == 0)
out.puts(sys->sprint("Only in %s: %s\n", f, df[i]));
i++;
continue;
}
if ((i==len df) || (j<len dt && df[i] > dt[j])) {
if (mode == 0)
out.puts(sys->sprint("Only in %s: %s\n", t, dt[j]));
j++;
continue;
}
fb=sys->sprint("%s/%s", f, df[i]);
tb=sys->sprint("%s/%s", t, dt[j]);
diff(fb, tb, level+1);
i++; j++;
}
}
cmp(b0, b1: ref Iobuf): int
{
b0.seek(big 0, Sys->SEEKSTART);
b1.seek(big 0, Sys->SEEKSTART);
buf0 := array[1024] of byte;
buf1 := array[1024] of byte;
for (;;) {
n0 := b0.read(buf0, len buf0);
n1 := b1.read(buf1, len buf1);
if (n0 != n1)
return 1;
if (n0 == 0)
return 0;
for (i := 0; i < n0; i++)
if (buf0[i] != buf1[i])
return 1;
}
}
################## main from here.....
REGULAR_FILE(s : Sys->Dir) : int
{
# both pipes and networks contain non-zero-length files
# which are not seekable.
return (s.qid.qtype&Sys->QTDIR) == 0 &&
s.dtype != '|' &&
s.dtype != 'I';
# && s.length > 0; device files have zero length.
}
rmtmpfiles()
{
while (whichtmp > 0) {
whichtmp--;
sys->remove(tmpname[whichtmp]);
}
}
mktmpfile(inputf : ref Sys->FD) : (string, Sys->Dir)
{
i, j : int;
sb : Sys->Dir;
p : string;
buf := array[8192] of byte;
p = tmpname[whichtmp++];
fd := sys->create(p, Sys->OWRITE, 8r600);
if (fd == nil) {
error(sys->sprint("cannot create %s: %r", p));
return (nil, sb);
}
while ((i = sys->read(inputf, buf, len buf)) > 0) {
if ((i = sys->write(fd, buf, i)) < 0)
break;
}
(j,sb)=sys->fstat(fd);
if (i < 0 || j < 0) {
error(sys->sprint("cannot read/write %s: %r", p));
return (nil, sb);
}
return (p, sb);
}
statfile(file : string) : (string,Sys->Dir)
{
(ret,sb):=sys->stat(file);
if (ret==-1) {
if (file != "-" || sys->fstat(sys->fildes(0)).t0 == -1){
error(sys->sprint("cannot stat %s: %r", file));
return (nil,sb);
}
(file, sb) = mktmpfile(sys->fildes(0));
}
else if (!REGULAR_FILE(sb) && !(sb.qid.qtype&Sys->QTDIR)) {
if ((i := sys->open(file, Sys->OREAD)) == nil) {
error(sys->sprint("cannot open %s: %r", file));
return (nil, sb);
}
(file, sb) = mktmpfile(i);
}
return (file,sb);
}
diff(f, t : string, level : int)
{
fp,tp,p,rest,fb,tb : string;
fsb, tsb : Sys->Dir;
(fp,fsb) = statfile(f);
if (fp == nil)
return;
(tp,tsb) = statfile(t);
if (tp == nil)
return;
if ((fsb.qid.qtype&Sys->QTDIR) && (tsb.qid.qtype&Sys->QTDIR)) {
if (rflag || level == 0)
diffdir(fp, tp, level);
else
out.puts(sys->sprint("Common subdirectories: %s and %s\n", fp, tp));
}
else if (REGULAR_FILE(fsb) && REGULAR_FILE(tsb)){
diffreg(fp, tp);
} else {
if (!(fsb.qid.qtype&Sys->QTDIR)) {
(p,rest)=str->splitr(f,"/");
if (rest!=nil)
p = rest;
tb=sys->sprint("%s/%s", tp, p);
diffreg(fp, tb);
}
else {
(p,rest)=str->splitr(t,"/");
if (rest!=nil)
p = rest;
fb=sys->sprint("%s/%s", fp, p);
diffreg(fb, tp);
}
}
}
fatal(s: string)
{
sys->fprint(stderr, "diff: %s\n", s);
raise "fail:error";
}
error(s: string)
{
sys->fprint(stderr, "diff: %s\n", s);
}