ref: 8cfd549d118fbec88f3cd8d7740e5b94fbcf9a2d
dir: /sys/src/cmd/aux/msexceltables.c/
/* msexceltables.c    Steve Simon    5-Jan-2005 */
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
enum {
	Tillegal = 0,
	Tnumber,		// cell types
	Tlabel,
	Tindex,
	Tbool,
	Terror,
	Ver8 = 0x600,		// only BIFF8 and BIFF8x files support unicode
	Nwidths = 4096,
};
	
	
typedef struct Biff Biff;
typedef struct Col Col;
typedef struct Row Row;
struct Row {
	Row *next;		// next row
	int r;			// row number
	Col *col;		// list of cols in row
};
struct Col {
	Col *next;		// next col in row
	int c;			// col number
	int f;			// index into formating table (Xf)
	int type;		// type of value for union below
	union {			// value
		int index;	// index into string table (Strtab)
		int error;
		int bool;
		char *label;
		double number;
	};
};
struct  Biff {
	Biobuf *bp;		// input file
	int op;			// current record type
	int len;		// length of current record
};
// options
static int Nopad = 0;		// disable padding cells to colum width
static int Trunc = 0;		// truncate cells to colum width
static int All = 0;		// dump all sheet types, Worksheets only by default
static char *Delim = " ";	// field delimiter
static char *Sheetrange = nil;	// range of sheets wanted
static char *Columnrange = nil;	// range of collums wanted
static int Debug = 0;
// file scope
static int Defwidth = 10;	// default colum width if non given
static int Biffver;		// file vesion
static int Datemode;		// date ref: 1899-Dec-31 or 1904-jan-1
static char **Strtab = nil;	// label contents heap
static int Nstrtab = 0;		// # of above
static int *Xf;			// array of extended format indices
static int Nxf = 0;		// # of above
static Biobuf *bo;		// stdout (sic)
static int Doquote = 1;		// quote text fields if they are rc(1) unfriendly
// table scope
static int Width[Nwidths];	// array of colum widths
static int Ncols = -1;		// max colums in table used
static int Content = 0;		// type code for contents of sheet
static Row *Root = nil;		// one worksheet's worth of cells
				
static char *Months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
	"Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
static char *Errmsgs[] = {
	[0x0]	"#NULL!",	// intersection of two cell ranges is empty
	[0x7]	"#DIV/0!",	// division by zero	
	[0xf]	"#VALUE!",	// wrong type of operand
	[0x17]	"#REF!",	// illegal or deleted cell reference
	[0x1d]	"#NAME?",	// wrong function or range name
	[0x24]	"#NUM!",	// value range overflow
	[0x2a]	"#N/A!",	// argument of function not available
};
int
wanted(char *range, int here)
{
	int n, s;
	char *p;
	if (! range)
		return 1;
	s = -1;
	p = range;
	while(1){
		n = strtol(p, &p, 10);
		switch(*p){
		case 0:
			if(n == here)
				return 1;
			if(s != -1 && here > s && here < n)
				return 1;
			return 0;
		case ',':
			if(n == here)
				return 1;
			if(s != -1 && here > s && here < n)
				return 1;
			s = -1;
			p++;
			break;
		case '-':
			if(n == here)
				return 1;
			s = n;
			p++;
			break;
		default:
			sysfatal("%s malformed range spec", range);
			break;
		}
	}
}
	
void
cell(int r, int c, int f, int type, void *val)
{
	Row *row, *nrow;
	Col *col, *ncol;
	if(c > Ncols)
		Ncols = c;
	if((ncol = malloc(sizeof(Col))) == nil)
		sysfatal("no memory");
	ncol->c = c;
	ncol->f = f;
	ncol->type = type;
	ncol->next = nil;
	switch(type){
	case Tnumber:	ncol->number = *(double *)val;	break;
	case Tlabel:	ncol->label = (char *)val;	break;
	case Tindex:	ncol->index = *(int *)val;	break;
	case Tbool:	ncol->bool = *(int *)val;	break;
	case Terror:	ncol->error = *(int *)val;	break;
	default:	sysfatal("can't happen error");
	}
	if(Root == nil || Root->r > r){
		if((nrow = malloc(sizeof(Row))) == nil)
			sysfatal("no memory");
		nrow->col = ncol;
		ncol->next = nil;
		nrow->r = r;
		nrow->next = Root;
		Root = nrow;
		return;
	}
	for(row = Root; row; row = row->next){
		if(row->r == r){
			if(row->col->c > c){
				ncol->next = row->col;
				row->col = ncol;
				return;
			}
			else{
				for(col = row->col; col; col = col->next)
					if(col->next == nil || col->next->c > c){
						ncol->next = col->next;
						col->next = ncol;
						return;
					}
			}
		}
		if(row->next == nil || row->next->r > r){
			if((nrow = malloc(sizeof(Row))) == nil)
				sysfatal("no memory");
			nrow->col = ncol;
			nrow->r = r;
			nrow->next = row->next;
			row->next = nrow;
			return;
		}
	}
	sysfatal("cannot happen error");
}
struct Tm *
bifftime(double num)
{
	long long t = num;
	/* Beware - These epochs are wrong, this
	 * is due to Excel still remaining compatible
	 * with Lotus-123, which incorrectly believed 1900
	 * was a leap year
	 */
	if(Datemode)
		t -= 24107;		// epoch = 1/1/1904
	else
		t -= 25569;		// epoch = 31/12/1899
	t *= 60*60*24;
	return localtime((long)t);
}
void
numfmt(int fmt, int min, int max, double num)
{
	char buf[1024];
	struct Tm *tm;
	if(fmt == 9)
		snprint(buf, sizeof(buf),"%.0f%%", num);
	else
	if(fmt == 10)
		snprint(buf, sizeof(buf),"%f%%", num);
	else
	if(fmt == 11 || fmt == 48)
		snprint(buf, sizeof(buf),"%e", num);
	else
	if(fmt >= 14 && fmt <= 17){
		tm = bifftime(num);
		snprint(buf, sizeof(buf),"%d-%s-%d",
			tm->mday, Months[tm->mon], tm->year+1900);
	}
	else
	if((fmt >= 18 && fmt <= 21) || (fmt >= 45 && fmt <= 47)){
		tm = bifftime(num);
		snprint(buf, sizeof(buf),"%02d:%02d:%02d", tm->hour, tm->min, tm->sec);
	}
	else
	if(fmt == 22){
		tm = bifftime(num);
		snprint(buf, sizeof(buf),"%02d:%02d:%02d %d-%s-%d",
			tm->hour, tm->min, tm->sec,
			tm->mday, Months[tm->mon], tm->year+1900);
	}else
		snprint(buf, sizeof(buf),"%g", num);
	Bprint(bo, "%-*.*q", min, max, buf);
}
void
dump(void)
{
	Row *r;
	Col *c, *c1;
	char *strfmt;
	int i, n, last, min, max;
	if(Doquote)
		strfmt = "%-*.*q";
	else
		strfmt = "%-*.*s";
	for(r = Root; r; r = r->next){
		n = 1;
		for(c = r->col; c; c = c->next){
			n++;
			if(! wanted(Columnrange, n))
				continue;
			if(c->c < 0 || c->c >= Nwidths || (min = Width[c->c]) == 0)
				min = Defwidth;
			if((c->next && c->c == c->next->c) || Nopad)
				min = 0;
			max = -1;
			if(Trunc && min > 2)
				max = min -2;	// FIXME: -2 because of bug %q format ?
			switch(c->type){
			case Tnumber:
				if(Xf == nil || Xf[c->f] == 0)
					Bprint(bo, "%-*.*g", min, max, c->number);
				else
					numfmt(Xf[c->f], min, max, c->number);
				break;
			case Tlabel:
				Bprint(bo, strfmt, min, max, c->label);
				break;
			case Tbool:
				Bprint(bo, strfmt, min, max, (c->bool)? "True": "False");
				break;
			case Tindex:
				if(c->index < 0 || c->index >= Nstrtab)
					sysfatal("SST string out of range - corrupt file?");
				Bprint(bo, strfmt, min, max, Strtab[c->index]);
				break;
			case Terror:
				if(c->error < 0 || c->error >= nelem(Errmsgs) || !Errmsgs[c->error])
					Bprint(bo, "#ERR=%d", c->index);
				else
					Bprint(bo, strfmt, min, max, Errmsgs[c->error]);
				break;
			default:
				sysfatal("cannot happen error");
				break;
			}
			last = 1;
			for(i = n+1, c1 = c->next; c1; c1 = c1->next, i++)
				if(wanted(Columnrange, i)){
					last = 0;
					break;
				}
			if(! last){
				if(c->next->c == c->c)		// bar charts
					Bprint(bo, "=");
				else{
					Bprint(bo, "%s", Delim);
					for(i = c->c; c->next && i < c->next->c -1; i++)
						Bprint(bo, "%-*.*s%s", min, max, "", Delim);
				}
			}
		}
		if(r->next)
			for(i = r->r; i < r->next->r; i++)
				Bprint(bo, "\n");
	}
	Bprint(bo, "\n");
}
void
release(void)
{
	Row *r, *or;
	Col *c, *oc;
	r = Root;
	while(r){
		c = r->col;
		while(c){
			if(c->type == Tlabel)
				free(c->label);
			oc = c;
			c = c->next;
			free(oc);
		}
		or = r;
		r = r->next;
		free(or);
	}
	Root = nil;
	memset(Width, 0, sizeof(Width));
	Ncols = -1;
}
void
skip(Biff *b, int len)
{
	assert(len <= b->len);
	if(Bseek(b->bp, len, 1) == -1)
		sysfatal("seek failed - %r");
	b->len -= len;
}
void
gmem(Biff *b, void *p, int n)
{
	if(b->len < n)
		sysfatal("short record %d < %d", b->len, n);
	if(Bread(b->bp, p, n) != n)
		sysfatal("unexpected EOF - %r");
	b->len -= n;
}
void
xd(Biff *b)
{
	uvlong off;
	uchar buf[16];
	int addr, got, n, i, j;
	addr = 0;
	off = Boffset(b->bp);
	while(addr < b->len){
		n = (b->len >= sizeof(buf))? sizeof(buf): b->len;
		got = Bread(b->bp, buf, n);
		Bprint(bo, "	%6d  ", addr);
		addr += n;
		for(i = 0; i < got; i++)
			Bprint(bo, "%02x ", buf[i]);
		for(j = i; j < 16; j++)
			Bprint(bo, "   ");
		Bprint(bo, "  ");
		for(i = 0; i < got; i++)
			Bprint(bo, "%c", isprint(buf[i])? buf[i]: '.');
		Bprint(bo, "\n");
	}
	Bseek(b->bp, off, 0);
}
static int 
getrec(Biff *b)
{
	int c;
	if((c = Bgetc(b->bp)) == -1)
		return -1;		// real EOF
	b->op = c;
	if((c = Bgetc(b->bp)) == -1)
		sysfatal("unexpected EOF - %r");
	b->op |= c << 8;
	if((c = Bgetc(b->bp)) == -1)
		sysfatal("unexpected EOF - %r");
	b->len = c;
	if((c = Bgetc(b->bp)) == -1)
		sysfatal("unexpected EOF - %r");
	b->len |= c << 8;
	if(b->op == 0 && b->len == 0)
		return -1;
	if(Debug){
		Bprint(bo, "op=0x%x len=%d\n", b->op, b->len);
		xd(b);
	}
	return 0;
}
static uvlong
gint(Biff *b, int n)
{
	int i, c;
	uvlong vl, rc;
	if(b->len < n)
		return -1;
	rc = 0;
	for(i = 0; i < n; i++){
		if((c = Bgetc(b->bp)) == -1)
			sysfatal("unexpected EOF - %r");
		b->len--;
		vl = c;
		rc |= vl << (8*i);
	}
	return rc;
}
double
grk(Biff *b)
{
	int f;
	uvlong n;
	double d;
	n = gint(b, 4);
	f = n & 3;
	n &= ~3LL;
	if(f & 2){
		d = n / 4.0;
	}
	else{
		n <<= 32;
		memcpy(&d, &n, sizeof(d));
	}
	if(f & 1)
		d /= 100.0;
	return d;
}
double
gdoub(Biff *b)
{
	double d;
	uvlong n = gint(b, 8);
	memcpy(&d, &n, sizeof(n));
	return d;
}
char *
gstr(Biff *b, int len_width)
{
	Rune r;
	char *buf, *p;
	int nch, w, ap, ln, rt, opt;
	enum {
		Unicode = 1,
		Asian_phonetic = 4,
		Rich_text = 8,
	};
	if(b->len < len_width){
		if(getrec(b) == -1)
			sysfatal("starting STRING expected CONTINUE, got EOF");
		if(b->op != 0x03c)
			sysfatal("starting STRING expected CONTINUE, got op=0x%x", b->op);
	}
	ln = gint(b, len_width);
	if(Biffver != Ver8){
		if((buf = calloc(ln+1, sizeof(char))) == nil)
			sysfatal("no memory");
		gmem(b, buf, ln);
		return buf;
	}
	if((buf = calloc(ln+1, sizeof(char)*UTFmax)) == nil)
		sysfatal("no memory");
	p = buf;
	if(ln == 0)
		return buf;
	nch = 0;
	*buf = 0;
	opt = gint(b, 1);
	if(opt & Rich_text)
		rt = gint(b, 2);
	else
		rt = 0;
	if(opt & Asian_phonetic)
		ap = gint(b, 4);
	else
		ap = 0;
	for(;;){
		w = (opt & Unicode)? sizeof(Rune): sizeof(char);
		while(b->len > 0){
			r = gint(b, w);
			p += runetochar(p, &r);
			if(++nch >= ln){
				if(rt)
					skip(b, rt*4);
				if(ap)
					skip(b, ap);
				return buf;
			}
		}
		if(getrec(b) == -1)
			sysfatal("in STRING expected CONTINUE, got EOF");
		if(b->op != 0x03c)	
			sysfatal("in STRING expected CONTINUE, got op=0x%x", b->op);
		opt = gint(b, 1);
	}
}
void
sst(Biff *b)
{
	int n;
	
	skip(b, 4);			// total # strings
	Nstrtab = gint(b, 4);		// # unique strings
	if((Strtab = calloc(Nstrtab, sizeof(char *))) == nil)
		sysfatal("no memory");
	for(n = 0; n < Nstrtab; n++)
		Strtab[n] = gstr(b, 2);
}
void
boolerr(Biff *b)
{
	int r = gint(b, 2);		// row
	int c = gint(b, 2);		// col
	int f = gint(b, 2);		// formatting ref
	int v = gint(b, 1);		// bool value / err code
	int t = gint(b, 1);		// type
	cell(r, c, f, (t)? Terror: Tbool, &v);
}
void
rk(Biff *b)
{
	int r = gint(b, 2);		// row
	int c = gint(b, 2);		// col
	int f = gint(b, 2);		// formatting ref
	double v = grk(b);		// value
	cell(r, c, f, Tnumber, &v);
}
void
mulrk(Biff *b)
{
	int r = gint(b, 2);		// row
	int c = gint(b, 2);		// first col
	while(b->len >= 6){
		int f = gint(b, 2);	// formatting ref
		double v = grk(b);	// value
		cell(r, c++, f, Tnumber, &v);
	}
}
void
number(Biff *b)
{
	int r = gint(b, 2);		// row
	int c = gint(b, 2);		// col
	int f = gint(b, 2);		// formatting ref
	double v = gdoub(b);		// double 
	cell(r, c, f, Tnumber, &v);
}
void
label(Biff *b)
{
	int r = gint(b, 2);		// row
	int c = gint(b, 2);		// col
	int f = gint(b, 2);		// formatting ref
	char *s = gstr(b, 2);		// byte string
	cell(r, c, f, Tlabel, s);
}
void
labelsst(Biff *b)
{
	int r = gint(b, 2);		// row
	int c = gint(b, 2);		// col
	int f = gint(b, 2);		// formatting ref
	int i = gint(b, 2);		// sst string ref
	cell(r, c, f, Tindex, &i);
}
void
bof(Biff *b)
{
	Biffver = gint(b, 2);
	Content = gint(b, 2);	
}
void
defcolwidth(Biff *b)
{
	Defwidth = gint(b, 2);
}
void
datemode(Biff *b)
{
	Datemode = gint(b, 2);
}
void
eof(Biff *b)
{
	int i;
	struct {
		int n;
		char *s;
	} names[] = {
		0x005,	"Workbook globals",
		0x006,	"Visual Basic module",
		0x010,	"Worksheet",
		0x020,	"Chart",
		0x040,	"Macro sheet",
		0x100,	"Workspace file",
	};
	static int sheet = 0;
	if(! wanted(Sheetrange, ++sheet)){
		release();
		return;
	}
			
	if(Ncols != -1){
		if(All){
			for(i = 0; i < nelem(names); i++)
				if(names[i].n == Content){
					Bprint(bo, "\n# contents %s\n", names[i].s);
					dump();
				}
		}
		else 
		if(Content == 0x10)		// Worksheet
			dump();
	}
	release();
	USED(b);
}
void
colinfo(Biff *b)
{
	int c;
	int c1 = gint(b, 2);
	int c2 = gint(b, 2);
	int w  = gint(b, 2);
	if(c1 < 0)
		sysfatal("negative column number (%d)", c1);
	if(c2 >= Nwidths)
		sysfatal("too many columns (%d > %d)", c2, Nwidths);
	w /= 256;
	if(w > 100)
		w = 100;
	if(w < 0)
		w = 0;
	for(c = c1; c <= c2; c++)
		Width[c] = w;
}
void
xf(Biff *b)
{
	int fmt;
	static int nalloc = 0;
	skip(b, 2);
	fmt = gint(b, 2);
	if(nalloc >= Nxf){
		nalloc += 20;
		if((Xf = realloc(Xf, nalloc*sizeof(int))) == nil)
			sysfatal("no memory");
	}
	Xf[Nxf++] = fmt;
}
void
writeaccess(Biff *b)
{
	Bprint(bo, "# author %s\n", gstr(b, 2));
}
void
codepage(Biff *b)
{
	int codepage = gint(b, 2);
	if(codepage != 1200)				// 1200 == UTF-16
		Bprint(bo, "# codepage %d\n", codepage);
}
void
xls2csv(Biobuf *bp)
{
	int i;
	Biff biff, *b;
	struct {
		int op;
		void (*func)(Biff *);
	} dispatch[] = {
		0x000a,	eof,
		0x0022,	datemode,
		0x0042,	codepage,
		0x0055,	defcolwidth,
		0x005c,	writeaccess,
		0x007d,	colinfo,
		0x00bd,	mulrk,
		0x00fc,	sst,
		0x00fd,	labelsst,
		0x0203,	number,
		0x0204,	label,
		0x0205,	boolerr,
		0x027e,	rk,
		0x0809,	bof,
		0x00e0,	xf,
	};		
	
	b = &biff;
	b->bp = bp;
	while(getrec(b) != -1){
		for(i = 0; i < nelem(dispatch); i++)
			if(b->op == dispatch[i].op)
				(*dispatch[i].func)(b);
		skip(b, b->len);
	}
}
void
usage(void)
{
	fprint(2, "usage: %s [-Danqt] [-w worksheets] [-c columns] [-d delim] /mnt/doc/Workbook\n", argv0);
	exits("usage");
}
void
main(int argc, char *argv[])
{
	int i;
	Biobuf bin, bout, *bp;
	
	ARGBEGIN{
	case 'D':
		Debug = 1;
		break;
	case 'a':
		All = 1;
		break;
	case 'q':
		Doquote = 0;
		break;
	case 'd':
		Delim = EARGF(usage());
		break;
	case 'n':
		Nopad = 1;
		break;
	case 't':
		Trunc = 1;
		break;
	case 'c':
		Columnrange = EARGF(usage());
		break;
	case 'w':
		Sheetrange = EARGF(usage());
		break;
	default:
		usage();
		break;
	}ARGEND;
	if(argc != 1)
		usage();
	bo = &bout;
	quotefmtinstall();
	Binit(bo, OWRITE, 1);
	if(argc > 0) {
		for(i = 0; i < argc; i++){
			if((bp = Bopen(argv[i], OREAD)) == nil)
				sysfatal("%s cannot open - %r", argv[i]);
			xls2csv(bp);
			Bterm(bp);
		}
	} else {
		Binit(&bin, 0, OREAD);
		xls2csv(&bin);
	}
	exits(0);
}