git: 9front

ref: baf202e3cb0f6cb03e52c2af4fbc19406133b9e5
dir: /sys/src/cmd/html2ms.c/

View raw version
#include <u.h>
#include <libc.h>
#include <ctype.h>
#include <bio.h>

/* Forward typedefs so the structures below can refer to one another. */
typedef struct Tag Tag;
typedef struct Attr Attr;
typedef struct Text Text;

struct Attr {
	char	attr[64];
	char	val[256-64];
};

/*
 * A parsed tag. parsetext keeps one of these on the stack per
 * nested element and links them through up.
 */
struct Tag {
	Tag	*up;		/* enclosing tag, nil at top level */
	char	tag[32];	/* tag name, truncated to fit */
	Attr	attr[16];	/* attributes; attr[nattr] is scratch for parseattr */
	int	nattr;		/* number of valid entries in attr, at most nelem(attr)-1 */
	int	opening;	/* set by parsetag unless the tag starts with '/' */
	int	closing;	/* set when a '/' was seen or by a handler (see onbr) */

	/* called by parsetext when it is done with this tag, if not nil */
	void	(*close)(Text *, Tag *);
	/* state saved by the open handler for its close handler */
	union {
		void	*aux;
		int	restore;
	};
};

struct Text {
	char*	font;
	int	pre;
	int	pos;
	int	space;
	int	output;
	int	underline;
};

/* forward declarations of the parser routines defined further down */
void eatwhite(void);
Tag *parsetext(Text *, Tag *);
int parsetag(Tag *);
int parseattr(Attr *);

/* buffered input and output; main attaches them to fd 0 and fd 1 */
Biobuf in, out;

/*
 * Print a formatted string to out, making sure it starts on a
 * fresh line: if anything has been written on the current line
 * (text->pos > 0) a newline is written first and pos is reset.
 */
void
emit(Text *text, char *fmt, ...)
{
	va_list args;

	if(text->pos > 0){
		Bputc(&out, '\n');
		text->pos = 0;
	}

	va_start(args, fmt);
	Bvprint(&out, fmt, args);
	va_end(args);
}

/*
 * Close handler installed by ongarbage: put back the output
 * flag that was in effect before the tag was opened.
 */
void
restoreoutput(Text *text, Tag *tag)
{
	text->output = tag->restore;
}

/*
 * Open handler for elements whose text must not reach the output
 * (head, style, script): remember the current output flag in the
 * tag, turn output off and arrange for restoreoutput to undo it.
 */
void
ongarbage(Text *text, Tag *tag)
{
	tag->close = restoreoutput;
	tag->restore = text->output;
	text->output = 0;
}

/*
 * Emit a .LP request on its own line. Used as the open handler
 * for <p> and as a close handler by onh and onli.
 */
void
onp(Text *text, Tag *)
{
	emit(text, ".LP\n");
}

/*
 * Close handler installed by onpre: end the display with .DE
 * and restore the previous value of the pre flag.
 */
void
restorepre(Text *text, Tag *tag)
{
	int saved;

	saved = tag->restore;
	text->pre = saved;
	emit(text, ".DE\n");
}

/*
 * Open handler for <pre>: save the current pre flag in the tag,
 * set it, start a display with ".DS L" and arrange for
 * restorepre to run when the tag is closed.
 */
void
onpre(Text *text, Tag *tag)
{
	tag->close = restorepre;
	tag->restore = text->pre;
	text->pre = 1;
	emit(text, ".DS L\n");
}

/*
 * Open handler for <li>: emit ".IP" when the enclosing tag is
 * <ol>, otherwise ".IP \(bu" for a bulleted item.
 *
 * The enclosing tag is also given onp as its close handler so
 * that a .LP follows the list. This is done only when the parent
 * has no close handler yet: overwriting an existing one would
 * discard the restore handler of a parent such as <script>,
 * <head>, <pre>, <b>, <i> or <a href>, leaving output, pre, font
 * or underline state stuck for the rest of the document.
 */
void
onli(Text *text, Tag *tag)
{
	if(tag->up && cistrcmp(tag->up->tag, "ol") == 0)
		emit(text, ".IP\n");
	else
		emit(text, ".IP \\(bu\n");
	if(tag->up && tag->up->close == nil)
		tag->up->close = onp;
}

/*
 * Open handler for <h1>..<h6>: emit ".SH" followed by the second
 * character of the tag name, and have onp run when the heading
 * is closed.
 */
void
onh(Text *text, Tag *tag)
{
	char level;

	level = tag->tag[1];
	tag->close = onp;
	emit(text, ".SH %c\n", level);
}

/*
 * Open handler for <br> and <hr>. The tag is marked as closing so
 * parsetext does not read any content for it. A .br request is
 * emitted; for <hr> it is followed by a 5 inch rule and another .br.
 */
void
onbr(Text *text, Tag *tag)
{
	int rule;

	rule = cistrcmp(tag->tag, "hr") == 0;
	tag->closing = 1;
	emit(text, ".br\n");
	if(rule)
		emit(text, "\\l'5i'\n.br\n");
}

/*
 * Close handler installed by onfont: switch back to the font
 * saved in the tag and write the matching \f escape inline.
 */
void
restorefont(Text *text, Tag *tag)
{
	int n;

	text->font = tag->aux;
	n = Bprint(&out, "\\f%s", text->font);
	text->pos += n;
}

/*
 * Open handler for <b> and <i>: save the current font (defaulting
 * to "R" when none is set) in the tag, select "B" or "I", and
 * write the \f escape inline. restorefont undoes it on close.
 */
void
onfont(Text *text, Tag *tag)
{
	int n;

	if(text->font == nil)
		text->font = "R";
	tag->close = restorefont;
	tag->aux = text->font;

	if(cistrcmp(tag->tag, "b") == 0)
		text->font = "B";
	else if(cistrcmp(tag->tag, "i") == 0)
		text->font = "I";

	n = Bprint(&out, "\\f%s", text->font);
	text->pos += n;
}

/*
 * Close handler installed by ona: finish the current output line
 * (emit with an empty format only writes the pending newline)
 * and restore the saved underline flag.
 */
void
restoreunderline(Text *text, Tag *tag)
{
	int saved;

	saved = tag->restore;
	text->underline = saved;
	emit(text, "");
}

/*
 * Open handler for <a>. Only anchors carrying an href attribute
 * are treated as links: for those the underline flag is saved in
 * the tag and set, with restoreunderline undoing it on close.
 * Anchors without href are left alone.
 */
void
ona(Text *text, Tag *tag)
{
	Attr *a, *end;

	end = tag->attr + tag->nattr;
	for(a = tag->attr; a < end; a++){
		if(cistrcmp(a->attr, "href") != 0)
			continue;
		tag->restore = text->underline;
		tag->close = restoreunderline;
		text->underline = 1;
		return;
	}
}

/*
 * Table of open handlers. parsetext compares the name of every
 * opening tag against these entries (ignoring case) and calls the
 * handler of the first match; tags not listed get no handler.
 */
struct {
	char	*tag;
	void	(*open)(Text *, Tag *);
} ontag[] = {
	"a",		ona,
	"br",		onbr,
	"hr",		onbr,
	"b",		onfont,
	"i",		onfont,
	"p",		onp,
	"h1",		onh,
	"h2",		onh,
	"h3",		onh,
	"h4",		onh,
	"h5",		onh,
	"h6",		onh,
	"li",		onli,
	"pre",		onpre,
	"head",		ongarbage,
	"style",	ongarbage,
	"script",	ongarbage,
};

/*
 * Skip newlines, carriage returns, tabs and blanks on the input.
 * The first other character is pushed back; reading stops
 * silently when Bgetc returns zero or a negative value.
 */
void
eatwhite(void)
{
	int ch;

	for(;;){
		ch = Bgetc(&in);
		if(ch <= 0)
			return;
		switch(ch){
		case '\n':
		case '\r':
		case '\t':
		case ' ':
			continue;
		}
		Bungetc(&in);
		return;
	}
}

/*
 * Skip a <!...> or <?...> construct; parsetag has already consumed
 * the leading "<!" or "<?". Three forms are recognised:
 *
 *	comments, starting with '-': skipped up to the first '>'
 *	that is preceded by at least two dashes;
 *	"[CDATA[" sections: skipped up to "]]", plus a following
 *	'>' if there is one;
 *	anything else: skipped up to the first '>'.
 *
 * The comment scan counts consecutive dashes instead of probing
 * fixed three byte groups, so terminators such as "--->" are
 * found no matter how the dashes are aligned. The dash consumed
 * before the scan starts is counted too, which makes "<!-->" and
 * "<!--->" end the comment as the HTML tokenizer specifies.
 */
void
parsecomment(void)
{
	char buf[64];
	int n, c, dashes;

	n = 0;
	eatwhite();
	while((c = Bgetc(&in)) > 0){
		if(c == '>')
			return;
		if(n == 0 && c == '-'){
			dashes = 1;
			while((c = Bgetc(&in)) > 0){
				if(c == '-'){
					dashes++;
					continue;
				}
				if(c == '>' && dashes >= 2)
					return;
				dashes = 0;
			}
		}
		if(n+1 < sizeof(buf)){
			buf[n++] = c;
			/* only the first seven bytes matter: look for a CDATA section */
			if(n != 7 || cistrncmp(buf, "[CDATA[", 7))
				continue;
			while((c = Bgetc(&in)) > 0){
				if(c == ']'){
					if(Bgetc(&in) == ']'){
						if(Bgetc(&in) != '>')
							Bungetc(&in);
						return;
					}
				}
			}
		}
	}
}

/*
 * Parse one attribute from the input into a.
 *
 * The name runs up to whitespace or one of "</>=?!" (the latter
 * are pushed back for the caller). An optional "=value" follows;
 * the value may be enclosed in single or double quotes, otherwise
 * it runs up to whitespace, '<' or '>'. Name and value are
 * truncated to the size of their buffers and NUL-terminated.
 *
 * Unquoted values deliberately do not stop at '/', '?' or '!':
 * href=http://host/path?x=1 has to be kept whole, otherwise the
 * remainder would be taken by parsetag as part of the tag name
 * and the first '/' would mark the tag as self-closing.
 *
 * The quote test compares against the two quote characters
 * explicitly; strchr would also match the string terminator and
 * treat a NUL byte in the input as an opening quote.
 *
 * Returns 1 if an attribute was read, 0 if no name was found.
 */
int
parseattr(Attr *a)
{
	int q, c, n;

	n = 0;
	eatwhite();
	while((c = Bgetc(&in)) > 0){
		if(strchr("</>=?!", c)){
			Bungetc(&in);
			break;
		}
		if(strchr("\n\r\t ", c))
			break;
		if(n < sizeof(a->attr)-1)
			a->attr[n++] = c;
	}
	if(n == 0)
		return 0;
	a->attr[n] = 0;
	n = 0;
	eatwhite();
	if(Bgetc(&in) == '='){
		eatwhite();
		c = Bgetc(&in);
		if(c == '\'' || c == '"'){
			/* quoted: everything up to the matching quote */
			q = c;
			while((c = Bgetc(&in)) > 0){
				if(c == q)
					break;
				if(n < sizeof(a->val)-1)
					a->val[n++] = c;
			}
		} else {
			/* unquoted: the character just read belongs to the value */
			Bungetc(&in);
			while((c = Bgetc(&in)) > 0){
				if(strchr("\n\r\t <>", c)){
					Bungetc(&in);
					break;
				}
				if(n < sizeof(a->val)-1)
					a->val[n++] = c;
			}
		}
	} else
		Bungetc(&in);
	a->val[n] = 0;
	return 1;
}

/*
 * Parse a tag into t; the caller has already consumed the '<'.
 *
 * Whitespace inside the tag starts an attribute, which is read
 * with parseattr into the next free slot; at most nelem(t->attr)-1
 * attributes are kept. A '?' or '!' before any name character
 * hands the rest over to parsecomment. A '/' marks the tag as
 * closing; when it comes before the name the tag is not an
 * opening tag either. Everything else is added to the name,
 * which is truncated to fit.
 *
 * Returns non-zero if a tag with a non-empty name was read.
 */
int
parsetag(Tag *t)
{
	Attr *slot;
	int len, c;

	t->nattr = 0;
	t->opening = 1;
	t->closing = 0;

	len = 0;
	eatwhite();
	for(;;){
		c = Bgetc(&in);
		if(c <= 0 || c == '>')
			break;
		switch(c){
		case '\n':
		case '\r':
		case '\t':
		case ' ':
			slot = t->attr + t->nattr;
			if(parseattr(slot) && t->nattr < nelem(t->attr)-1)
				t->nattr++;
			continue;
		case '?':
		case '!':
			if(len == 0){
				parsecomment();
				return 0;
			}
			break;
		case '/':
			if(len == 0)
				t->opening = 0;
			t->closing = 1;
			continue;
		}
		if(len < sizeof(t->tag)-1)
			t->tag[len++] = c;
	}
	t->tag[len] = 0;
	return len > 0;
}

Rune
parserune(int c)
{
	char buf[10];
	int n;
	Rune r;

	n = 0;
	if(c == '&'){
		while((c = Bgetc(&in)) > 0){
			if(strchr(";&</>\n\r\t ", c)){
				if(c != ';')
					Bungetc(&in);
				if(n == 0)
					return '&';
				break;
			}
			if(n == sizeof(buf)-1)
				break;
			buf[n++] = c;
		}
		buf[n] = 0;
		if(strcmp(buf, "lt") == 0)
			return '<';
		if(strcmp(buf, "gt") == 0)
			return '>';
		if(strcmp(buf, "quot") == 0)
			return '"';
		if(strcmp(buf, "amp") == 0)
			return '&';
		/* use tcs -f html to handle the rest. */
	} else {
		do {
			buf[n++] = c;
			if(fullrune(buf, n)){
				chartorune(&r, buf);
				return r;
			}
			if(n >= UTFmax)
				break;
		} while((c = Bgetc(&in)) > 0);
	}
	return 0xFFFD;
}

/*
 * Replace typographic quotes by their ASCII counterparts:
 * U+2018 and U+2019 become ', U+201C and U+201D become ".
 * All other runes are returned unchanged.
 */
Rune
substrune(Rune r)
{
	if(r == 0x2018 || r == 0x2019)
		return '\'';
	if(r == 0x201c || r == 0x201d)
		return '"';
	return r;
}

/*
 * Debugging aid, compiled in but switched off. When enabled it
 * prints the chain of enclosing tags, outermost first, on file
 * descriptor 2, followed by dbg and a newline for the innermost.
 */
void
debugtag(Tag *tag, char *dbg)
{
	enum { Enabled = 0 };
	char *sep, *eol;

	if(!Enabled || tag == nil)
		return;

	debugtag(tag->up, nil);
	if(dbg != nil){
		sep = dbg;
		eol = "\n";
	} else {
		sep = " > ";
		eol = "";
	}
	fprint(2, "%s %s%s", tag->tag, sep, eol);
}


/*
 * Read and convert the content of tag (nil for the top level)
 * until the tag is closed or the input ends, then call the tag's
 * close handler if it has one.
 *
 * Each opening tag found is handed to its entry in ontag, if any,
 * and its content is parsed by a recursive call with a Tag that
 * lives on this frame's stack. A closing tag is matched against
 * tag and its ancestors, ignoring case; closing tags that match
 * nothing are ignored.
 *
 * Returns the tag that was actually closed: tag itself in the
 * normal case, or an ancestor when a closing tag for that
 * ancestor was seen. A caller that gets back something other
 * than the tag it passed in stops and returns the same value, so
 * every frame up to the matching one unwinds.
 *
 * Tags marked closing (self-closing, or set by a handler such as
 * onbr) have no content; only the close handler runs for them.
 */
Tag*
parsetext(Text *text, Tag *tag)
{
	Tag *rtag;
	Rune r;
	int c;

	rtag = tag;
	debugtag(tag, "open");
	if(tag == nil || tag->closing == 0){
		while((c = Bgetc(&in)) > 0){
			if(c == '<'){
				Tag t;

				memset(&t, 0, sizeof(t));
				if(parsetag(&t)){
					if(t.opening){
						t.up = tag;
						/* c is free here and doubles as the table index */
						for(c = 0; c < nelem(ontag); c++){
							if(cistrcmp(t.tag, ontag[c].tag) == 0){
								ontag[c].open(text, &t);
								break;
							}
						}
						rtag = parsetext(text, &t);
						if(rtag == &t)
							rtag = tag;	/* child closed itself, carry on */
						else
							break;		/* an ancestor was closed, unwind */
					} else if(t.closing){
						while(rtag && cistrcmp(rtag->tag, t.tag))
							rtag = rtag->up;
						if(rtag == nil)
							rtag = tag;	/* stray closing tag, ignore */
						else
							break;
					}
				}
				continue;
			}
			if(!text->output)
				continue;
			r = substrune(parserune(c));
			switch(r){
			case '\n':
			case '\r':
			case ' ':
			case '\t':
				/* outside pre, whitespace is only remembered */
				text->space = 1;
				if(text->pre == 0)
					continue;
				/* in pre, fall through and write the character as well */
			default:
				/*
				 * NOTE(review): in pre a newline leaves pos at 1 after
				 * the Bprint below, so the line-start escape further
				 * down is not applied to the next line, and a blank
				 * or tab is written in addition to the separator
				 * chosen here. Left unchanged; confirm whether that
				 * is intended.
				 */
				if(r == '\n' || r == '\r')
					text->pos = 0;
				if(text->space){
					text->space = 0;
					if(text->underline){
						/* each underlined word gets its own .UL request */
						emit(text, ".UL ");
						text->pos = 1;
					} else if(text->pos >= 70){
						/* break the output line once it is long enough */
						text->pos = 0;
						Bputc(&out, '\n');
					} else if(text->pos > 0){
						text->pos++;
						Bputc(&out, ' ');
					}
				}
				/*
				 * Both '.' and ''' introduce a troff request at the
				 * start of a line; hide them behind a zero-width \&.
				 * The apostrophe matters in practice because substrune
				 * maps typographic quotes to it.
				 */
				if(text->pos == 0 && (r == '.' || r == '\''))
					text->pos += Bprint(&out, "\\&");
				else if(r == '\\')
					text->pos += Bprint(&out, "\\&\\");
				else if(r == 0xA0){
					/* non-breaking space becomes an escaped blank */
					r = ' ';
					text->pos += Bprint(&out, "\\");
				}
				text->pos += Bprint(&out, "%C", r);
			}
		}
	}
	debugtag(tag, "close");
	if(tag && tag->close)
		tag->close(text, tag);
	return rtag;
}

/*
 * Convert HTML read from file descriptor 0 to troff -ms input on
 * file descriptor 1. Conversion starts in font R with output
 * enabled; a final newline ends the last output line.
 *
 * The program ends with exits(nil) rather than by returning from
 * main, so that the exit status is the empty (successful) one
 * instead of whatever the startup code uses when main returns.
 */
void
main(void)
{
	Text text;

	Binit(&in, 0, OREAD);
	Binit(&out, 1, OWRITE);

	memset(&text, 0, sizeof(text));

	text.font = "R";
	text.output = 1;

	parsetext(&text, nil);
	emit(&text, "\n");
	exits(nil);
}