ref: 8c6018592dc887e32e1070705282539d4a19309c
parent: 1638860787748a0597afcd08e3068638c6b4cdb6
author: cinap_lenrek <cinap_lenrek@centraldogma>
date: Sat Sep 24 13:06:45 EDT 2011
html2ms, tcs, mothra, uhtml: treat ' as special entity, add uhtml(1)
--- /dev/null
+++ b/sys/man/1/uhtml
@@ -1,0 +1,46 @@
+.TH UHTML 1
+.SH NAME
+uhtml \- convert foreign character set HTML file to unicode
+.SH SYNOPSIS
+.B uhtml
+[
+.B -p
+] [
+.B -c
+.I charset
+] [
+.I file
+]
+.SH DESCRIPTION
+HTML comes in various character set encodings
+and has special forms to encode characters. To
+make it easier to process html, uhtml is used
+to normalize it to a unicode only form.
+.LP
+Uhtml detects the character set of the html input
+.I file
+and calls
+.IR tcs (1)
+to convert it to utf replacing html-entity forms
+by their unicode character representations except for
+.B lt
+.B gt
+.B amp
+.B quot
+and
+.BR apos .
+The converted html is written to
+standard output. If no
+.I file
+was given, it is read from standard input. If the
+.B -p
+option is given, the detected character set is printed and
+the program exits without conversion.
+In case character set detection fails, the default (utf)
+is assumed. This default can be changed with the
+.B -c
+option.
+.SH SOURCE
+.B /sys/src/cmd/uhtml.c
+.SH SEE ALSO
+.IR tcs (1)
--- a/sys/src/cmd/html2ms.c
+++ b/sys/src/cmd/html2ms.c
@@ -680,6 +680,8 @@
return '>';
if(strcmp(buf, "quot") == 0)
return '"';
+ if(strcmp(buf, "apos") == 0)
+ return '\'';
if(strcmp(buf, "amp") == 0)
return '&';
/* use tcs -f html to handle the rest. */
--- a/sys/src/cmd/mothra/rdhtml.c
+++ b/sys/src/cmd/mothra/rdhtml.c
@@ -272,6 +272,8 @@
*t++='>';
else if(strcmp(u, "quot") == 0)
*t++='"';
+ else if(strcmp(u, "apos") == 0)
+ *t++='\'';
else if(strcmp(u, "amp") == 0)
*t++='&';
else {--- a/sys/src/cmd/tcs/html.c
+++ b/sys/src/cmd/tcs/html.c
@@ -11,8 +11,6 @@
Rune r;
};
-/* <, >, ", & intentionally omitted */
-
/*
* Names beginning with _ are names we recognize
* (without the underscore) but will not generate,
@@ -86,7 +84,7 @@
{"agrave", 224}, {"alefsym", 8501}, {"alpha", 945},- /* {"amp", 38}, */+ {"amp", 38}, {"and", 8743}, {"ang", 8736}, {"aring", 229},@@ -141,7 +139,7 @@
{"frasl", 8260}, {"gamma", 947}, {"ge", 8805},- /* {"gt", 62}, */+ {"gt", 62}, {"hArr", 8660}, {"harr", 8596}, {"hearts", 9829},@@ -173,7 +171,7 @@
{"lrm", 8206}, {"lsaquo", 8249}, {"lsquo", 8216},- /* {"lt", 60}, */+ {"lt", 60}, {"macr", 175}, {"mdash", 8212}, {"micro", 181},@@ -219,7 +217,7 @@
{"prop", 8733}, {"psi", 968}, {"quad", 8193},- /* {"quot", 34}, */+ {"quot", 34}, {"rArr", 8658}, {"radic", 8730}, {"rang", 9002},@@ -416,10 +414,8 @@
}
buf[i] = 0;
if(i > 1){- if((c = findbyname(buf+1)) != Runeerror){- *r++ = c;
- continue;
- }
+ if((c = findbyname(buf+1)) != Runeerror)
+ goto out;
if(i > 2 && buf[1] == '#'){ if(i > 3 && strchr("xX", buf[2]))c = strtol(buf+3, &p, 16);
@@ -427,8 +423,7 @@
c = strtol(buf+2, &p, 10);
if(*p || c >= NRUNE || c < 0)
goto bad;
- *r++ = c;
- continue;
+ goto out;
}
}
bad:
@@ -442,6 +437,12 @@
}
}
continue;
+ out:
+ if(strchr("<>&\"'", c)){+ s = ';';
+ i = sprint(buf, "&%s", findbyrune(c));
+ goto bad;
+ }
}
*r++ = c;
}
--- a/sys/src/cmd/uhtml.c
+++ b/sys/src/cmd/uhtml.c
@@ -41,7 +41,7 @@
main(int argc, char *argv[])
{int pfd[2], pflag = 0;
- char *arg[4], *s;
+ char *arg[4], *s, *p;
ARGBEGIN {case 'h':
@@ -59,21 +59,32 @@
if(open(*argv, OREAD) != 1)
sysfatal("open: %r");}
- if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)
+ if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
sysfatal("read: %r");buf[nbuf] = 0;
-
- /* useless BOM marker */
- if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)
- memmove(buf, buf+3, nbuf-3);
-
- for(;;){- if(s = cistrstr(buf, "encoding="))
+ p = buf;
+ while(nbuf > 0){+ if(memcmp(p, "\xEF\xBB\xBF", 3)==0){+ p += 3;
+ cset = "utf";
+ break;
+ }
+ if(memcmp(p, "\xFE\xFF", 2) == 0){+ p += 2;
+ cset = "unicode-be";
+ break;
+ }
+ if(memcmp(p, "\xFF\xFE", 2) == 0){+ p += 2;
+ cset = "unicode-le";
+ break;
+ }
+ if(s = cistrstr(p, "encoding="))
if(s = strval(s+9)){cset = s;
break;
}
- if(s = cistrstr(buf, "charset="))
+ if(s = cistrstr(p, "charset="))
if(s = strval(s+8)){cset = s;
break;
@@ -80,6 +91,7 @@
}
break;
}
+ nbuf -= p - buf;
if(pflag){ print("%s\n", cset);@@ -86,15 +98,15 @@
exits(0);
}
- if(pipe(pfd) < 0)
- sysfatal("pipe: %r");-
if(nbuf == 0){- write(1, buf, 0);
+ write(1, p, 0);
exits(0);
}
- switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){+ if(pipe(pfd) < 0)
+ sysfatal("pipe: %r");+
+ switch(rfork(RFFDG|RFREND|RFPROC)){case -1:
sysfatal("fork: %r");case 0:
@@ -114,10 +126,13 @@
close(pfd[1]);
while(nbuf > 0){- if(write(1, buf, nbuf) != nbuf)
+ if(write(1, p, nbuf) != nbuf)
sysfatal("write: %r");- if((nbuf = read(0, buf, sizeof(buf))) < 0)
+ p = buf;
+ if((nbuf = read(0, p, sizeof(buf))) < 0)
sysfatal("read: %r");}
+ close(1);
+ waitpid();
exits(0);
}
--
⑨