ref: 3b2e0693b7e4db8f9fda2ff15fbfcba6f655371b
parent: d97f0e11402d5ac6ed4e24332c6ad6ea78b9a0e3
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Thu May 28 12:37:55 EDT 2015
uhtml: check if document is valid utf8 even with charset specified. often, documents specify charsets but are really utf-8 encoded. we now try to decode as utf-8 and, only if that fails, assume the charset specified in the document.
--- a/sys/src/cmd/uhtml.c
+++ b/sys/src/cmd/uhtml.c
@@ -49,7 +49,7 @@
main(int argc, char *argv[])
{int n, q, pfd[2], pflag = 0;
- char *arg[4], *s, *e, *p, *g, *a, t;
+ char *arg[4], *s, *g, *e, *p, *a, t;
Rune r;
ARGBEGIN {@@ -69,79 +69,79 @@
sysfatal("open: %r");}
nbuf = 0;
- p = buf;
- g = buf;
while(nbuf < sizeof(buf)-1){if((n = read(0, buf + nbuf, sizeof(buf)-1-nbuf)) <= 0)
break;
nbuf += n;
buf[nbuf] = 0;
- if(nbuf == n){- if(memcmp(p, "\xEF\xBB\xBF", 3)==0){- p += 3;
- nbuf -= 3;
- cset = "utf";
- goto Found;
+ }
+
+ p = buf;
+ if(nbuf >= 3 && memcmp(p, "\xEF\xBB\xBF", 3)==0){+ p += 3;
+ nbuf -= 3;
+ cset = "utf";
+ goto Found;
+ }
+ if(nbuf >= 2 && memcmp(p, "\xFE\xFF", 2) == 0){+ p += 2;
+ nbuf -= 2;
+ cset = "unicode-be";
+ goto Found;
+ }
+ if(nbuf >= 2 && memcmp(p, "\xFF\xFE", 2) == 0){+ p += 2;
+ nbuf -= 2;
+ cset = "unicode-le";
+ goto Found;
+ }
+
+ s = p;
+ do {+ if((s = strchr(s, '<')) == nil)
+ break;
+ q = 0;
+ g = ++s;
+ e = buf+nbuf;
+ while(s < e){+ if(*s == '=' && q == 0)
+ q = '=';
+ else if(*s == '\'' || *s == '"'){+ if(q == '=')
+ q = *s;
+ else if(q == *s)
+ q = 0;
}
- if(memcmp(p, "\xFE\xFF", 2) == 0){- p += 2;
- nbuf -= 2;
- cset = "unicode-be";
- goto Found;
- }
- if(memcmp(p, "\xFF\xFE", 2) == 0){- p += 2;
- nbuf -= 2;
- cset = "unicode-le";
- goto Found;
- }
- }
- s = g;
- do {- if((s = strchr(s, '<')) == nil)
+ else if(*s == '>' && q != '\'' && q != '"'){+ e = s;
break;
- q = 0;
- g = ++s;
- e = buf+nbuf;
- while(s < e){- if(*s == '=' && q == 0)
- q = '=';
- else if(*s == '\'' || *s == '"'){- if(q == '=')
- q = *s;
- else if(q == *s)
- q = 0;
- }
- else if(*s == '>' && q != '\'' && q != '"'){- e = s;
- break;
- }
- else if(q == '=' && strchr(whitespace, *s) == nil)
- q = 0;
- s++;
}
- t = *e;
- *e = 0;
- if((a = attr(g, "encoding")) || (a = attr(g, "charset"))){- *e = t;
- cset = a;
- goto Found;
- }
+ else if(q == '=' && strchr(whitespace, *s) == nil)
+ q = 0;
+ s++;
+ }
+ t = *e;
+ *e = 0;
+ if((a = attr(g, "encoding")) != nil || (a = attr(g, "charset")) != nil){+ cset = a;
*e = t;
- s = ++e;
- } while(t);
- }
- if(cset)
- goto Found;
+ break;
+ }
+ *e = t;
+ s = ++e;
+ } while(t);
+
s = p;
while(s+UTFmax < p+nbuf){s += chartorune(&r, s);
if(r == Runeerror){- cset = "latin1";
+ if(cset == nil)
+ cset = "latin1";
goto Found;
}
}
cset = "utf";
+
Found:
if(pflag){ print("%s\n", cset);--
⑨