git: 9front

Download patch

ref: a50320179c3f66c43e115021a9b288a183cc9c9b
parent: 575c5ff33b2cc32eaf0154fc2e463a511f657d73
author: cinap_lenrek <cinap_lenrek@gmx.de>
date: Wed Apr 24 16:13:18 EDT 2013

make all the commands agnostic about Rune width. (from sources)

--- a/sys/include/libc.h
+++ b/sys/include/libc.h
@@ -46,6 +46,7 @@
 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
 	Runeerror	= 0xFFFD,	/* decoding error in UTF */
 	Runemax		= 0xFFFF,	/* 16 bit rune */
+	Runemask	= 0xFFFF,	/* bits used by runes (see grep) */
 };
 
 /*
--- a/sys/src/cmd/acme/regx.c
+++ b/sys/src/cmd/acme/regx.c
@@ -20,7 +20,7 @@
 typedef struct Inst Inst;
 struct Inst
 {
-	uint	type;	/* < 0x10000 ==> literal, otherwise action */
+	uint	type;	/* <= Runemax ==> literal, otherwise action */
 	union {
 		int sid;
 		int subid;
@@ -61,26 +61,29 @@
  *	0x100xx are operators, value == precedence
  *	0x200xx are tokens, i.e. operands for operators
  */
-#define	OPERATOR	0x10000	/* Bitmask of all operators */
-#define	START		0x10000	/* Start, used for marker on stack */
-#define	RBRA		0x10001	/* Right bracket, ) */
-#define	LBRA		0x10002	/* Left bracket, ( */
-#define	OR		0x10003	/* Alternation, | */
-#define	CAT		0x10004	/* Concatentation, implicit operator */
-#define	STAR		0x10005	/* Closure, * */
-#define	PLUS		0x10006	/* a+ == aa* */
-#define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
-#define	ANY		0x20000	/* Any character but newline, . */
-#define	NOP		0x20001	/* No operation, internal use only */
-#define	BOL		0x20002	/* Beginning of line, ^ */
-#define	EOL		0x20003	/* End of line, $ */
-#define	CCLASS		0x20004	/* Character class, [] */
-#define	NCCLASS		0x20005	/* Negated character class, [^] */
-#define	END		0x20077	/* Terminate: match found */
+enum {
+	OPERATOR = Runemask+1,	/* Bitmask of all operators */
+	START	= OPERATOR,	/* Start, used for marker on stack */
+	RBRA,			/* Right bracket, ) */
+	LBRA,			/* Left bracket, ( */
+	OR,			/* Alternation, | */
+	CAT,			/* Concatenation, implicit operator */
+	STAR,			/* Closure, * */
+	PLUS,			/* a+ == aa* */
+	QUEST,			/* a? == a|nothing, i.e. 0 or 1 a's */
 
-#define	ISATOR		0x10000
-#define	ISAND		0x20000
+	ANY	= OPERATOR<<1,	/* Any character but newline, . */
+	NOP,			/* No operation, internal use only */
+	BOL,			/* Beginning of line, ^ */
+	EOL,			/* End of line, $ */
+	CCLASS,			/* Character class, [] */
+	NCCLASS,		/* Negated character class, [^] */
+	END,			/* Terminate: match found */
 
+	ISATOR	= OPERATOR,
+	ISAND	= OPERATOR<<1,
+};
+
 /*
  * Parser Information
  */
@@ -452,7 +455,7 @@
 			exprp++;
 			return '\n';
 		}
-		return *exprp++|0x10000;
+		return *exprp++|(Runemax+1);
 	}
 	return *exprp++;
 }
--- a/sys/src/cmd/ed.c
+++ b/sys/src/cmd/ed.c
@@ -15,7 +15,7 @@
 	ESIZE	= 256,		/* max size of reg exp */
 	GBSIZE	= 256,		/* max size of global command */
 	MAXSUB	= 9,		/* max number of sub reg exp */
-	ESCFLG	= 0xFFFF,	/* escape Rune - user defined code */
+	ESCFLG	= Runemax,	/* escape Rune - user defined code */
 	EOF	= -1,
 };
 
@@ -737,7 +737,7 @@
 		if(c == 0)
 			continue;
 		*p++ = c;
-		if(p >= &linebuf[LBSIZE-2])
+		if(p >= &linebuf[LBSIZE-sizeof(Rune)])
 			error(Q);
 	}
 }
@@ -1162,7 +1162,7 @@
 	for(a1=addr1; a1<=addr2; a1++) {
 		lp = getline(*a1);
 		while(*gp = *lp++)
-			if(gp++ >= &genbuf[LBSIZE-2])
+			if(gp++ >= &genbuf[LBSIZE-sizeof(Rune)])
 				error(Q);
 	}
 	lp = linebuf;
--- a/sys/src/cmd/file.c
+++ b/sys/src/cmd/file.c
@@ -273,60 +273,6 @@
 	close(fd);
 }
 
-/*
- * Unicode 4.0 4-byte runes.
- */
-typedef int Rune1;
-
-enum {
-	UTFmax1 = 4,
-};
-
-int
-fullrune1(char *p, int n)
-{
-	int c;
-
-	if(n >= 1) {
-		c = *(uchar*)p;
-		if(c < 0x80)
-			return 1;
-		if(n >= 2 && c < 0xE0)
-			return 1;
-		if(n >= 3 && c < 0xF0)
-			return 1;
-		if(n >= 4)
-			return 1;
-	}
-	return 0;
-}
-
-int
-chartorune1(Rune1 *rune, char *str)
-{
-	int c, c1, c2, c3, n;
-	Rune r;
-
-	c = *(uchar*)str;
-	if(c < 0xF0){
-		r = 0;
-		n = chartorune(&r, str);
-		*rune = r;
-		return n;
-	}
-	c &= ~0xF0;
-	c1 = *(uchar*)(str+1) & ~0x80;
-	c2 = *(uchar*)(str+2) & ~0x80;
-	c3 = *(uchar*)(str+3) & ~0x80;
-	n = (c<<18) | (c1<<12) | (c2<<6) | c3;
-	if(n < 0x10000 || n > 0x10FFFF){
-		*rune = Runeerror;
-		return 1;
-	}
-	*rune = n;
-	return 4;
-}
-
 void
 utfconv(void)
 {
@@ -392,7 +338,7 @@
 void
 filetype(int fd)
 {
-	Rune1 r;
+	Rune r;
 	int i, f, n;
 	char *p, *eob;
 
@@ -435,9 +381,9 @@
 		language[i].count = 0;
 	eob = (char *)buf+nbuf;
 	for(n = 0, p = (char *)buf; p < eob; n++) {
-		if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
+		if (!fullrune(p, eob-p) && eob-p < UTFmax)
 			break;
-		p += chartorune1(&r, p);
+		p += chartorune(&r, p);
 		if (r == 0)
 			f = Cnull;
 		else if (r <= 0x7f) {
--- a/sys/src/cmd/freq.c
+++ b/sys/src/cmd/freq.c
@@ -2,7 +2,7 @@
 #include <libc.h>
 #include <bio.h>
 
-uvlong	count[1<<16];
+uvlong	count[Runemax+1];
 Biobuf	bout;
 
 void	usage(void);
--- a/sys/src/cmd/grep/comp.c
+++ b/sys/src/cmd/grep/comp.c
@@ -275,7 +275,7 @@
 			x = re2or(x, rclass(ov, p[0]-1));
 			ov = p[1]+1;
 		}
-		x = re2or(x, rclass(ov, 0xffff));
+		x = re2or(x, rclass(ov, Runemask));
 	} else {
 		x = rclass(p[0], p[1]);
 		for(p+=2; *p; p+=2)
--- a/sys/src/cmd/grep/grep.h
+++ b/sys/src/cmd/grep/grep.h
@@ -53,7 +53,7 @@
 
 	Caselim		= 7,
 	Nhunk		= 1<<16,
-	Cbegin		= 0x10000,
+	Cbegin		= Runemax+1,
 	Flshcnt		= (1<<9)-1,
 
 	Cflag		= 1<<0,
--- a/sys/src/cmd/htmlroff/char.c
+++ b/sys/src/cmd/htmlroff/char.c
@@ -16,6 +16,12 @@
 	if(r == '\n')
 		return L("\n");
 
+	if(((uint)r&~0xFFFF) != 0){
+		/* runes above 0xFFFF don't fit in tcscache; it must grow a lot to handle them */
+		fprint(2, "%s: can't handle rune '%C'\n", argv0, r);
+		return L("?");
+	}
+
 	if(tcscache[r>>8] && tcscache[r>>8][r&0xFF])
 		return tcscache[r>>8][r&0xFF];
 
@@ -59,7 +65,7 @@
 typedef struct Trtab Trtab;
 struct Trtab
 {
-	char t[3];
+	char t[UTFmax];
 	Rune r;
 };
 
--- a/sys/src/cmd/rc/glob.c
+++ b/sys/src/cmd/rc/glob.c
@@ -118,18 +118,16 @@
 int
 equtf(uchar *p, uchar *q)
 {
+	Rune pr, qr;
+ 
 	if(*p!=*q)
-		return 0;
-	if(twobyte(*p)) return p[1]==q[1];
-	if(threebyte(*p)){
-		if(p[1]!=q[1])
-			return 0;
-		if(p[1]=='\0')
-			return 1;	/* broken code at end of string! */
-		return p[2]==q[2];
-	}
-	return 1;
+ 		return 0;
+
+	chartorune(&pr, (char*)p);
+	chartorune(&qr, (char*)q);
+	return pr == qr;
 }
+
 /*
  * Return a pointer to the next utf code in the string,
  * not jumping past nuls in broken utf codes!
@@ -138,10 +136,11 @@
 uchar*
 nextutf(uchar *p)
 {
-	if(twobyte(*p)) return p[1]=='\0'?p+1:p+2;
-	if(threebyte(*p)) return p[1]=='\0'?p+1:p[2]=='\0'?p+2:p+3;
-	return p+1;
+	Rune dummy;
+
+	return p + chartorune(&dummy, (char*)p);
 }
+
 /*
  * Convert the utf code at *p to a unicode value
  */
@@ -149,14 +148,12 @@
 int
 unicode(uchar *p)
 {
-	int u = *p;
+	Rune r;
 
-	if(twobyte(u))
-		return ((u&0x1f)<<6)|(p[1]&0x3f);
-	if(threebyte(u))
-		return (u<<12)|((p[1]&0x3f)<<6)|(p[2]&0x3f);
-	return u;
+	chartorune(&r, (char*)p);
+	return r;
 }
+
 /*
  * Does the string s match the pattern p
  * . and .. are only matched by patterns starting with .
--- a/sys/src/cmd/rc/lex.c
+++ b/sys/src/cmd/rc/lex.c
@@ -166,15 +166,25 @@
 char*
 addutf(char *p, int c)
 {
-	p = addtok(p, c);
-	if(twobyte(c))	 /* 2-byte escape */
-		return addtok(p, advance());
-	if(threebyte(c)){	/* 3-byte escape */
+	uchar b, m;
+	int i;
+
+	p = addtok(p, c);	/* 1-byte UTF runes are special */
+	if(onebyte(c))
+		return p;
+
+	m = 0xc0;
+	b = 0x80;
+	for(i=1; i < UTFmax; i++){
+		if((c&m) == b)
+			break;
 		p = addtok(p, advance());
-		return addtok(p, advance());
+		b = m;
+		m = (m >> 1)|0x80;
 	}
 	return p;
 }
+
 int lastdol;	/* was the last token read '$' or '$#' or '"'? */
 int lastword;	/* was the last token read a word or compound word terminator? */
 
--- a/sys/src/cmd/rc/rc.h
+++ b/sys/src/cmd/rc/rc.h
@@ -123,12 +123,10 @@
  */
 #define	GLOB	((char)0x01)
 /*
- * onebyte(c), twobyte(c), threebyte(c)
- * Is c the first character of a one- two- or three-byte utf sequence?
+ * onebyte(c)
+ * Is c the first character of a one-byte utf sequence?
  */
 #define	onebyte(c)	((c&0x80)==0x00)
-#define	twobyte(c)	((c&0xe0)==0xc0)
-#define	threebyte(c)	((c&0xf0)==0xe0)
 
 char **argp;
 char **args;
--- a/sys/src/cmd/sam/regexp.c
+++ b/sys/src/cmd/sam/regexp.c
@@ -9,7 +9,7 @@
 
 struct Inst
 {
-	long	type;	/* < 0x10000 ==> literal, otherwise action */
+	long	type;	/* <= Runemax ==> literal, otherwise action */
 	union {
 		int rsid;
 		int rsubid;
@@ -46,7 +46,7 @@
 
 #define	NLIST	127
 
-Ilist	*tl, *nl;	/* This list, next list */
+Ilist	*tl, *nl;		/* This list, next list */
 Ilist	list[2][NLIST+1];	/* +1 for trailing null */
 static	Rangeset sempty;
 
@@ -56,26 +56,29 @@
  *	0x100xx are operators, value == precedence
  *	0x200xx are tokens, i.e. operands for operators
  */
-#define	OPERATOR	0x10000	/* Bitmask of all operators */
-#define	START		0x10000	/* Start, used for marker on stack */
-#define	RBRA		0x10001	/* Right bracket, ) */
-#define	LBRA		0x10002	/* Left bracket, ( */
-#define	OR		0x10003	/* Alternation, | */
-#define	CAT		0x10004	/* Concatentation, implicit operator */
-#define	STAR		0x10005	/* Closure, * */
-#define	PLUS		0x10006	/* a+ == aa* */
-#define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
-#define	ANY		0x20000	/* Any character but newline, . */
-#define	NOP		0x20001	/* No operation, internal use only */
-#define	BOL		0x20002	/* Beginning of line, ^ */
-#define	EOL		0x20003	/* End of line, $ */
-#define	CCLASS		0x20004	/* Character class, [] */
-#define	NCCLASS		0x20005	/* Negated character class, [^] */
-#define	END		0x20077	/* Terminate: match found */
+enum {
+	OPERATOR = Runemask+1,	/* Bitmask of all operators */
+	START	= OPERATOR,	/* Start, used for marker on stack */
+	RBRA,			/* Right bracket, ) */
+	LBRA,			/* Left bracket, ( */
+	OR,			/* Alternation, | */
+	CAT,			/* Concatenation, implicit operator */
+	STAR,			/* Closure, * */
+	PLUS,			/* a+ == aa* */
+	QUEST,			/* a? == a|nothing, i.e. 0 or 1 a's */
 
-#define	ISATOR		0x10000
-#define	ISAND		0x20000
+	ANY	= OPERATOR<<1,	/* Any character but newline, . */
+	NOP,			/* No operation, internal use only */
+	BOL,			/* Beginning of line, ^ */
+	EOL,			/* End of line, $ */
+	CCLASS,			/* Character class, [] */
+	NCCLASS,		/* Negated character class, [^] */
+	END,			/* Terminate: match found */
 
+	ISATOR	= OPERATOR,
+	ISAND	= OPERATOR<<1,
+};
+
 /*
  * Parser Information
  */
@@ -459,7 +462,7 @@
 			exprp++;
 			return '\n';
 		}
-		return *exprp++|0x10000;
+		return *exprp++|(Runemax+1);
 	}
 	return *exprp++;
 }
--- a/sys/src/cmd/tr.c
+++ b/sys/src/cmd/tr.c
@@ -15,10 +15,8 @@
 #define	CLEARBIT(a,c)		((a)[(c)/8] &= ~bits[(c)&07])
 #define	BITSET(a,c)		((a)[(c)/8] & bits[(c)&07])
 
-#define	MAXRUNE	Runemax
-
-uchar	f[(MAXRUNE+1)/8];
-uchar	t[(MAXRUNE+1)/8];
+uchar	f[(Runemax+1)/8];
+uchar	t[(Runemax+1)/8];
 char 	wbuf[4096];
 char	*wptr;
 
--