ref: d7f569ae4700bea47f16f4e387d0d49c8f9a817d
dir: /sys/src/cmd/tcs/utf.c/
#include	<u.h>
#include	<libc.h>
#include	<bio.h>
#include	"hdr.h"
/*
	the our_* routines are implementations for the corresponding library
	routines. for a while, i tried to actually name them wctomb etc
	but stopped that after i found a system which made wchar_t an
	unsigned char.
*/
int our_wctomb(char *s, unsigned long wc);
int our_mbtowc(unsigned long *p, char *s, unsigned n);
int runetoisoutf(char *str, Rune *rune);
int fullisorune(char *str, int n);
int isochartorune(Rune *rune, char *str);
void
utf_in(int fd, long *, struct convert *out)
{
	char buf[N];
	int i, j, c, n, tot;
	unsigned long l;
	tot = 0;
	while((n = read(fd, buf+tot, N-tot)) >= 0){
		tot += n;
		for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
			c = our_mbtowc(&l, buf+i, tot-i);
			if(c == -1){
				if(squawk)
					warn("bad UTF sequence near byte %ld in input", ninput+i);
				if(clean){
					i++;
					continue;
				}
				nerrors++;
				l = Runeerror;
				c = 1;
			}
			runes[j++] = l;
			i += c;
		}
		OUT(out, runes, j);
		tot -= i;
		ninput += i;
		if(tot)
			memmove(buf, buf+i, tot);
		if(n == 0)
			break;
	}
	OUT(out, runes, 0);
}
void
utf_out(Rune *base, int n, long *)
{
	char *p;
	Rune *r;
	nrunes += n;
	for(r = base, p = obuf; n-- > 0; r++){
		p += our_wctomb(p, *r);
	}
	noutput += p-obuf;
	if(p > obuf)
		write(1, obuf, p-obuf);
}
void
isoutf_in(int fd, long *, struct convert *out)
{
	char buf[N];
	int i, j, c, n, tot;
	tot = 0;
	while((n = read(fd, buf+tot, N-tot)) >= 0){
		tot += n;
		for(i=j=0; i<tot; ){
			if(!fullisorune(buf+i, tot-i))
				break;
			c = isochartorune(&runes[j], buf+i);
			if(runes[j] == Runeerror){
				if(squawk)
					warn("bad UTF sequence near byte %ld in input", ninput+i);
				if(clean){
					i++;
					continue;
				}
				nerrors++;
			}
			j++;
			i += c;
		}
		OUT(out, runes, j);
		tot -= i;
		ninput += i;
		if(tot)
			memmove(buf, buf+i, tot);
		if(n == 0)
			break;
	}
	OUT(out, runes, 0);
}
void
isoutf_out(Rune *base, int n, long *)
{
	char *p;
	Rune *r;
	nrunes += n;
	for(r = base, p = obuf; n-- > 0; r++)
		p += runetoisoutf(p, r);
	noutput += p-obuf;
	if(p > obuf)
		write(1, obuf, p-obuf);
}
enum
{
	Char1	= Runeself,	Rune1	= Runeself,
	Char21	= 0xA1,		Rune21	= 0x0100,
	Char22	= 0xF6,		Rune22	= 0x4016,
	Char3	= 0xFC,		Rune3	= 0x10000,	/* really 0x38E2E */
	Esc	= 0xBE,		Bad	= Runeerror
};
static	uchar	U[256];
static	uchar	T[256];
static
void
mktable(void)
{
	int i, u;
	for(i=0; i<256; i++) {
		u = i + (0x5E - 0xA0);
		if(i < 0xA0)
			u = i + (0xDF - 0x7F);
		if(i < 0x7F)
			u = i + (0x00 - 0x21);
		if(i < 0x21)
			u = i + (0xBE - 0x00);
		U[i] = u;
		T[u] = i;
	}
}
int
isochartorune(Rune *rune, char *str)
{
	int c, c1, c2;
	long l;
	if(U[0] == 0)
		mktable();
	/*
	 * one character sequence
	 *	00000-0009F => 00-9F
	 */
	c = *(uchar*)str;
	if(c < Char1) {
		*rune = c;
		return 1;
	}
	/*
	 * two character sequence
	 *	000A0-000FF => A0; A0-FF
	 */
	c1 = *(uchar*)(str+1);
	if(c < Char21) {
		if(c1 >= Rune1 && c1 < Rune21) {
			*rune = c1;
			return 2;
		}
		goto bad;
	}
	/*
	 * two character sequence
	 *	00100-04015 => A1-F5; 21-7E/A0-FF
	 */
	c1 = U[c1];
	if(c1 >= Esc)
		goto bad;
	if(c < Char22) {
		*rune =  (c-Char21)*Esc + c1 + Rune21;
		return 2;
	}
	/*
	 * three character sequence
	 *	04016-38E2D => A6-FB; 21-7E/A0-FF
	 */
	c2 = U[*(uchar*)(str+2)];
	if(c2 >= Esc)
		goto bad;
	if(c < Char3) {
		l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
		if(l >= Rune3)
			goto bad;
		*rune = l;
		return 3;
	}
	/*
	 * bad decoding
	 */
bad:
	*rune = Bad;
	return 1;
}
int
runetoisoutf(char *str, Rune *rune)
{
	long c;
	if(T[0] == 0)
		mktable();
	/*
	 * one character sequence
	 *	00000-0009F => 00-9F
	 */
	c = *rune;
	if(c < Rune1) {
		str[0] = c;
		return 1;
	}
	/*
	 * two character sequence
	 *	000A0-000FF => A0; A0-FF
	 */
	if(c < Rune21) {
		str[0] = Char1;
		str[1] = c;
		return 2;
	}
	/*
	 * two character sequence
	 *	00100-04015 => A1-F5; 21-7E/A0-FF
	 */
	if(c < Rune22) {
		c -= Rune21;
		str[0] = c/Esc + Char21;
		str[1] = T[c%Esc];
		return 2;
	}
	/*
	 * three character sequence
	 *	04016-38E2D => A6-FB; 21-7E/A0-FF
	 */
	c -= Rune22;
	str[0] = c/(Esc*Esc) + Char22;
	str[1] = T[c/Esc%Esc];
	str[2] = T[c%Esc];
	return 3;
}
int
fullisorune(char *str, int n)
{
	int c;
	if(n > 0) {
		c = *(uchar*)str;
		if(c < Char1)
			return 1;
		if(n > 1)
			if(c < Char22 || n > 2)
				return 1;
	}
	return 0;
}
enum
{
	T1	= 0x00,
	Tx	= 0x80,
	T2	= 0xC0,
	T3	= 0xE0,
	T4	= 0xF0,
	T5	= 0xF8,
	T6	= 0xFC,
	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,
	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,
	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1,
};
int
our_wctomb(char *s, unsigned long wc)
{
	if(s == 0)
		return 0;		/* no shift states */
	if(wc & ~Wchar2) {
		if(wc & ~Wchar4) {
			if(wc & ~Wchar5) {
				/* 6 bytes */
				s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
				s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
				s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
				s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
				s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
				s[5] = Tx |  (wc & Maskx);
				return 6;
			}
			/* 5 bytes */
			s[0] = T5 |  (wc >> 4*Bitx);
			s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
			s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
			s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
			s[4] = Tx |  (wc & Maskx);
			return 5;
		}
		if(wc & ~Wchar3) {
			/* 4 bytes */
			s[0] = T4 |  (wc >> 3*Bitx);
			s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
			s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
			s[3] = Tx |  (wc & Maskx);
			return 4;
		}
		/* 3 bytes */
		s[0] = T3 |  (wc >> 2*Bitx);
		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[2] = Tx |  (wc & Maskx);
		return 3;
	}
	if(wc & ~Wchar1) {
		/* 2 bytes */
		s[0] = T2 | (wc >> 1*Bitx);
		s[1] = Tx | (wc & Maskx);
		return 2;
	}
	/* 1 byte */
	s[0] = T1 | wc;
	return 1;
}
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
	uchar *us;
	int c0, c1, c2, c3, c4, c5;
	unsigned long wc;
	if(s == 0)
		return 0;		/* no shift states */
	if(n < 1)
		goto bad;
	us = (uchar*)s;
	c0 = us[0];
	if(c0 >= T3) {
		if(n < 3)
			goto bad;
		c1 = us[1] ^ Tx;
		c2 = us[2] ^ Tx;
		if((c1|c2) & T2)
			goto bad;
		if(c0 >= T5) {
			if(n < 5)
				goto bad;
			c3 = us[3] ^ Tx;
			c4 = us[4] ^ Tx;
			if((c3|c4) & T2)
				goto bad;
			if(c0 >= T6) {
				/* 6 bytes */
				if(n < 6)
					goto bad;
				c5 = us[5] ^ Tx;
				if(c5 & T2)
					goto bad;
				wc = ((((((((((c0 & Mask6) << Bitx) |
					c1) << Bitx) | c2) << Bitx) |
					c3) << Bitx) | c4) << Bitx) | c5;
				if(wc <= Wchar5)
					goto bad;
				*p = wc;
				return 6;
			}
			/* 5 bytes */
			wc = ((((((((c0 & Mask5) << Bitx) |
				c1) << Bitx) | c2) << Bitx) |
				c3) << Bitx) | c4;
			if(wc <= Wchar4)
				goto bad;
			*p = wc;
			return 5;
		}
		if(c0 >= T4) {
			/* 4 bytes */
			if(n < 4)
				goto bad;
			c3 = us[3] ^ Tx;
			if(c3 & T2)
				goto bad;
			wc = ((((((c0 & Mask4) << Bitx) |
				c1) << Bitx) | c2) << Bitx) |
				c3;
			if(wc <= Wchar3)
				goto bad;
			*p = wc;
			return 4;
		}
		/* 3 bytes */
		wc = ((((c0 & Mask3) << Bitx) |
			c1) << Bitx) | c2;
		if(wc <= Wchar2)
			goto bad;
		*p = wc;
		return 3;
	}
	if(c0 >= T2) {
		/* 2 bytes */
		if(n < 2)
			goto bad;
		c1 = us[1] ^ Tx;
		if(c1 & T2)
			goto bad;
		wc = ((c0 & Mask2) << Bitx) |
			c1;
		if(wc <= Wchar1)
			goto bad;
		*p = wc;
		return 2;
	}
	/* 1 byte */
	if(c0 >= Tx)
		goto bad;
	*p = c0;
	return 1;
bad:
	return -1;
}