git: drawterm

--- a/libmemdraw/draw.c

+++ b/libmemdraw/draw.c

@@ -10,23 +10,35 @@

 #define RGB2K(r,g,b)	((156763*(r)+307758*(g)+59769*(b))>>19)

/*

- * for 0 ≤ x ≤ 255*255, (x*0x0101+0x100)>>16 is a perfect approximation.

- * for 0 ≤ x < (1<<16), x/255 = ((x+1)*0x0101)>>16 is a perfect approximation.

- * the last one is perfect for all up to 1<<16, avoids a multiply, but requires a rathole.

+ * For 16-bit values, x / 255 == (t = x+1, (t+(t>>8)) >> 8).

+ * We add another 127 to round to the nearest value rather

+ * than truncate.

+ *

+ * CALCxy does x bytewise calculations on y input images (x=1,4; y=1,2).

+ * CALC2y does two parallel 16-bit calculations on y input images (y=1,2).

*/

-/* #define DIV255(x) (((x)*257+256)>>16)  */

-#define DIV255(x) ((((x)+1)*257)>>16)

-/* #define DIV255(x) (tmp=(x)+1, (tmp+(tmp>>8))>>8) */

+#define CALC11(a, v, tmp) \

+	(tmp=(a)*(v)+128, (tmp+(tmp>>8))>>8)	/* a*v/255 rounded to nearest: +128 is the +1 of the x/255 identity plus 127 for rounding; tmp is caller-supplied scratch */

-#define MUL(x, y, t)	(t = (x)*(y)+128, (t+(t>>8))>>8)

-#define MASK13	0xFF00FF00

-#define MASK02	0x00FF00FF

-#define MUL13(a, x, t)		(t = (a)*(((x)&MASK13)>>8)+128, ((t+((t>>8)&MASK02))>>8)&MASK02)

-#define MUL02(a, x, t)		(t = (a)*(((x)&MASK02)>>0)+128, ((t+((t>>8)&MASK02))>>8)&MASK02)

-#define MUL0123(a, x, s, t)	((MUL13(a, x, s)<<8)|MUL02(a, x, t))

+#define CALC12(a1, v1, a2, v2, tmp) \

+	(tmp=(a1)*(v1)+(a2)*(v2)+128, (tmp+(tmp>>8))>>8)	/* (a1*v1 + a2*v2)/255 rounded to nearest, summed before a single divide; tmp is caller-supplied scratch */

-#define MUL2(u, v, x, y)	(t = (u)*(v)+(x)*(y)+256, (t+(t>>8))>>8)

+/*

+ * Packed variants.  MASK selects the low byte of each 16-bit half

+ * of a 32-bit word, so one multiply scales two 8-bit channels at once.

+ *

+ * CALC21(a, vvuu, tmp): vvuu holds two bytes in the MASK positions;

+ *	yields both a*byte/255 results (rounded) in the same positions.

+ * CALC22(a1, vvuu1, a2, vvuu2, tmp): same, for a1*byte1 + a2*byte2.

+ * CALC41 / CALC42: apply CALC21 / CALC22 to the even bytes and to

+ *	the odd bytes (shifted down by 8) of full rgba words and

+ *	recombine them into one 32-bit result.

+ *

+ * tmp, tmp1 and tmp2 are caller-supplied scratch lvalues; tmp1 and

+ * tmp2 must be distinct objects.  The rgba arguments are expanded

+ * twice, so they must be free of side effects; they are parenthesized

+ * here so that expressions such as x|y can be passed safely.

+ */

+#define MASK 0xFF00FF

+#define CALC21(a, vvuu, tmp) \

+	(tmp=(a)*(vvuu)+0x00800080, ((tmp+((tmp>>8)&MASK))>>8)&MASK)

+#define CALC41(a, rgba, tmp1, tmp2) \

+	(CALC21(a, (rgba) & MASK, tmp1) | \

+	 (CALC21(a, ((rgba)>>8)&MASK, tmp2)<<8))

+#define CALC22(a1, vvuu1, a2, vvuu2, tmp) \

+	(tmp=(a1)*(vvuu1)+(a2)*(vvuu2)+0x00800080, ((tmp+((tmp>>8)&MASK))>>8)&MASK)

+#define CALC42(a1, rgba1, a2, rgba2, tmp1, tmp2) \

+	(CALC22(a1, (rgba1) & MASK, a2, (rgba2) & MASK, tmp1) | \

+	 (CALC22(a1, ((rgba1)>>8) & MASK, a2, ((rgba2)>>8) & MASK, tmp2)<<8))

 static void mktables(void);

 typedef int Subdraw(Memdrawparam*);

 static Subdraw chardraw, alphadraw, memoptdraw;

@@ -307,6 +319,9 @@

  * Conversion tables.

*/

 static uchar replbit[1+8][256];		/* replbit[x][y] is the replication of the x-bit quantity y to 8-bit depth */

+static uchar conv18[256][8];		/* conv18[x][y] is pixel y (0 = most significant bit) of the eight 1-bit pixels packed in byte x, replicated to 8 bits */

+static uchar conv28[256][4];		/* conv28[x][y] is likewise pixel y of the four 2-bit pixels packed in byte x */

+static uchar conv48[256][2];		/* conv48[x][y] is likewise pixel y of the two 4-bit pixels packed in byte x */

/*

  * bitmap of how to replicate n bits to fill 8, for 1 ≤ n ≤ 8.

@@ -340,7 +355,7 @@

 static void

 mktables(void)

-	int i, j, small;

+	int i, j, mask, sh, small;

 	if(tablesbuilt)

 		return;

@@ -357,6 +372,17 @@

+	/* bit unpacking up to 8 bits, only powers of 2 */

+	for(i=0; i<256; i++){

+		for(j=0, sh=7, mask=1; j<8; j++, sh--)

+			conv18[i][j] = replbit[1][(i>>sh)&mask];

+		for(j=0, sh=6, mask=3; j<4; j++, sh-=2)

+			conv28[i][j] = replbit[2][(i>>sh)&mask];

+		for(j=0, sh=4, mask=15; j<2; j++, sh-=4)

+			conv48[i][j] = replbit[4][(i>>sh)&mask];

+	}

 static uchar ones = 0xff;

@@ -770,6 +796,50 @@

 	return bdst;

+/*

+ * Do the channels in the buffers match enough

+ * that we can do word-at-a-time operations

+ * on the pixels?

+ *

+ * Returns 1 when red, green and blue sit at the same byte

+ * offsets from rgba in both buffers and the alpha channels

+ * are compatible (see the cases below); returns 0 otherwise.

+ * Neither buffer is modified.  Callers combine this with a

+ * check that both buffers have delta == 4 before taking the

+ * word-wide rgba path.

+ */

+static int

+chanmatch(Buffer *bdst, Buffer *bsrc)

+{

+	uchar *drgb, *srgb;

+	/*

+	 * first, r, g, b must be in the same place

+	 * in the rgba word.  compare each channel

+	 * pointer's byte offset from the start of

+	 * its buffer's rgba word.

+	 */

+	drgb = (uchar*)bdst->rgba;

+	srgb = (uchar*)bsrc->rgba;

+	if(bdst->red - drgb != bsrc->red - srgb

+	|| bdst->blu - drgb != bsrc->blu - srgb

+	|| bdst->grn - drgb != bsrc->grn - srgb)

+		return 0;

+	/*

+	 * that implies alpha is in the same place,

+	 * if it is there at all (it might be == &ones).

+	 * if the destination is &ones, we can scribble

+	 * over the rgba slot just fine.

+	 * (&ones is the shared constant 0xff byte that

+	 * stands in for a missing alpha channel.)

+	 */

+	if(bdst->alpha == &ones)

+		return 1;

+	/*

+	 * if the destination is not ones but the src is,

+	 * then the simultaneous calculation will use

+	 * bogus bytes from the src's rgba.  no good.

+	 */

+	if(bsrc->alpha == &ones)

+		return 0;

+	/*

+	 * otherwise, alphas are in the same place.

+	 * both buffers carry a real alpha byte here.

+	 */

+	return 1;

+}

 static Buffer

 alphacalc14(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)

@@ -776,26 +846,26 @@

 	Buffer obdst;

 	int fd, sadelta;

 	int i, sa, ma, q;

-	ulong s, t;

+	ulong t, t1;

 	obdst = bdst;

 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;

-	q = bsrc.delta == 4 && bdst.delta == 4;

+	q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);

 	for(i=0; i<dx; i++){

 		sa = *bsrc.alpha;

 		ma = *bmask.alpha;

-		fd = MUL(sa, ma, t);

+		fd = CALC11(sa, ma, t);

 		if(op == DoutS)

 			fd = 255-fd;

 		if(grey){

-			*bdst.grey = MUL(fd, *bdst.grey, t);

+			*bdst.grey = CALC11(fd, *bdst.grey, t);

 			bsrc.grey += bsrc.delta;

 			bdst.grey += bdst.delta;

 		}else{

 			if(q){

-				*bdst.rgba = MUL0123(fd, *bdst.rgba, s, t);

+				*bdst.rgba = CALC41(fd, *bdst.rgba, t, t1);

 				bsrc.rgba++;

 				bdst.rgba++;

 				bsrc.alpha += sadelta;

@@ -802,9 +872,9 @@

 				bmask.alpha += bmask.delta;

 				continue;

-			*bdst.red = MUL(fd, *bdst.red, t);

-			*bdst.grn = MUL(fd, *bdst.grn, t);

-			*bdst.blu = MUL(fd, *bdst.blu, t);

+			*bdst.red = CALC11(fd, *bdst.red, t);

+			*bdst.grn = CALC11(fd, *bdst.grn, t);

+			*bdst.blu = CALC11(fd, *bdst.blu, t);

 			bsrc.red += bsrc.delta;

 			bsrc.blu += bsrc.delta;

 			bsrc.grn += bsrc.delta;

@@ -813,7 +883,7 @@

 			bdst.grn += bdst.delta;

 		if(bdst.alpha != &ones){

-			*bdst.alpha = MUL(fd, *bdst.alpha, t);

+			*bdst.alpha = CALC11(fd, *bdst.alpha, t);

 			bdst.alpha += bdst.delta;

 		bmask.alpha += bmask.delta;

@@ -828,11 +898,11 @@

 	Buffer obdst;

 	int fs, sadelta;

 	int i, ma, da, q;

-	ulong s, t;

+	ulong t, t1;

 	obdst = bdst;

 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;

-	q = bsrc.delta == 4 && bdst.delta == 4;

+	q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);

 	for(i=0; i<dx; i++){

 		ma = *bmask.alpha;

@@ -841,15 +911,15 @@

 			da = 255-da;

 		fs = ma;

 		if(op != S)

-			fs = MUL(fs, da, t);

+			fs = CALC11(fs, da, t);

 		if(grey){

-			*bdst.grey = MUL(fs, *bsrc.grey, t);

+			*bdst.grey = CALC11(fs, *bsrc.grey, t);

 			bsrc.grey += bsrc.delta;

 			bdst.grey += bdst.delta;

 		}else{

 			if(q){

-				*bdst.rgba = MUL0123(fs, *bsrc.rgba, s, t);

+				*bdst.rgba = CALC41(fs, *bsrc.rgba, t, t1);

 				bsrc.rgba++;

 				bdst.rgba++;

 				bmask.alpha += bmask.delta;

@@ -856,9 +926,9 @@

 				bdst.alpha += bdst.delta;

 				continue;

-			*bdst.red = MUL(fs, *bsrc.red, t);

-			*bdst.grn = MUL(fs, *bsrc.grn, t);

-			*bdst.blu = MUL(fs, *bsrc.blu, t);

+			*bdst.red = CALC11(fs, *bsrc.red, t);

+			*bdst.grn = CALC11(fs, *bsrc.grn, t);

+			*bdst.blu = CALC11(fs, *bsrc.blu, t);

 			bsrc.red += bsrc.delta;

 			bsrc.blu += bsrc.delta;

 			bsrc.grn += bsrc.delta;

@@ -867,7 +937,7 @@

 			bdst.grn += bdst.delta;

 		if(bdst.alpha != &ones){

-			*bdst.alpha = MUL(fs, *bsrc.alpha, t);

+			*bdst.alpha = CALC11(fs, *bsrc.alpha, t);

 			bdst.alpha += bdst.delta;

 		bmask.alpha += bmask.delta;

@@ -882,11 +952,11 @@

 	Buffer obdst;

 	int fs, fd, sadelta;

 	int i, sa, ma, da, q;

-	ulong s, t, u, v;

+	ulong t, t1;

 	obdst = bdst;

 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;

-	q = bsrc.delta == 4 && bdst.delta == 4;

+	q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);

 	for(i=0; i<dx; i++){

 		sa = *bsrc.alpha;

@@ -893,24 +963,24 @@

 		ma = *bmask.alpha;

 		da = *bdst.alpha;

 		if(op == SatopD)

-			fs = MUL(ma, da, t);

+			fs = CALC11(ma, da, t);

 		else

-			fs = MUL(ma, 255-da, t);

+			fs = CALC11(ma, 255-da, t);

 		if(op == DoverS)

 			fd = 255;

 		else{

-			fd = MUL(sa, ma, t);

+			fd = CALC11(sa, ma, t);

 			if(op != DatopS)

 				fd = 255-fd;

 		if(grey){

-			*bdst.grey = MUL(fs, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);

+			*bdst.grey = CALC12(fs, *bsrc.grey, fd, *bdst.grey, t);

 			bsrc.grey += bsrc.delta;

 			bdst.grey += bdst.delta;

 		}else{

 			if(q){

-				*bdst.rgba = MUL0123(fs, *bsrc.rgba, s, t)+MUL0123(fd, *bdst.rgba, u, v);

+				*bdst.rgba = CALC42(fs, *bsrc.rgba, fd, *bdst.rgba, t, t1);

 				bsrc.rgba++;

 				bdst.rgba++;

 				bsrc.alpha += sadelta;

@@ -918,9 +988,9 @@

 				bdst.alpha += bdst.delta;

 				continue;

-			*bdst.red = MUL(fs, *bsrc.red, s)+MUL(fd, *bdst.red, t);

-			*bdst.grn = MUL(fs, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);

-			*bdst.blu = MUL(fs, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);

+			*bdst.red = CALC12(fs, *bsrc.red, fd, *bdst.red, t);

+			*bdst.grn = CALC12(fs, *bsrc.grn, fd, *bdst.grn, t);

+			*bdst.blu = CALC12(fs, *bsrc.blu, fd, *bdst.blu, t);

 			bsrc.red += bsrc.delta;

 			bsrc.blu += bsrc.delta;

 			bsrc.grn += bsrc.delta;

@@ -929,7 +999,7 @@

 			bdst.grn += bdst.delta;

 		if(bdst.alpha != &ones){

-			*bdst.alpha = MUL(fs, sa, s)+MUL(fd, da, t);

+			*bdst.alpha = CALC12(fs, sa, fd, da, t);

 			bdst.alpha += bdst.delta;

 		bmask.alpha += bmask.delta;

@@ -953,25 +1023,25 @@

 	Buffer obdst;

 	int fd, sadelta;

 	int i, sa, ma, q;

-	ulong s, t, u, v;

+	ulong t, t1;

 	USED(op);

 	obdst = bdst;

 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;

-	q = bsrc.delta == 4 && bdst.delta == 4;

+	q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);

 	for(i=0; i<dx; i++){

 		sa = *bsrc.alpha;

 		ma = *bmask.alpha;

-		fd = 255-MUL(sa, ma, t);

+		fd = 255-CALC11(sa, ma, t);

 		if(grey){

-			*bdst.grey = MUL(ma, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);

+			*bdst.grey = CALC12(ma, *bsrc.grey, fd, *bdst.grey, t);

 			bsrc.grey += bsrc.delta;

 			bdst.grey += bdst.delta;

 		}else{

 			if(q){

-				*bdst.rgba = MUL0123(ma, *bsrc.rgba, s, t)+MUL0123(fd, *bdst.rgba, u, v);

+				*bdst.rgba = CALC42(ma, *bsrc.rgba, fd, *bdst.rgba, t, t1);

 				bsrc.rgba++;

 				bdst.rgba++;

 				bsrc.alpha += sadelta;

@@ -978,9 +1048,9 @@

 				bmask.alpha += bmask.delta;

 				continue;

-			*bdst.red = MUL(ma, *bsrc.red, s)+MUL(fd, *bdst.red, t);

-			*bdst.grn = MUL(ma, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);

-			*bdst.blu = MUL(ma, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);

+			*bdst.red = CALC12(ma, *bsrc.red, fd, *bdst.red, t);

+			*bdst.grn = CALC12(ma, *bsrc.grn, fd, *bdst.grn, t);

+			*bdst.blu = CALC12(ma, *bsrc.blu, fd, *bdst.blu, t);

 			bsrc.red += bsrc.delta;

 			bsrc.blu += bsrc.delta;

 			bsrc.grn += bsrc.delta;

@@ -989,7 +1059,7 @@

 			bdst.grn += bdst.delta;

 		if(bdst.alpha != &ones){

-			*bdst.alpha = MUL(ma, sa, s)+MUL(fd, *bdst.alpha, t);

+			*bdst.alpha = CALC12(ma, sa, fd, *bdst.alpha, t);

 			bdst.alpha += bdst.delta;

 		bmask.alpha += bmask.delta;

@@ -1045,7 +1115,7 @@

 	Buffer obdst;

 	int fd;

 	int i, ma;

-	ulong s, t;

+	ulong t;

 	USED(op);

 	obdst = bdst;

@@ -1055,13 +1125,13 @@

 		fd = 255-ma;

 		if(grey){

-			*bdst.grey = MUL(ma, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);

+			*bdst.grey = CALC12(ma, *bsrc.grey, fd, *bdst.grey, t);

 			bsrc.grey += bsrc.delta;

 			bdst.grey += bdst.delta;

 		}else{

-			*bdst.red = MUL(ma, *bsrc.red, s)+MUL(fd, *bdst.red, t);

-			*bdst.grn = MUL(ma, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);

-			*bdst.blu = MUL(ma, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);

+			*bdst.red = CALC12(ma, *bsrc.red, fd, *bdst.red, t);

+			*bdst.grn = CALC12(ma, *bsrc.grn, fd, *bdst.grn, t);

+			*bdst.blu = CALC12(ma, *bsrc.blu, fd, *bdst.blu, t);

 			bsrc.red += bsrc.delta;

 			bsrc.blu += bsrc.delta;

 			bsrc.grn += bsrc.delta;

@@ -1070,7 +1140,7 @@

 			bdst.grn += bdst.delta;

 		if(bdst.alpha != &ones){

-			*bdst.alpha = ma+MUL(fd, *bdst.alpha, t);

+			*bdst.alpha = ma+CALC11(fd, *bdst.alpha, t);

 			bdst.alpha += bdst.delta;

 		bmask.alpha += bmask.delta;

@@ -1117,7 +1187,7 @@

 	Buffer obdst;

 	int fs, fd;

 	int i, ma, da, zero;

-	ulong s, t;

+	ulong t;

 	obdst = bdst;

 	zero = !(op&1);

@@ -1134,7 +1204,7 @@

 		if(grey){

 			if(ma)

-				*bdst.grey = MUL(fs, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);

+				*bdst.grey = CALC12(fs, *bsrc.grey, fd, *bdst.grey, t);

 			else if(zero)

 				*bdst.grey = 0;

 			bsrc.grey += bsrc.delta;

@@ -1141,9 +1211,9 @@

 			bdst.grey += bdst.delta;

 		}else{

 			if(ma){

-				*bdst.red = MUL(fs, *bsrc.red, s)+MUL(fd, *bdst.red, t);

-				*bdst.grn = MUL(fs, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);

-				*bdst.blu = MUL(fs, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);

+				*bdst.red = CALC12(fs, *bsrc.red, fd, *bdst.red, t);

+				*bdst.grn = CALC12(fs, *bsrc.grn, fd, *bdst.grn, t);

+				*bdst.blu = CALC12(fs, *bsrc.blu, fd, *bdst.blu, t);

 			else if(zero)

 				*bdst.red = *bdst.grn = *bdst.blu = 0;

@@ -1157,7 +1227,7 @@

 		bmask.alpha += bmask.delta;

 		if(bdst.alpha != &ones){

 			if(ma)

-				*bdst.alpha = fs+MUL(fd, da, t);

+				*bdst.alpha = fs+CALC11(fd, da, t);

 			else if(zero)

 				*bdst.alpha = 0;

 			bdst.alpha += bdst.delta;

@@ -1890,7 +1960,7 @@

 	default:

 		assert(0 /* boolcopyfn */);

-	return 0;

+	return nil;

/*

@@ -2353,7 +2423,7 @@

 	bx = -bsh-1;

 	ex = -bsh-1-dx;

-	bits = 0;

+	SET(bits);

 	v = par->sdval;

 	/* make little endian */

@@ -2457,7 +2527,6 @@

 	ulong bits;

 	int d, y;

-	uchar p[4];

 	if(val == DNofill)

 		return;

@@ -2471,11 +2540,6 @@

 	default:	/* 1, 2, 4, 8, 16, 32 */

 		for(d=i->depth; d<32; d*=2)

 			bits = (bits << d) | bits;

-		p[0] = bits;		/* make little endian */

-		p[1] = bits>>8;

-		p[2] = bits>>16;

-		p[3] = bits>>24;

-		bits = *(ulong*)p;

 		memsetl(wordaddr(i, i->r.min), bits, i->width*Dy(i->r));

 		break;

--

⑨