git: 9front

Download patch

ref: 2e4717b4e21ee49ef77271fed6e4ad6b38f532b1
parent: 8bbc6b260915019b8f31c3a22ae4c50f4e69653a
author: rodri <rgl@antares-labs.eu>
date: Thu Sep 4 15:49:57 EDT 2025

libmemdraw: add image correlation and other improvements

i replaced the bit-shifting in some of the pixel sampling routines for
a single SWAR multiplication, which should be faster in most of the
arches we support; although i've only tested it on amd64 and arm64.

also wrote memimagecorrelate() which allows us to correlate and
convolve images with a filter or kernel.  it's limited to a single
filter for all of the color channels, since we don't have support for
images with more than 8bpc yet, and we use 32-bit fixed-point numbers
for every kernel component.

we still had a decision outstanding for memaffinewarp() as to how to
control the filter it uses for the transformation.  i wasn't getting
good results when trying to integrate convolution into the sampler, so
i added a parameter to choose whether you want a smooth, higher quality
result or a faster but blocky one.  a more knowledgeable user can then
do their own post-processing with the correlation operator.

--- a/sys/include/memdraw.h
+++ b/sys/include/memdraw.h
@@ -151,8 +151,10 @@
 extern void	memarc(Memimage*, Point, int, int, int, Memimage*, Point, int, int, int);
 extern Rectangle	memlinebbox(Point, Point, int, int, int);
 extern int	memlineendsize(int);
-extern int	memaffinewarp(Memimage*, Rectangle, Memimage*, Point, Warp);
+extern int	memaffinewarp(Memimage*, Rectangle, Memimage*, Point, Warp, int);
 extern void	mkwarp(Warp, double[3][3]);
+extern Memimage*	allocmemimagekernel(double*, int, int, double);
+extern int	memimagecorrelate(Memimage*, Rectangle, Memimage*, Point, Memimage*);
 extern void	_memmkcmap(void);
 extern int	memimageinit(void);
 
--- a/sys/man/2/memdraw
+++ b/sys/man/2/memdraw
@@ -26,6 +26,8 @@
 memimagedraw,
 memaffinewarp,
 mkwarp,
+allocmemimagekernel,
+memimagecorrelate,
 drawclip,
 drawclipnorepl,
 memlinebbox,
@@ -138,8 +140,12 @@
 void	memimagedraw(Memimage *dst, Rectangle r, Memimage *src,
 	   Point sp, Memimage *mask, Point mp, Drawop op)
 int	memaffinewarp(Memimage *dst, Rectangle r, Memimage *src,
-	   Point sp, Warp w)
+	   Point sp, Warp w, int smooth)
 void	mkwarp(Warp w, double m[3][3])
+Memimage*	allocmemimagekernel(double *k, int dx, int dy,
+	   double denom)
+int	memimagecorrelate(Memimage *dst, Rectangle r, Memimage *src,
+	   Point sp, Memimage *k)
 .PP
 .ft L
 .nf
@@ -399,6 +405,10 @@
 and the
 .B repl
 bit, but applied to the affine mappings.
+If
+.B smooth
+is set, each sample will be interpolated for better quality, which
+will make the transformation slower.
 .PP
 .I Mkwarp
 converts a 3×3 row-major matrix of
@@ -413,6 +423,38 @@
 ignoring the last row which is always set to
 .B "[0 0 1]"
 for affine transformations.
+.PP
+.I Allocmemimagekernel
+takes
+.B k
+as a matrix with
+.B dx
+columns and
+.B dy
+rows, and turns it into a 32-bit depth
+.B Memimage
+with each pixel representing an element of the matrix as a fixed-point
+number—as required by
+.IR memimagecorrelate.
+Each of the elements will be divided by
+.B denom
+except when it's zero, where a sum of coefficients (Σkᵢ) will be
+precalculated and used instead.
+.PP
+.I Memimagecorrelate
+correlates image
+.B src
+with the filter
+.BR k ,
+and stores the result in the region of
+.B dst
+defined by
+.BR r .
+It also follows the conventions of
+.IR draw (2)
+regarding clipping and source image replication.
+The result is not normalized, but the same can be accomplished by
+normalizing the kernel (k/Σkᵢ) before calling this routine.
 .PP
 .I Drawclip
 takes the images involved in a draw operation,
--- /dev/null
+++ b/sys/src/libmemdraw/convolvetest.c
@@ -1,0 +1,258 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <draw.h>
+#include <memdraw.h>
+
+uvlong t0;
+
+void
+profbegin(void)
+{
+	char buf[128];
+	int fd;
+
+	snprint(buf, sizeof(buf), "/proc/%d/ctl", getpid());
+	fd = open(buf, OWRITE);
+	if(fd < 0)
+		sysfatal("open: %r");
+	fprint(fd, "profile\n");
+	close(fd);
+	t0 = nsec();
+}
+
+void
+profend(void)
+{
+	char buf[128];
+	double dt;
+
+	dt = (nsec() - t0)/1e6;
+	fprint(2, "dt: %fms\n", dt);
+
+	snprint(buf, sizeof(buf), "%d", getpid());
+	switch(fork()){
+	case -1:
+		abort();
+	case 0:
+		dup(2, 1);
+		print("tprof %s\n", buf);
+		execl("/bin/tprof", "tprof", buf, nil);
+		abort();
+	default:
+		free(wait());
+		break;
+	}
+}
+
+void
+initworkrects(Rectangle *wr, int nwr, Rectangle *fbr)
+{
+	int i, Δy;
+
+	wr[0] = *fbr;
+	Δy = Dy(wr[0])/nwr;
+	wr[0].max.y = wr[0].min.y + Δy;
+	for(i = 1; i < nwr; i++)
+		wr[i] = rectaddpt(wr[i-1], Pt(0,Δy));
+	if(wr[nwr-1].max.y < fbr->max.y)
+		wr[nwr-1].max.y = fbr->max.y;
+}
+
+void *
+erealloc(void *v, ulong n)
+{
+	void *nv;
+
+	nv = realloc(v, n);
+	if(nv == nil)
+		sysfatal("realloc: %r");
+	if(v == nil)
+		setmalloctag(nv, getcallerpc(&v));
+	else
+		setrealloctag(nv, getcallerpc(&v));
+	return nv;
+}
+
+void
+reversem(double *k, int len)
+{
+	double *e, t;
+
+	e = k + len;
+	while(k < e){
+		t = *k;
+		*k++ = *--e;
+		*e = t;
+	}
+}
+
+#define isspace(c)	((c) == ' ' || (c) == '\t')
+Memimage *
+readimagekernel(int fd, double denom, int reverse)
+{
+	Memimage *ik;
+	Biobuf *bin;
+	double *k, d;
+	int nk, dx0, dx, dy;
+	char *line;
+
+	k = nil;
+	nk = 0;
+	dx0 = dx = dy = 0;
+
+	bin = Bfdopen(fd, OREAD);
+	if(bin == nil)
+		sysfatal("Bfdopen: %r");
+
+	while((line = Brdstr(bin, '\n', 1)) != nil){
+		while(*line){
+			d = strtod(line, &line);
+			if(*line && !isspace(*line)){
+				free(k);
+				Bterm(bin);
+				werrstr("bad image kernel file");
+				return nil;
+			}
+			k = erealloc(k, (nk + 1)*sizeof(double));
+			k[nk++] = d;
+			dx++;
+		}
+		if(dy > 0 && dx != dx0){
+			free(k);
+			Bterm(bin);
+			werrstr("image kernel file has inconsistent dx");
+			return nil;
+		}
+		if(dy++ == 0)
+			dx0 = dx;
+		dx = 0;
+	}
+	Bterm(bin);
+
+	if(reverse)
+		reversem(k, dx0*dy);
+
+	ik = allocmemimagekernel(k, dx0, dy, denom);
+	free(k);
+	if(ik == nil){
+		werrstr("allocmemimagekernel: %r");
+		return nil;
+	}
+	return ik;
+}
+
+void
+usage(void)
+{
+	fprint(2, "usage: %s [-cRp] kernel [denom]\n", argv0);
+	exits("usage");
+}
+
+void
+main(int argc, char *argv[])
+{
+	Memimage *dst, *src;
+	Memimage *ikern;
+	Rectangle dr, *wr;
+	int convolve, dorepl, parallel, fd, nproc, i;
+	char *nprocs, *kernf;
+	double denom;
+
+	kernf = nil;
+	denom = 0;
+	convolve = 1;
+	dorepl = 0;
+	parallel = 0;
+	ARGBEGIN{
+	case 'c':
+		convolve = 0;	/* correlate */
+		break;
+	case 'R':
+		dorepl++;
+		break;
+	case 'p':
+		parallel++;
+		break;
+	default:
+		usage();
+	}ARGEND;
+	switch(argc){
+	case 2:
+		denom = strtod(argv[1], nil);
+		/* fallthrough */
+	case 1:
+		kernf = argv[0];
+		break;
+	default:
+		usage();
+	}
+
+	if(memimageinit() != 0)
+		sysfatal("memimageinit: %r");
+
+	fd = open(kernf, OREAD);
+	if(fd < 0)
+		sysfatal("open: %r");
+	ikern = readimagekernel(fd, denom, convolve);
+	if(ikern == nil)
+		sysfatal("readimagekernel: %r");
+	close(fd);
+
+	src = readmemimage(0);
+	if(dorepl)
+		src->flags |= Frepl;
+
+	dr = src->r;
+	dst = allocmemimage(dr, src->chan);
+	memfillcolor(dst, DTransparent);
+
+	profbegin();
+	if(parallel){
+		nprocs = getenv("NPROC");
+		if(nprocs == nil || (nproc = strtoul(nprocs, nil, 10)) < 2)
+			nproc = 1;
+		free(nprocs);
+
+		wr = malloc(nproc*sizeof(Rectangle));
+		initworkrects(wr, nproc, &dr);
+
+		for(i = 0; i < nproc; i++){
+			switch(rfork(RFPROC|RFMEM)){
+			case -1:
+				sysfatal("rfork: %r");
+			case 0:
+				if(memimagecorrelate(dst, wr[i], src, src->r.min, ikern) < 0)
+					fprint(2, "[%d] memimagecorrelate: %r\n", getpid());
+				exits(nil);
+			}
+		}
+		while(waitpid() != -1)
+			;
+
+		free(wr);
+	}else{
+		if(memimagecorrelate(dst, dr, src, src->r.min, ikern) < 0)
+			sysfatal("memimagecorrelate: %r");
+
+//		dr = rectaddpt(Rect(0,0,Dx(dst->r)/2,Dy(dst->r)/2), dst->r.min);
+//		dst->clipr = dr;
+//		if(memimagecorrelate(dst, dr, src, src->r.min, ikern) < 0)
+//			sysfatal("memimagecorrelate: %r");
+//		dr = rectaddpt(Rect(Dx(dst->r)/2+1,0,Dx(dst->r),Dy(dst->r)/2), dst->r.min);
+//		dst->clipr = dst->r;
+//		if(memimagecorrelate(dst, dr, src, src->r.min, ikern) < 0)
+//			sysfatal("memimagecorrelate: %r");
+//		dr = rectaddpt(Rect(0,Dy(dst->r)/2+1,Dx(dst->r)/2,Dy(dst->r)), dst->r.min);
+//		if(memimagecorrelate(dst, dr, src, src->r.min, ikern) < 0)
+//			sysfatal("memimagecorrelate: %r");
+//		dr = rectaddpt(Rect(Dx(dst->r)/2+1,Dy(dst->r)/2+1,Dx(dst->r),Dy(dst->r)), dst->r.min);
+//		dst->clipr = dr;
+//		if(memimagecorrelate(dst, dr, src, src->r.min, ikern) < 0)
+//			sysfatal("memimagecorrelate: %r");
+	}
+	profend();
+	writememimage(1, dst);
+
+	exits(nil);
+}
--- a/sys/src/libmemdraw/warp.c
+++ b/sys/src/libmemdraw/warp.c
@@ -9,12 +9,17 @@
 /* 25.7 fixed-point number operations */
 
 #define FMASK		((1<<7) - 1)
+#define flt2fix(n)	((long)((n)*(1<<7) + ((n) < 0? -0.5: 0.5)))
+#define fix2flt(n)	((n)/128.0)
 #define int2fix(n)	((vlong)(n)<<7)
 #define fix2int(n)	((n)>>7)
 #define fixmul(a,b)	((vlong)(a)*(vlong)(b) >> 7)
+#define fixdiv(a,b)	(((vlong)(a) << 7)/(vlong)(b))
 #define fixfrac(n)	((n)&FMASK)
 #define lerp(a,b,t)	((a) + ((((b) - (a))*(t))>>7))
 
+#define clamp(a,b,c)	((a)<(b)?(b):((a)>(c)?(c):(a)))
+
 typedef struct Sampler Sampler;
 typedef struct Blitter Blitter;
 
@@ -26,7 +31,9 @@
 	int bpl;
 	int cmask;
 	long Δx, Δy;
-	ulong (*getpixel)(Sampler*, Point);
+	Memimage *k;			/* filtering kernel */
+	Point kcp;			/* kernel center point */
+	ulong (*fn)(Sampler*, Point);
 };
 
 struct Blitter
@@ -35,11 +42,34 @@
 	uchar *a;
 	int bpl;
 	int cmask;
-	void (*putpixel)(Blitter*, Point, ulong);
+	void (*fn)(Blitter*, Point, ulong);
 };
 
+static void *getsampfn(ulong);
+static void *getblitfn(ulong);
+
+static void
+initsampler(Sampler *s, Memimage *i)
+{
+	s->i = i;
+	s->a = i->data->bdata + i->zero;
+	s->bpl = sizeof(ulong)*i->width;
+	s->cmask = (1ULL << i->depth) - 1;
+	s->fn = getsampfn(i->chan);
+}
+
+static void
+initblitter(Blitter *b, Memimage *i)
+{
+	b->i = i;
+	b->a = i->data->bdata + i->zero;
+	b->bpl = sizeof(ulong)*i->width;
+	b->cmask = (1ULL << i->depth) - 1;
+	b->fn = getblitfn(i->chan);
+}
+
 static Point
-fix_xform(Point p, Warp m)
+xform(Point p, Warp m)
 {
 	return (Point){
 		fixmul(p.x, m[0][0]) + fixmul(p.y, m[0][1]) + m[0][2],
@@ -57,8 +87,7 @@
 	npack = 8;
 	off = pt.x % npack;
 	v = p[0] >> (npack-1-off) & 0x1;
-	v *= 0xFFFFFFFF;
-	return v|0xFF;
+	return v*0xFFFFFF00 | 0xFF;
 }
 
 static ulong
@@ -71,9 +100,7 @@
 	npack = 8/2;
 	off = pt.x % npack;
 	v = p[0] >> 2*(npack-1-off) & 0x3;
-	v |= v<<2;
-	v |= v<<4;
-	return (v<<24)|(v<<16)|(v<<8)|0xFF;
+	return v*0x55555500 | 0xFF;
 }
 
 static ulong
@@ -86,32 +113,27 @@
 	npack = 8/4;
 	off = pt.x % npack;
 	v = p[0] >> 4*(npack-1-off) & 0xF;
-	v |= v<<4;
-	return (v<<24)|(v<<16)|(v<<8)|0xFF;
+	return v*0x11111100 | 0xFF;
 }
 
 static ulong
 getpixel_k8(Sampler *s, Point pt)
 {
-	uchar *p, v;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x;
-	v = p[0];
-	return (v<<24)|(v<<16)|(v<<8)|0xFF;
+	return p[0]*0x01010100 | 0xFF;
 }
 
 static ulong
 getpixel_m8(Sampler *s, Point pt)
 {
-	uchar *p, m, r, g, b;
+	uchar *p, m;
 
 	p = s->a + pt.y*s->bpl + pt.x;
 	m = p[0];
 	p = s->i->cmap->cmap2rgb+3*m;
-	r = p[0];
-	g = p[1];
-	b = p[2];
-	return (r<<24)|(g<<16)|(b<<8)|0xFF;
+	return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|0xFF;
 }
 
 static ulong
@@ -149,88 +171,64 @@
 static ulong
 getpixel_r8g8b8(Sampler *s, Point pt)
 {
-	uchar *p, r, g, b;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x*3;
-	b = p[0];
-	g = p[1];
-	r = p[2];
-	return (r<<24)|(g<<16)|(b<<8)|0xFF;
+	return (p[2]<<24)|(p[1]<<16)|(p[0]<<8)|0xFF;
 }
 
 static ulong
 getpixel_r8g8b8a8(Sampler *s, Point pt)
 {
-	uchar *p, r, g, b, a;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x*4;
-	a = p[0];
-	b = p[1];
-	g = p[2];
-	r = p[3];
-	return (r<<24)|(g<<16)|(b<<8)|a;
+	return (p[3]<<24)|(p[2]<<16)|(p[1]<<8)|p[0];
 }
 
 static ulong
 getpixel_a8r8g8b8(Sampler *s, Point pt)
 {
-	uchar *p, r, g, b, a;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x*4;
-	b = p[0];
-	g = p[1];
-	r = p[2];
-	a = p[3];
-	return (r<<24)|(g<<16)|(b<<8)|a;
+	return (p[2]<<24)|(p[1]<<16)|(p[0]<<8)|p[3];
 }
 
 static ulong
 getpixel_x8r8g8b8(Sampler *s, Point pt)
 {
-	uchar *p, r, g, b;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x*4;
-	b = p[0];
-	g = p[1];
-	r = p[2];
-	return (r<<24)|(g<<16)|(b<<8)|0xFF;
+	return (p[2]<<24)|(p[1]<<16)|(p[0]<<8)|0xFF;
 }
 
 static ulong
 getpixel_b8g8r8(Sampler *s, Point pt)
 {
-	uchar *p, r, g, b;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x*3;
-	r = p[0];
-	g = p[1];
-	b = p[2];
-	return (r<<24)|(g<<16)|(b<<8)|0xFF;
+	return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|0xFF;
 }
 
 static ulong
 getpixel_a8b8g8r8(Sampler *s, Point pt)
 {
-	uchar *p, r, g, b, a;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x*4;
-	r = p[0];
-	g = p[1];
-	b = p[2];
-	a = p[3];
-	return (r<<24)|(g<<16)|(b<<8)|a;
+	return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
 }
 
 static ulong
 getpixel_x8b8g8r8(Sampler *s, Point pt)
 {
-	uchar *p, r, g, b;
+	uchar *p;
 
 	p = s->a + pt.y*s->bpl + pt.x*4;
-	r = p[0];
-	g = p[1];
-	b = p[2];
-	return (r<<24)|(g<<16)|(b<<8)|0xFF;
+	return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|0xFF;
 }
 
 static ulong
@@ -391,12 +389,11 @@
 {
 	uchar *p, r, g, b, m;
 
-	p = blt->a + dp.y*blt->bpl + dp.x;
-
 	r = rgba>>24;
 	g = rgba>>16;
 	b = rgba>>8;
 	m = RGB2K(r,g,b);
+	p = blt->a + dp.y*blt->bpl + dp.x;
 	p[0] = m;
 }
 
@@ -405,12 +402,11 @@
 {
 	uchar *p, r, g, b, m;
 
-	p = blt->a + dp.y*blt->bpl + dp.x;
-
 	r = rgba>>24;
 	g = rgba>>16;
 	b = rgba>>8;
 	m = blt->i->cmap->rgb2cmap[(r>>4)*256+(g>>4)*16+(b>>4)];
+	p = blt->a + dp.y*blt->bpl + dp.x;
 	p[0] = m;
 }
 
@@ -420,7 +416,6 @@
 	uchar *p, r, g, b;
 	ushort v;
 
-	p = blt->a + dp.y*blt->bpl + dp.x*2;
 	r = rgba>>24;
 	g = rgba>>16;
 	b = rgba>>8;
@@ -427,6 +422,7 @@
 	v = r>>(8-5);
 	v = (v<<5)|(g>>(8-5));
 	v = (v<<5)|(b>>(8-5));
+	p = blt->a + dp.y*blt->bpl + dp.x*2;
 	p[0] = v;
 	p[1] = v>>8;
 }
@@ -437,7 +433,6 @@
 	uchar *p, r, g, b;
 	ushort v;
 
-	p = blt->a + dp.y*blt->bpl + dp.x*2;
 	r = rgba>>24;
 	g = rgba>>16;
 	b = rgba>>8;
@@ -444,6 +439,7 @@
 	v = r>>(8-5);
 	v = (v<<6)|(g>>(8-6));
 	v = (v<<5)|(b>>(8-5));
+	p = blt->a + dp.y*blt->bpl + dp.x*2;
 	p[0] = v;
 	p[1] = v>>8;
 }
@@ -589,7 +585,7 @@
 }
 
 static void *
-initsampfn(ulong chan)
+getsampfn(ulong chan)
 {
 	switch(chan){
 	case GREY1: return getpixel_k1;
@@ -611,7 +607,7 @@
 }
 
 static void *
-initblitfn(ulong chan)
+getblitfn(ulong chan)
 {
 	switch(chan){
 	case GREY1: return putpixel_k1;
@@ -637,10 +633,10 @@
 {
 	if(p.x >= s->r.min.x && p.x < s->r.max.x
 	&& p.y >= s->r.min.y && p.y < s->r.max.y)
-		return s->getpixel(s, p);
+		return s->fn(s, p);
 	else if(s->i->flags & Frepl){
 		p = drawrepl(s->r, p);
-		return s->getpixel(s, p);
+		return s->fn(s, p);
 	}
 	/* edge handler: constant */
 	return 0;
@@ -683,16 +679,44 @@
 }
 
 static ulong
-nearest(Sampler *s, Point p)
+correlate(Sampler *s, Point p)
 {
-	return sample1(s, p);
-}
+	Point sp;
+	int r, g, b, a, Σr, Σg, Σb, Σa;
+	long *kp, kv;
+	ulong v;
 
-static ulong (*sample)(Sampler*, Point) = bilinear;
+	kp = (long*)(s->k->data->bdata + s->k->zero);
+	Σr = Σg = Σb = Σa = 0;
 
+	for(sp.y = 0; sp.y < s->k->r.max.y; sp.y++)
+	for(sp.x = 0; sp.x < s->k->r.max.x; sp.x++){
+		v = sample1(s, addpt(p, subpt(sp, s->kcp)));
+		r = v>>24 & 0xFF; r = int2fix(r);
+		g = v>>16 & 0xFF; g = int2fix(g);
+		b = v>>8  & 0xFF; b = int2fix(b);
+		a = v>>0  & 0xFF; a = int2fix(a);
+
+		kv = kp[sp.y*s->k->r.max.x + sp.x];
+		r = fixmul(r, kv);
+		g = fixmul(g, kv);
+		b = fixmul(b, kv);
+		a = fixmul(a, kv);
+
+		Σr += r; Σg += g; Σb += b; Σa += a;
+	}
+	r = clamp(fix2int(Σr), 0, 0xFF);
+	g = clamp(fix2int(Σg), 0, 0xFF);
+	b = clamp(fix2int(Σb), 0, 0xFF);
+	a = clamp(fix2int(Σa), 0, 0xFF);
+
+	return r<<24|g<<16|b<<8|a;
+}
+
 int
-memaffinewarp(Memimage *d, Rectangle r, Memimage *s, Point sp0, Warp m)
+memaffinewarp(Memimage *d, Rectangle r, Memimage *s, Point sp0, Warp m, int smooth)
 {
+	ulong (*sample)(Sampler*, Point) = sample1;
 	Sampler samp;
 	Blitter blit;
 	Point sp, dp, p2, p2₀;
@@ -708,17 +732,11 @@
 	if(rectclip(&samp.r, s->r) == 0)
 		return 0;
 
-	samp.i = s;
-	samp.a = s->data->bdata + s->zero;
-	samp.bpl = sizeof(ulong)*s->width;
-	samp.cmask = (1ULL << s->depth) - 1;
-	samp.getpixel = initsampfn(s->chan);
+	if(smooth)
+		sample = bilinear;
 
-	blit.i = d;
-	blit.a = d->data->bdata + d->zero;
-	blit.bpl = sizeof(ulong)*d->width;
-	blit.cmask = (1ULL << d->depth) - 1;
-	blit.putpixel = initblitfn(d->chan);
+	initsampler(&samp, s);
+	initblitter(&blit, d);
 
 	/*
 	 * incremental affine warping technique from:
@@ -726,7 +744,7 @@
 	 * 	Lee, S., Lee, GG., Jang, E.S., Kim, WY,
 	 * 	Intelligent Computing.  ICIC 2006. LNCS, vol 4113.
 	 */
-	p2 = p2₀ = fix_xform((Point){int2fix(r.min.x - dr.min.x), int2fix(r.min.y - dr.min.y)}, m);
+	p2 = p2₀ = xform((Point){int2fix(r.min.x - dr.min.x) + (1<<6), int2fix(r.min.y - dr.min.y) + (1<<6)}, m);
 	for(dp.y = r.min.y; dp.y < r.max.y; dp.y++){
 	for(dp.x = r.min.x; dp.x < r.max.x; dp.x++){
 		samp.Δx = fixfrac(p2.x);
@@ -736,7 +754,7 @@
 		sp.y = sp0.y + fix2int(p2.y);
 
 		c = sample(&samp, sp);
-		blit.putpixel(&blit, dp, c);
+		blit.fn(&blit, dp, c);
 
 		p2.x += m[0][0];
 		p2.y += m[1][0];
@@ -743,6 +761,89 @@
 	}
 		p2.x = p2₀.x += m[0][1];
 		p2.y = p2₀.y += m[1][1];
+	}
+	return 0;
+}
+
+static double
+coeffsum(double *m, int len)
+{
+	double *e, Σ;
+
+	e = m + len;
+	Σ = 0;
+	while(m < e)
+		Σ += *m++;
+	return Σ;
+}
+
+Memimage *
+allocmemimagekernel(double *k, int dx, int dy, double denom)
+{
+	Memimage *ik;
+	long *kern;
+	double t;
+	int x, y;
+
+	ik = allocmemimage(Rect(0,0,dx,dy), RGBA32); /* any 32-bit depth chan would do */
+	if(ik == nil){
+		werrstr("could not allocate image kernel: %r");
+		return nil;
+	}
+
+	if(denom == 0)
+		denom = coeffsum(k, dx*dy);
+	denom = denom? 1/denom: 1;
+
+	kern = (long*)(ik->data->bdata + ik->zero);
+	for(y = 0; y < dy; y++)
+	for(x = 0; x < dx; x++){
+		t = k[y*dx + x];
+		t *= denom;
+		kern[y*dx + x] = flt2fix(t);
+	}
+
+	return ik;
+}
+
+int
+memimagecorrelate(Memimage *d, Rectangle r, Memimage *s, Point sp0, Memimage *k)
+{
+	Sampler samp;
+	Blitter blit;
+	Point sp, dp;
+	Rectangle dr;
+	ulong c;
+
+	dr = d->clipr;
+	rectclip(&dr, d->r);
+	if(rectclip(&r, dr) == 0)
+		return 0;
+
+	samp.r = s->clipr;
+	if(rectclip(&samp.r, s->r) == 0)
+		return 0;
+
+	initsampler(&samp, s);
+	initblitter(&blit, d);
+
+	if(k == nil){
+		werrstr("kernel image is nil");
+		return -1;
+	}
+	if(k->depth != 32){
+		werrstr("kernel image depth is not 32");
+		return -1;
+	}
+	samp.k = k;
+	samp.kcp = Pt(Dx(k->r)/2, Dy(k->r)/2);
+
+	for(dp.y = r.min.y; dp.y < r.max.y; dp.y++)
+	for(dp.x = r.min.x; dp.x < r.max.x; dp.x++){
+		sp.x = sp0.x + dp.x - dr.min.x;
+		sp.y = sp0.y + dp.y - dr.min.y;
+		c = correlate(&samp, sp);
+		blit.fn(&blit, dp, c);
 	}
 	return 0;
 }
--- a/sys/src/libmemdraw/warptest.c
+++ b/sys/src/libmemdraw/warptest.c
@@ -85,7 +85,7 @@
 void
 usage(void)
 {
-	fprint(2, "usage: %s [-s sx sy] [-t tx ty] [-r θ]\n", argv0);
+	fprint(2, "usage: %s [-SRp] [-s sx sy] [-t tx ty] [-r θ]\n", argv0);
 	exits("usage");
 }
 
@@ -96,12 +96,13 @@
 	Warp w;
 	Rectangle dr, *wr;
 	double sx, sy, tx, ty, θ, c, s;
-	int dorepl, parallel, nproc, i;
+	int smooth, dorepl, parallel, nproc, i;
 	char *nprocs;
 
 	sx = sy = 1;
 	tx = ty = 0;
 	θ = 0;
+	smooth = 0;
 	dorepl = 0;
 	parallel = 0;
 	ARGBEGIN{
@@ -116,6 +117,9 @@
 	case 'r':
 		θ = strtod(EARGF(usage()), nil)*DEG;
 		break;
+	case 'S':
+		smooth++;
+		break;
 	case 'R':
 		dorepl++;
 		break;
@@ -175,7 +179,7 @@
 			case -1:
 				sysfatal("rfork: %r");
 			case 0:
-				if(memaffinewarp(dst, wr[i], src, src->r.min, w) < 0)
+				if(memaffinewarp(dst, wr[i], src, src->r.min, w, smooth) < 0)
 					fprint(2, "[%d] memaffinewarp: %r\n", getpid());
 				exits(nil);
 			}
@@ -185,23 +189,23 @@
 
 		free(wr);
 	}else{
-		if(memaffinewarp(dst, dr, src, src->r.min, w) < 0)
+		if(memaffinewarp(dst, dr, src, src->r.min, w, smooth) < 0)
 			sysfatal("memaffinewarp: %r");
 
 //		dr = rectaddpt(Rect(0,0,Dx(dst->r)/2,Dy(dst->r)/2), dst->r.min);
 //		dst->clipr = dr;
-//		if(memaffinewarp(dst, dr, src, src->r.min, w) < 0)
+//		if(memaffinewarp(dst, dr, src, src->r.min, w, smooth) < 0)
 //			sysfatal("memaffinewarp: %r");
 //		dr = rectaddpt(Rect(Dx(dst->r)/2+1,0,Dx(dst->r),Dy(dst->r)/2), dst->r.min);
 //		dst->clipr = dst->r;
-//		if(memaffinewarp(dst, dr, src, src->r.min, w) < 0)
+//		if(memaffinewarp(dst, dr, src, src->r.min, w, smooth) < 0)
 //			sysfatal("memaffinewarp: %r");
 //		dr = rectaddpt(Rect(0,Dy(dst->r)/2+1,Dx(dst->r)/2,Dy(dst->r)), dst->r.min);
-//		if(memaffinewarp(dst, dr, src, src->r.min, w) < 0)
+//		if(memaffinewarp(dst, dr, src, src->r.min, w, smooth) < 0)
 //			sysfatal("memaffinewarp: %r");
 //		dr = rectaddpt(Rect(Dx(dst->r)/2+1,Dy(dst->r)/2+1,Dx(dst->r),Dy(dst->r)), dst->r.min);
 //		dst->clipr = dr;
-//		if(memaffinewarp(dst, dr, src, src->r.min, w) < 0)
+//		if(memaffinewarp(dst, dr, src, src->r.min, w, smooth) < 0)
 //			sysfatal("memaffinewarp: %r");
 	}
 	profend();
--