code: plan9front

ref: 85ca0c9adc0d507b17f85b6de5227f533d9fb0b6
dir: /sys/src/9/pc/etherm10g.c/

View raw version
/*
 * myricom 10g-pcie-8a 10 Gb ethernet driver
 * © 2007 erik quanstrom, coraid
 *
 * the card is big endian.
 * we use uvlong rather than uintptr to hold addresses so that
 * we don't get "warning: stupid shift" on 32-bit architectures.
 */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/pci.h"
#include "../port/error.h"
#include "../port/netif.h"
#include "../port/etherif.h"

#ifndef KiB
#define KiB		1024u			/* Kibi 0x0000000000000400 */
#define MiB		1048576u		/* Mebi 0x0000000000100000 */
#endif /* KiB */

#define	dprint(...)	if(debug) print(__VA_ARGS__)
#define	pcicapdbg(...)
#define malign(n)	mallocalign((n), 4*KiB, 0, 0)

#include "etherm10g2k.i"
#include "etherm10g4k.i"

static int 	debug		= 0;
static char	Etimeout[]	= "timeout";

enum {
	Epromsz	= 256,
	Maxslots= 1024,
	Align	= 4096,
	Maxmtu	= 9000,
	Noconf	= 0xffffffff,

	Fwoffset= 1*MiB,
	Cmdoff	= 0xf80000,	/* command port offset */
	Fwsubmt	= 0xfc0000,	/* firmware submission command port offset */
	Rdmaoff	= 0xfc01c0,	/* rdma command port offset */
};

enum {
	CZero,
	Creset,
	Cversion,

	CSintrqdma,	/* issue these before Cetherup */
	CSbigsz,	/* in bytes bigsize = 2^n */
	CSsmallsz,

	CGsendoff,
	CGsmallrxoff,
	CGbigrxoff,
	CGirqackoff,
	CGirqdeassoff,
	CGsendrgsz,
	CGrxrgsz,

	CSintrqsz,	/* 2^n */
	Cetherup,	/* above parameters + mtu/mac addr must be set first. */
	Cetherdn,

	CSmtu,		/* below may be issued live */
	CGcoaloff,	/* in µs */
	CSstatsrate,	/* in µs */
	CSstatsdma,

	Cpromisc,
	Cnopromisc,
	CSmac,

	Cenablefc,
	Cdisablefc,

	Cdmatest,	/* address in d[0-1], d[2]=length */

	Cenableallmc,
	Cdisableallmc,

	CSjoinmc,
	CSleavemc,
	Cleaveallmc,

	CSstatsdma2,	/* adds (unused) multicast stats */
};

typedef union {
	uint	i[2];
	uchar	c[8];
} Cmd;

typedef ulong Slot;
typedef struct {
	ushort	cksum;
	ushort	len;
} Slotparts;

enum {
	SFsmall	= 1,
	SFfirst	= 2,
	SFalign	= 4,
	SFnotso	= 16,
};

typedef struct {
	ulong	high;
	ulong	low;
	ushort	hdroff;
	ushort	len;
	uchar	pad;
	uchar	nrdma;
	uchar	chkoff;
	uchar	flags;
} Send;

typedef struct {
	QLock;
	Send	*lanai;		/* tx ring (cksum+len in lanai memory) */
	Send	*host;		/* tx ring (data in our memory) */
	Block	**bring;
//	uchar	*wcfifo;	/* what the heck is a w/c fifo? */
	int	size;		/* of buffers in the z8's memory */
	ulong	segsz;
	uint	n;		/* rxslots */
	uint	m;		/* mask; rxslots must be a power of two */
	uint	i;		/* number of segments (not frames) queued */
	uint	cnt;		/* number of segments sent by the card */

	ulong	npkt;
	vlong	nbytes;
} Tx;

typedef struct {
	Lock;
	Block	*head;
	uint	size;		/* buffer size of each block */
	uint	n;		/* n free buffers */
	uint	cnt;
} Bpool;

static Bpool	smpool 	= { .size = 128, };
static Bpool	bgpool	= { .size = Maxmtu, };

typedef struct {
	Bpool	*pool;		/* free buffers */
	ulong	*lanai;		/* rx ring; we have no permanent host shadow */
	Block	**host;		/* called "info" in myricom driver */
//	uchar	*wcfifo;	/* cmd submission fifo */
	uint	m;
	uint	n;		/* rxslots */
	uint	i;
	uint	cnt;		/* number of buffers allocated (lifetime) */
	uint	allocfail;
} Rx;

/* dma mapped.  unix network byte order. */
typedef struct {
	uchar	txcnt[4];
	uchar	linkstat[4];
	uchar	dlink[4];
	uchar	derror[4];
	uchar	drunt[4];
	uchar	doverrun[4];
	uchar	dnosm[4];
	uchar	dnobg[4];
	uchar	nrdma[4];
	uchar	txstopped;
	uchar	down;
	uchar	updated;
	uchar	valid;
} Stats;

enum {
	Detached,
	Attached,
	Runed,
};

typedef struct {
	Slot 	*entry;
	uvlong	busaddr;
	uint	m;
	uint	n;
	uint	i;
} Done;

typedef struct Ctlr Ctlr;
typedef struct Ctlr {
	QLock;
	uvlong	port;
	Pcidev*	pcidev;
	Ctlr*	next;
	int	state;
	int	kprocs;
	int	active;
	int	id;		/* do we need this? */

	uchar	ra[Eaddrlen];

	int	ramsz;
	uchar	*ram;

	ulong	*irqack;
	ulong	*irqdeass;
	ulong	*coal;

	char	eprom[Epromsz];
	ulong	serial;		/* unit serial number */

	QLock	cmdl;
	Cmd	*cmd;		/* address of command return */
	uvlong	cprt;		/* bus address of command */

	uvlong	boot;		/* boot address */

	Done	done;
	Tx	tx;
	Rx	sm;
	Rx	bg;
	Stats	*stats;
	uvlong	statsprt;

	Rendez	rxrendez;
	Rendez	txrendez;

	int	msi;
	ulong	linkstat;
	ulong	nrdma;
} Ctlr;

static Ctlr 	*ctlrs;

enum {
	PcieAERC = 1,
	PcieVC,
	PcieSNC,
	PciePBC,
};

enum {
	AercCCR	= 0x18,		/* control register */
};

enum {
	PcieCTL	= 8,
	PcieLCR	= 12,
	PcieMRD	= 0x7000,	/* maximum read size */
};

static int
pcicap(Pcidev *p, int cap)
{
	int i, c, off;

	pcicapdbg("pcicap: %x:%d\n", p->vid, p->did);
	off = 0x34;			/* 0x14 for cardbus */
	for(i = 48; i--; ){
		pcicapdbg("\t" "loop %x\n", off);
		off = pcicfgr8(p, off);
		pcicapdbg("\t" "pcicfgr8 %x\n", off);
		if(off < 0x40)
			break;
		off &= ~3;
		c = pcicfgr8(p, off);
		pcicapdbg("\t" "pcicfgr8 %x\n", c);
		if(c == 0xff)
			break;
		if(c == cap)
			return off;
		off++;
	}
	return 0;
}

/*
 * this function doesn't work because pcicgr32 doesn't have access
 * to the pcie extended configuration space.
 */
static int
pciecap(Pcidev *p, int cap)
{
	uint off, i;

	off = 0x100;
	while(((i = pcicfgr32(p, off)) & 0xffff) != cap){
		off = i >> 20;
		print("m10g: pciecap offset = %ud",  off);
		if(off < 0x100 || off >= 4*KiB - 1)
			return 0;
	}
	print("m10g: pciecap found = %ud",  off);
	return off;
}

static int
setpcie(Pcidev *p)
{
	int off;

	/* set 4k writes */
	off = pcicap(p, PciCapPCIe);
	if(off < 64)
		return -1;
	off += PcieCTL;
	pcicfgw16(p, off, (pcicfgr16(p, off) & ~PcieMRD) | 5<<12);
	return 0;
}

static int
whichfw(Pcidev *p)
{
	char *s;
	int i, off, lanes, ecrc;
	ulong cap;

	/* check the number of configured lanes. */
	off = pcicap(p, PciCapPCIe);
	if(off < 64)
		return -1;
	off += PcieLCR;
	cap = pcicfgr16(p, off);
	lanes = (cap>>4) & 0x3f;

	/* check AERC register.  we need it on.  */
	off = pciecap(p, PcieAERC);
	print("; offset %d returned\n", off);
	cap = 0;
	if(off != 0){
		off += AercCCR;
		cap = pcicfgr32(p, off);
		print("m10g: %lud cap\n", cap);
	}
	ecrc = (cap>>4) & 0xf;
	/* if we don't like the aerc, kick it here. */

	print("m10g: %d lanes; ecrc=%d; ", lanes, ecrc);
	if(s = getconf("myriforce")){
		i = strtol(s, 0, 0);
		if(i != 4*KiB || i != 2*KiB)
			i = 2*KiB;
		print("fw = %d [forced]\n", i);
		return i;
	}
	if(lanes <= 4)
		print("fw = 4096 [lanes]\n");
	else if(ecrc & 10)
		print("fw = 4096 [ecrc set]\n");
	else
		print("fw = 4096 [default]\n");
	return 4*KiB;
}

static int
parseeprom(Ctlr *c)
{
	int i, j, k, l, bits;
	char *s;

	dprint("m10g eprom:\n");
	s = c->eprom;
	bits = 3;
	for(i = 0; s[i] && i < Epromsz; i++){
		l = strlen(s+i);
		dprint("\t%s\n", s+i);
		if(strncmp(s+i, "MAC=", 4) == 0 && l == 4+12+5){
			bits ^= 1;
			j = i + 4;
			for(k = 0; k < 6; k++)
				c->ra[k] = strtoul(s+j+3*k, 0, 16);
		}else if(strncmp(s+i, "SN=", 3) == 0){
			bits ^= 2;
			c->serial = strtoul(s+i+3, 0, 10);
		}
		i += l;
	}
	if(bits)
		return -1;
	return 0;
}

static ushort
pbit16(ushort i)
{
	ushort j;
	uchar *p;

	p = (uchar*)&j;
	p[1] = i;
	p[0] = i>>8;
	return j;
}

static ushort
gbit16(uchar i[2])
{
	ushort j;

	j  = i[1];
	j |= i[0]<<8;
	return j;
}

static ulong
pbit32(ulong i)
{
	ulong j;
	uchar *p;

	p = (uchar*)&j;
	p[3] = i;
	p[2] = i>>8;
	p[1] = i>>16;
	p[0] = i>>24;
	return j;
}

static ulong
gbit32(uchar i[4])
{
	ulong j;

	j  = i[3];
	j |= i[2]<<8;
	j |= i[1]<<16;
	j |= i[0]<<24;
	return j;
}

static void
prepcmd(ulong *cmd, int i)
{
	while(i-- > 0)
		cmd[i] = pbit32(cmd[i]);
}

/*
 * the command looks like this (int 32bit integers)
 * cmd type
 * addr (low)
 * addr (high)
 * pad (used for dma testing)
 * response (high)
 * response (low)
 * 40 byte = 5 int pad.
 */

ulong
cmd(Ctlr *c, int type, uvlong data)
{
	ulong buf[16], i;
	Cmd *cmd;

	qlock(&c->cmdl);
	cmd = c->cmd;
	cmd->i[1] = Noconf;
	memset(buf, 0, sizeof buf);
	buf[0] = type;
	buf[1] = data;
	buf[2] = data >> 32;
	buf[4] = c->cprt >> 32;
	buf[5] = c->cprt;
	prepcmd(buf, 6);
	coherence();
	memmove(c->ram + Cmdoff, buf, sizeof buf);

	if(waserror())
		nexterror();
	for(i = 0; i < 15; i++){
		if(cmd->i[1] != Noconf){
			poperror();
			i = gbit32(cmd->c);
			qunlock(&c->cmdl);
			if(cmd->i[1] != 0)
				dprint("[%lux]", i);
			return i;
		}
		tsleep(&up->sleep, return0, 0, 1);
	}
	qunlock(&c->cmdl);
	iprint("m10g: cmd timeout [%ux %ux] cmd=%d\n",
		cmd->i[0], cmd->i[1], type);
	error(Etimeout);
	return ~0;			/* silence! */
}

ulong
maccmd(Ctlr *c, int type, uchar *m)
{
	ulong buf[16], i;
	Cmd *cmd;

	qlock(&c->cmdl);
	cmd = c->cmd;
	cmd->i[1] = Noconf;
	memset(buf, 0, sizeof buf);
	buf[0] = type;
	buf[1] = m[0]<<24 | m[1]<<16 | m[2]<<8 | m[3];
	buf[2] = m[4]<< 8 | m[5];
	buf[4] = c->cprt >> 32;
	buf[5] = c->cprt;
	prepcmd(buf, 6);
	coherence();
	memmove(c->ram + Cmdoff, buf, sizeof buf);

	if(waserror())
		nexterror();
	for(i = 0; i < 15; i++){
		if(cmd->i[1] != Noconf){
			poperror();
			i = gbit32(cmd->c);
			qunlock(&c->cmdl);
			if(cmd->i[1] != 0)
				dprint("[%lux]", i);
			return i;
		}
		tsleep(&up->sleep, return0, 0, 1);
	}
	qunlock(&c->cmdl);
	iprint("m10g: maccmd timeout [%ux %ux] cmd=%d\n",
		cmd->i[0], cmd->i[1], type);
	error(Etimeout);
	return ~0;			/* silence! */
}

/* remove this garbage after testing */
enum {
	DMAread	= 0x10000,
	DMAwrite= 0x1,
};

ulong
dmatestcmd(Ctlr *c, int type, uvlong addr, int len)
{
	ulong buf[16], i;

	memset(buf, 0, sizeof buf);
	memset(c->cmd, Noconf, sizeof *c->cmd);
	buf[0] = Cdmatest;
	buf[1] = addr;
	buf[2] = addr >> 32;
	buf[3] = len * type;
	buf[4] = c->cprt >> 32;
	buf[5] = c->cprt;
	prepcmd(buf, 6);
	coherence();
	memmove(c->ram + Cmdoff, buf, sizeof buf);

	if(waserror())
		nexterror();
	for(i = 0; i < 15; i++){
		if(c->cmd->i[1] != Noconf){
			i = gbit32(c->cmd->c);
			if(i == 0)
				error(Eio);
			poperror();
			return i;
		}
		tsleep(&up->sleep, return0, 0, 5);
	}
	error(Etimeout);
	return ~0;			/* silence! */
}

ulong
rdmacmd(Ctlr *c, int on)
{
	ulong buf[16], i;

	memset(buf, 0, sizeof buf);
	c->cmd->i[0] = 0;
	coherence();
	buf[0] = c->cprt >> 32;
	buf[1] = c->cprt;
	buf[2] = Noconf;
	buf[3] = c->cprt >> 32;
	buf[4] = c->cprt;
	buf[5] = on;
	prepcmd(buf, 6);
	memmove(c->ram + Rdmaoff, buf, sizeof buf);

	if(waserror())
		nexterror();
	for(i = 0; i < 20; i++){
		if(c->cmd->i[0] == Noconf){
			poperror();
			return gbit32(c->cmd->c);
		}
		tsleep(&up->sleep, return0, 0, 1);
	}
	error(Etimeout);
	iprint("m10g: rdmacmd timeout\n");
	return ~0;			/* silence! */
}

static int
loadfw(Ctlr *c, int *align)
{
	ulong *f, *s, sz;
	int i;

	if((*align = whichfw(c->pcidev)) == 4*KiB){
		f = (ulong*)fw4k;
		sz = sizeof fw4k;
	}else{
		f = (ulong*)fw2k;
		sz = sizeof fw2k;
	}

	s = (ulong*)(c->ram + Fwoffset);
	for(i = 0; i < sz / 4; i++)
		s[i] = f[i];
	return sz & ~3;
}

static int
bootfw(Ctlr *c)
{
	int i, sz, align;
	ulong buf[16];
	Cmd* cmd;

	if((sz = loadfw(c, &align)) == 0)
		return 0;
	dprint("bootfw %d bytes ... ", sz);
	cmd = c->cmd;

	memset(buf, 0, sizeof buf);
	c->cmd->i[0] = 0;
	coherence();
	buf[0] = c->cprt >> 32;	/* upper dma target address */
	buf[1] = c->cprt;	/* lower */
	buf[2] = Noconf;	/* writeback */
	buf[3] = Fwoffset + 8,
	buf[4] = sz - 8;
	buf[5] = 8;
	buf[6] = 0;
	prepcmd(buf, 7);
	coherence();
	memmove(c->ram + Fwsubmt, buf, sizeof buf);

	for(i = 0; i < 20; i++){
		if(cmd->i[0] == Noconf)
			break;
		delay(1);
	}
	dprint("[%lux %lux]", gbit32(cmd->c), gbit32(cmd->c+4));
	if(i == 20){
		print("m10g: cannot load fw\n");
		return -1;
	}
	dprint("\n");
	c->tx.segsz = align;
	return 0;
}

static int
kickthebaby(Pcidev *p, Ctlr *c)
{
	/* don't kick the baby! */
	ulong code;

	pcicfgw8(p,  0x10 + c->boot, 0x3);
	pcicfgw32(p, 0x18 + c->boot, 0xfffffff0);
	code = pcicfgr32(p, 0x14 + c->boot);

	dprint("reboot status = %lux\n", code);
	if(code != 0xfffffff0)
		return -1;
	return 0;
}

typedef struct {
	uchar	len[4];
	uchar	type[4];
	char	version[128];
	uchar	globals[4];
	uchar	ramsz[4];
	uchar	specs[4];
	uchar	specssz[4];
} Fwhdr;

enum {
	Tmx	= 0x4d582020,
	Tpcie	= 0x70636965,
	Teth	= 0x45544820,
	Tmcp0	= 0x4d435030,
};

static char *
fwtype(ulong type)
{
	switch(type){
	case Tmx:
		return "mx";
	case Tpcie:
		return "PCIe";
	case Teth:
		return "eth";
	case Tmcp0:
		return "mcp0";
	}
	return "*GOK*";
}

static int
chkfw(Ctlr *c)
{
	ulong off, type;
	Fwhdr *h;

	off = gbit32(c->ram+0x3c);
	dprint("firmware %lux\n", off);
	if((off&3) || off + sizeof *h > c->ramsz){
		print("!m10g: bad firmware %lux\n", off);
		return -1;
	}
	h = (Fwhdr*)(c->ram + off);
	type = gbit32(h->type);
	dprint("\t" "type	%s\n", fwtype(type));
	dprint("\t" "vers	%s\n", h->version);
	dprint("\t" "ramsz	%lux\n", gbit32(h->ramsz));
	if(type != Teth){
		print("!m10g: bad card type %s\n", fwtype(type));
		return -1;
	}

	return bootfw(c) || rdmacmd(c, 0);
}

static int
reset(Ether *e, Ctlr *c)
{
	ulong i, sz;

	if(waserror()){
		print("m10g: reset error\n");
		nexterror();
		return -1;
	}

	chkfw(c);
	cmd(c, Creset, 0);

	cmd(c, CSintrqsz, c->done.n * sizeof *c->done.entry);
	cmd(c, CSintrqdma, c->done.busaddr);
	c->irqack =   (ulong*)(c->ram + cmd(c, CGirqackoff, 0));
	/* required only if we're not doing msi? */
	c->irqdeass = (ulong*)(c->ram + cmd(c, CGirqdeassoff, 0));
	/* this is the driver default, why fiddle with this? */
	c->coal = (ulong*)(c->ram + cmd(c, CGcoaloff, 0));
	*c->coal = pbit32(25);

	dprint("dma stats:\n");
	rdmacmd(c, 1);
	sz = c->tx.segsz;
	i = dmatestcmd(c, DMAread, c->done.busaddr, sz);
	print("m10g: read %lud MB/s;", ((i>>16)*sz*2) / (i&0xffff));
	i = dmatestcmd(c, DMAwrite, c->done.busaddr, sz);
	print(" write %lud MB/s;", ((i>>16)*sz*2) / (i&0xffff));
	i = dmatestcmd(c, DMAwrite|DMAread, c->done.busaddr, sz);
	print(" r/w %lud MB/s\n", ((i>>16)*sz*2*2) / (i&0xffff));
	memset(c->done.entry, 0, c->done.n * sizeof *c->done.entry);

	maccmd(c, CSmac, c->ra);
//	cmd(c, Cnopromisc, 0);
	cmd(c, Cenablefc, 0);
	e->maxmtu = Maxmtu;
	cmd(c, CSmtu, e->maxmtu);
	dprint("CSmtu %d...\n", e->maxmtu);

	poperror();
	return 0;
}

static void
ctlrfree(Ctlr *c)
{
	/* free up all the Block*s, too */
	free(c->tx.host);
	free(c->sm.host);
	free(c->bg.host);
	free(c->cmd);
	free(c->done.entry);
	free(c->stats);
	free(c);
}

static int
setmem(Pcidev *p, Ctlr *c)
{
	uvlong raddr;
	void *mem;
	Done *d;
	ulong i;

	c->tx.segsz = 2048;
	c->ramsz = 2*MiB - (2*48*KiB + 32*KiB) - 0x100;
	if(c->ramsz > p->mem[0].size)
		return -1;
	if(p->mem[0].bar & 1)
		return -1;
	raddr = p->mem[0].bar & ~0xF;
	mem = vmap(raddr, p->mem[0].size);
	if(mem == nil){
		print("m10g: can't map %llux\n", raddr);
		return -1;
	}
	dprint("%llux <- vmap(mem[0].size = %d)\n", raddr, p->mem[0].size);
	c->port = raddr;
	c->ram = mem;
	c->cmd = malign(sizeof *c->cmd);
	c->cprt = PCIWADDR(c->cmd);

	d = &c->done;
	d->n = Maxslots;
	d->m = d->n - 1;
	i = d->n * sizeof *d->entry;
	d->entry = malign(i);
	memset(d->entry, 0, i);
	d->busaddr = PCIWADDR(d->entry);

	c->stats = malign(sizeof *c->stats);
	memset(c->stats, 0, sizeof *c->stats);
	c->statsprt = PCIWADDR(c->stats);

	memmove(c->eprom, c->ram + c->ramsz - Epromsz, Epromsz-2);
	return setpcie(p) || parseeprom(c);
}

static Rx*
whichrx(Ctlr *c, int sz)
{
	if(sz <= smpool.size)
		return &c->sm;
	return &c->bg;
}

static Block*
balloc(Rx* rx)
{
	Block *bp;

	ilock(rx->pool);
	if((bp = rx->pool->head) != nil){
		rx->pool->head = bp->next;
		bp->next = nil;
		rx->pool->n--;
	}
	iunlock(rx->pool);
	return bp;
}

static void
rbfree(Block *b, Bpool *p)
{
	b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base);
 	b->flag &= ~(Bipck | Budpck | Btcpck | Bpktck);

	ilock(p);
	b->next = p->head;
	p->head = b;
	p->n++;
	p->cnt++;
	iunlock(p);
}

static void
smbfree(Block *b)
{
	rbfree(b, &smpool);
}

static void
bgbfree(Block *b)
{
	rbfree(b, &bgpool);
}

static void
replenish(Rx *rx)
{
	ulong buf[16], i, idx, e;
	Bpool *p;
	Block *b;

	p = rx->pool;
	if(p->n < 8)
		return;
	memset(buf, 0, sizeof buf);
	e = (rx->i - rx->cnt) & ~7;
	e += rx->n;
	while(p->n >= 8 && e){
		idx = rx->cnt & rx->m;
		for(i = 0; i < 8; i++){
			b = balloc(rx);
			buf[i*2]   = pbit32((uvlong)PCIWADDR(b->wp) >> 32);
			buf[i*2+1] = pbit32(PCIWADDR(b->wp));
			rx->host[idx+i] = b;
			assert(b);
		}
		memmove(rx->lanai + 2*idx, buf, sizeof buf);
		coherence();
		rx->cnt += 8;
		e -= 8;
	}
	if(e && p->n > 7+1)
		print("m10g: should panic? pool->n = %d", p->n);
}

/*
 * future:
 * if (c->mtrr >= 0) {
 *	c->tx.wcfifo = c->ram+0x200000;
 *	c->sm.wcfifo = c->ram+0x300000;
 *	c->bg.wcfifo = c->ram+0x340000;
 * }
 */

static int
nextpow(int j)
{
	int i;

	for(i = 0; j > (1 << i); i++)
		;
	return 1 << i;
}

static void*
emalign(int sz)
{
	void *v;

	v = malign(sz);
	if(v == nil)
		error(Enomem);
	memset(v, 0, sz);
	return v;
}

static void
open0(Ether *e, Ctlr *c)
{
	Block *b;
	int i, sz, entries;

	entries = cmd(c, CGsendrgsz, 0) / sizeof *c->tx.lanai;
	c->tx.lanai = (Send*)(c->ram + cmd(c, CGsendoff, 0));
	c->tx.host  = emalign(entries * sizeof *c->tx.host);
	c->tx.bring = emalign(entries * sizeof *c->tx.bring);
	c->tx.n = entries;
	c->tx.m = entries-1;

	entries = cmd(c, CGrxrgsz, 0)/8;
	c->sm.pool = &smpool;
	cmd(c, CSsmallsz, c->sm.pool->size);
	c->sm.lanai = (ulong*)(c->ram + cmd(c, CGsmallrxoff, 0));
	c->sm.n = entries;
	c->sm.m = entries-1;
	c->sm.host = emalign(entries * sizeof *c->sm.host);

	c->bg.pool = &bgpool;
	c->bg.pool->size = nextpow(2 + e->maxmtu);  /* 2-byte alignment pad */
	cmd(c, CSbigsz, c->bg.pool->size);
	c->bg.lanai = (ulong*)(c->ram + cmd(c, CGbigrxoff, 0));
	c->bg.n = entries;
	c->bg.m = entries-1;
	c->bg.host = emalign(entries * sizeof *c->bg.host);

	sz = c->sm.pool->size + BY2PG;
	for(i = 0; i < c->sm.n; i++){
		if((b = allocb(sz)) == 0)
			break;
		b->free = smbfree;
		freeb(b);
	}
	sz = c->bg.pool->size + BY2PG;
	for(i = 0; i < c->bg.n; i++){
		if((b = allocb(sz)) == 0)
			break;
		b->free = bgbfree;
		freeb(b);
	}

	cmd(c, CSstatsdma, c->statsprt);
	c->linkstat = ~0;
	c->nrdma = 15;

	cmd(c, Cetherup, 0);
}

static Block*
nextblock(Ctlr *c)
{
	uint i;
	ushort l, k;
	Block *b;
	Done *d;
	Rx *rx;
	Slot *s;
	Slotparts *sp;

	d = &c->done;
	s = d->entry;
	i = d->i & d->m;
	sp = (Slotparts *)(s + i);
	l = sp->len;
	if(l == 0)
		return 0;
	k = sp->cksum;
	s[i] = 0;
	d->i++;
	l = gbit16((uchar*)&l);
//dprint("nextb: i=%d l=%d\n", d->i, l);
	rx = whichrx(c, l);
	if(rx->i >= rx->cnt){
		iprint("m10g: overrun\n");
		return 0;
	}
	i = rx->i & rx->m;
	b = rx->host[i];
	rx->host[i] = 0;
	if(b == 0){
		iprint("m10g: error rx to no block.  memory is hosed.\n");
		return 0;
	}
	rx->i++;

	b->flag |= Bipck|Btcpck|Budpck;
	b->checksum = k;
	b->rp += 2;
	b->wp += 2+l;
	b->lim = b->wp;			/* lie like a dog. */
	return b;
}

static int
rxcansleep(void *v)
{
	Ctlr *c;
	Slot *s;
	Slotparts *sp;
	Done *d;

	c = v;
	d = &c->done;
	s = c->done.entry;
	sp = (Slotparts *)(s + (d->i & d->m));
	if(sp->len != 0)
		return -1;
	c->irqack[0] = pbit32(3);
	return 0;
}

static void
m10rx(void *v)
{
	Ether *e;
	Ctlr *c;
	Block *b;

	e = v;
	c = e->ctlr;
	while(waserror())
		;
	for(;;){
		replenish(&c->sm);
		replenish(&c->bg);
		sleep(&c->rxrendez, rxcansleep, c);
		while(b = nextblock(c))
			etheriq(e, b);
	}
}

static void
txcleanup(Tx *tx, ulong n)
{
	Block *b;
	uint j, l, m;

	if(tx->npkt == n)
		return;
	l = 0;
	m = tx->m;
	/*
	 * if tx->cnt == tx->i, yet tx->npkt == n-1, we just
	 * caught ourselves and myricom card updating.
	 */
	for(;; tx->cnt++){
		j = tx->cnt & tx->m;
		if(b = tx->bring[j]){
			tx->bring[j] = 0;
			tx->nbytes += BLEN(b);
			freeb(b);
			if(++tx->npkt == n)
				return;
		}
		if(tx->cnt == tx->i)
			return;
		if(l++ == m){
			iprint("m10g: tx ovrun: %lud %lud\n", n, tx->npkt);
			return;
		}
	}
}

static int
txcansleep(void *v)
{
	Ctlr *c;

	c = v;
	if(c->tx.cnt != c->tx.i && c->tx.npkt != gbit32(c->stats->txcnt))
		return -1;
	return 0;
}

static void
txproc(void *v)
{
	Ether *e;
	Ctlr *c;
	Tx *tx;

	e = v;
	c = e->ctlr;
	tx = &c->tx;
	while(waserror())
		;
	for(;;){
 		sleep(&c->txrendez, txcansleep, c);
		txcleanup(tx, gbit32(c->stats->txcnt));
	}
}

static void
submittx(Tx *tx, int n)
{
	Send *l, *h;
	int i0, i, m;

	m = tx->m;
	i0 = tx->i & m;
	l = tx->lanai;
	h = tx->host;
	for(i = n-1; i >= 0; i--)
		memmove(l+(i + i0 & m), h+(i + i0 & m), sizeof *h);
	tx->i += n;
//	coherence();
}

static int
nsegments(Block *b, int segsz)
{
	uintptr bus, end, slen, len;
	int i;

	bus = PCIWADDR(b->rp);
	i = 0;
	for(len = BLEN(b); len; len -= slen){
		end = bus + segsz & ~(segsz-1);
		slen = end - bus;
		if(slen > len)
			slen = len;
		bus += slen;
		i++;
	}
	return i;
}

static void
m10gtransmit(Ether *e)
{
	ushort slen;
	ulong i, cnt, rdma, nseg, count, end, bus, len, segsz;
	uchar flags;
	Block *b;
	Ctlr *c;
	Send *s, *s0, *s0m8;
	Tx *tx;

	c = e->ctlr;
	tx = &c->tx;
	segsz = tx->segsz;

	qlock(tx);
	count = 0;
	s = tx->host + (tx->i & tx->m);
	cnt = tx->cnt;
	s0 =   tx->host + (cnt & tx->m);
	s0m8 = tx->host + ((cnt - 8) & tx->m);
	i = tx->i;
	for(; s >= s0 || s < s0m8; i += nseg){
		if((b = qget(e->oq)) == nil)
			break;
		flags = SFfirst|SFnotso;
		if((len = BLEN(b)) < 1520)
			flags |= SFsmall;
		rdma = nseg = nsegments(b, segsz);
		bus = PCIWADDR(b->rp);
		for(; len; len -= slen){
			end = (bus + segsz) & ~(segsz-1);
			slen = end - bus;
			if(slen > len)
				slen = len;
			s->low = pbit32(bus);
			s->len = pbit16(slen);
			s->nrdma = rdma;
			s->flags = flags;

			bus += slen;
			if(++s ==  tx->host + tx->n)
				s = tx->host;
			count++;
			flags &= ~SFfirst;
			rdma = 1;
		}
		tx->bring[(i + nseg - 1) & tx->m] = b;
		if(1 || count > 0){
			submittx(tx, count);
			count = 0;
			cnt = tx->cnt;
			s0 =   tx->host + (cnt & tx->m);
			s0m8 = tx->host + ((cnt - 8) & tx->m);
		}
	}
	qunlock(tx);
}

static void
checkstats(Ether *e, Ctlr *c, Stats *s)
{
	ulong i;

	if(s->updated == 0)
		return;

	i = gbit32(s->linkstat);
	if(c->linkstat != i){
		e->link = i;
		if(c->linkstat = i)
			dprint("m10g: link up\n");
		else
			dprint("m10g: link down\n");
	}
	i = gbit32(s->nrdma);
	if(i != c->nrdma){
		dprint("m10g: rdma timeout %ld\n", i);
		c->nrdma = i;
	}
}

static void
waitintx(Ctlr *c)
{
	int i;

	for(i = 0; i < 1024*1024; i++){
		if(c->stats->valid == 0)
			break;
		coherence();
	}
}

static void
m10ginterrupt(Ureg *, void *v)
{
	Ether *e;
	Ctlr *c;

	e = v;
	c = e->ctlr;

	if(c->state != Runed || c->stats->valid == 0)	/* not ready for us? */
		return;

	if(c->stats->valid & 1)
		wakeup(&c->rxrendez);
	if(gbit32(c->stats->txcnt) != c->tx.npkt)
		wakeup(&c->txrendez);
	if(c->msi == 0)
		*c->irqdeass = 0;
	else
		c->stats->valid = 0;
	waitintx(c);
	checkstats(e, c, c->stats);
	c->irqack[1] = pbit32(3);
}

static void
m10gattach(Ether *e)
{
	Ctlr *c;
	char name[12];

	dprint("m10gattach\n");

	qlock(e->ctlr);
	c = e->ctlr;
	if(c->state != Detached){
		qunlock(c);
		return;
	}
	if(waserror()){
		c->state = Detached;
		qunlock(c);
		nexterror();
	}
	reset(e, c);
	c->state = Attached;
	open0(e, c);
	if(c->kprocs == 0){
		c->kprocs++;
		snprint(name, sizeof name, "#l%drxproc", e->ctlrno);
		kproc(name, m10rx, e);
		snprint(name, sizeof name, "#l%dtxproc", e->ctlrno);
		kproc(name, txproc, e);
	}
	c->state = Runed;
	qunlock(c);
	poperror();
}

static int
m10gdetach(Ctlr *c)
{
	dprint("m10gdetach\n");
//	reset(e->ctlr);
	vunmap(c->ram, c->pcidev->mem[0].size);
	ctlrfree(c);
	return -1;
}

static int
lstcount(Block *b)
{
	int i;

	i = 0;
	for(; b; b = b->next)
		i++;
	return i;
}

static long
m10gifstat(Ether *e, void *v, long n, ulong off)
{
	char *p;
	Ctlr *c;
	Stats s;

	c = e->ctlr;
	p = smalloc(READSTR);
	/* no point in locking this because this is done via dma. */
	memmove(&s, c->stats, sizeof s);
	snprint(p, READSTR,
		"txcnt = %lud\n"  "linkstat = %lud\n" 	"dlink = %lud\n"
		"derror = %lud\n" "drunt = %lud\n" 	"doverrun = %lud\n"
		"dnosm = %lud\n"  "dnobg = %lud\n"	"nrdma = %lud\n"
		"txstopped = %ud\n" "down = %ud\n" 	"updated = %ud\n"
		"valid = %ud\n\n"
		"tx pkt = %lud\n"  "tx bytes = %lld\n"
		"tx cnt = %ud\n"  "tx n = %ud\n"	"tx i = %ud\n"
		"sm cnt = %ud\n"  "sm i = %ud\n"	"sm n = %ud\n"
		"sm lst = %ud\n"
		"bg cnt = %ud\n"  "bg i = %ud\n"	"bg n = %ud\n"
		"bg lst = %ud\n"
		"segsz = %lud\n"   "coal = %lud\n",
		gbit32(s.txcnt),  gbit32(s.linkstat),	gbit32(s.dlink),
		gbit32(s.derror), gbit32(s.drunt),	gbit32(s.doverrun),
		gbit32(s.dnosm),  gbit32(s.dnobg),	gbit32(s.nrdma),
		s.txstopped,  s.down, s.updated, s.valid,
		c->tx.npkt, c->tx.nbytes,
		c->tx.cnt, c->tx.n, c->tx.i,
		c->sm.cnt, c->sm.i, c->sm.pool->n, lstcount(c->sm.pool->head),
		c->bg.cnt, c->bg.i, c->bg.pool->n, lstcount(c->bg.pool->head),
		c->tx.segsz, gbit32((uchar*)c->coal));

	n = readstr(off, v, n, p);
	free(p);
	return n;
}

//static void
//summary(Ether *e)
//{
//	char *buf;
//	int n, i, j;
//
//	if(e == 0)
//		return;
//	buf = malloc(n=250);
//	if(buf == 0)
//		return;
//
//	snprint(buf, n, "oq\n");
//	qsummary(e->oq, buf+3, n-3-1);
//	iprint("%s", buf);
//
//	if(e->f) for(i = 0; e->f[i]; i++){
//		j = snprint(buf, n, "f%d %d\n", i, e->f[i]->type);
//		qsummary(e->f[i]->in, buf+j, n-j-1);
//		print("%s", buf);
//	}
//
//	free(buf);
//}

static void
rxring(Ctlr *c)
{
	Done *d;
	Slot *s;
	Slotparts *sp;
	int i;

	d = &c->done;
	s = d->entry;
	for(i = 0; i < d->n; i++) {
		sp = (Slotparts *)(s + i);
		if(sp->len)
			iprint("s[%d] = %d\n", i, sp->len);
	}
}

enum {
	CMdebug,
	CMcoal,
	CMwakeup,
	CMtxwakeup,
	CMqsummary,
	CMrxring,
};

static Cmdtab ctab[] = {
	CMdebug,	"debug",	2,
	CMcoal,		"coal",		2,
	CMwakeup,	"wakeup",	1,
	CMtxwakeup,	"txwakeup",	1,
//	CMqsummary,	"q",		1,
	CMrxring,	"rxring",	1,
};

static long
m10gctl(Ether *e, void *v, long n)
{
	int i;
	Cmdbuf *c;
	Cmdtab *t;

	dprint("m10gctl\n");
	if(e->ctlr == nil)
		error(Enonexist);

	c = parsecmd(v, n);
	if(waserror()){
		free(c);
		nexterror();
	}
	t = lookupcmd(c, ctab, nelem(ctab));
	switch(t->index){
	case CMdebug:
		debug = (strcmp(c->f[1], "on") == 0);
		break;
	case CMcoal:
		i = atoi(c->f[1]);
		if(i < 0 || i > 1000)
			error(Ebadarg);
		*((Ctlr*)e->ctlr)->coal = pbit32(i);
		break;
	case CMwakeup:
		wakeup(&((Ctlr*)e->ctlr)->rxrendez); /* you're kidding, right? */
		break;
	case CMtxwakeup:
		wakeup(&((Ctlr*)e->ctlr)->txrendez); /* you're kidding, right? */
		break;
//	case CMqsummary:
//		summary(e);
//		break;
	case CMrxring:
		rxring(e->ctlr);
		break;
	default:
		error(Ebadarg);
	}
	free(c);
	poperror();
	return n;
}

static void
m10gshutdown(Ether *e)
{
	dprint("m10gshutdown\n");
	m10gdetach(e->ctlr);
}

static void
m10gpromiscuous(void *v, int on)
{
	Ether *e;
	int i;

	dprint("m10gpromiscuous\n");
	e = v;
	if(on)
		i = Cpromisc;
	else
		i = Cnopromisc;
	cmd(e->ctlr, i, 0);
}

static int	mcctab[]  = { CSleavemc, CSjoinmc };
static char	*mcntab[] = { "leave", "join" };

static void
m10gmulticast(void *v, uchar *ea, int on)
{
	Ether *e;
	int i;

	dprint("m10gmulticast\n");
	e = v;
	if((i = maccmd(e->ctlr, mcctab[on], ea)) != 0)
		print("m10g: can't %s %E: %d\n", mcntab[on], ea, i);
}

static void
m10gpci(void)
{
	Pcidev *p;
	Ctlr *t, *c;

	t = 0;
	for(p = 0; p = pcimatch(p, Vmyricom, 0); ){
		switch(p->did){
		case 0x8:		/* 8a */
			break;
		case 0x9:		/* 8a with msi-x fw */
		case 0xa:		/* 8b */
		case 0xb:		/* 8b2 */
		case 0xc:		/* 2-8b2 */
			/* untested */
			break;
		default:
			print("etherm10g: unknown myricom did %#ux\n", p->did);
			continue;
		}
		c = malloc(sizeof *c);
		if(c == nil){
			print("etherm10g: can't allocate memory\n");
			continue;
		}
		c->pcidev = p;
		pcienable(p);
		c->id = p->did<<16 | p->vid;
		c->boot = pcicap(p, PciCapVND);
//		kickthebaby(p, c);
		pcisetbme(p);
		if(setmem(p, c) == -1){
			print("m10g: setmem failed\n");
			free(c);
			/* cleanup */
			continue;
		}
		if(t)
			t->next = c;
		else
			ctlrs = c;
		t = c;
	}
}

static int
m10gpnp(Ether *e)
{
	Ctlr *c;

	if(ctlrs == nil)
		m10gpci();

	for(c = ctlrs; c != nil; c = c->next)
		if(c->active)
			continue;
		else if(e->port == 0 || e->port == c->port)
			break;
	if(c == nil)
		return -1;
	c->active = 1;

	e->ctlr = c;
	e->port = c->port;
	e->irq = c->pcidev->intl;
	e->tbdf = c->pcidev->tbdf;
	e->mbps = 10000;
	memmove(e->ea, c->ra, Eaddrlen);

	e->attach = m10gattach;
	e->transmit = m10gtransmit;
	e->interrupt = m10ginterrupt;
	e->ifstat = m10gifstat;
	e->ctl = m10gctl;
//	e->power = m10gpower;
	e->shutdown = m10gshutdown;

	e->arg = e;
	e->promiscuous = m10gpromiscuous;
	e->multicast = m10gmulticast;

	return 0;
}

void
etherm10glink(void)
{
	addethercard("m10g", m10gpnp);
}