code: plan9front

Download patch

ref: 86b470d877870dd1c48f57d98586b48278cd9b5c
parent: 83f489cf5d6e3740ae33e8b400099ada12254670
author: Keegan Saunders <keegan@undefinedbehaviour.org>
date: Fri Sep 15 18:29:01 EDT 2023

kernel: move virtio10 drivers to port

These drivers are also known to work on arm64, so make them available

--- a/sys/src/9/pc/ethervirtio10.c
+++ /dev/null
@@ -1,793 +1,0 @@
-/*
- * virtio 1.0 ethernet driver
- * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
- *
- * In contrast to ethervirtio.c, this driver handles the non-legacy
- * interface for virtio ethernet which uses mmio for all register accesses
- * and requires a laborate pci capability structure dance to get working.
- *
- * It is kind of pointless as it is most likely slower than
- * port i/o (harder to emulate on the pc platform).
- * 
- * The reason why this driver is needed it is that vultr set the
- * disable-legacy=on option in the -device parameter for qemu
- * on their hypervisor.
- */
-#include "u.h"
-#include "../port/lib.h"
-#include "mem.h"
-#include "dat.h"
-#include "fns.h"
-#include "io.h"
-#include "../port/pci.h"
-#include "../port/error.h"
-#include "../port/netif.h"
-#include "../port/etherif.h"
-
-typedef struct Vconfig Vconfig;
-typedef struct Vnetcfg Vnetcfg;
-
-typedef struct Vring Vring;
-typedef struct Vdesc Vdesc;
-typedef struct Vused Vused;
-typedef struct Vheader Vheader;
-typedef struct Vqueue Vqueue;
-
-typedef struct Ctlr Ctlr;
-
-enum {
-	/* §2.1 Device Status Field */
-	Sacknowledge = 1,
-	Sdriver = 2,
-	Sdriverok = 4,
-	Sfeaturesok = 8,
-	Sfailed = 128,
-
-	/* flags in Qnetstatus */
-	Nlinkup = (1<<0),
-	Nannounce = (1<<1),
-
-	/* feat[0] bits */
-	Fmac = 1<<5,
-	Fstatus = 1<<16,
-	Fctrlvq = 1<<17,
-	Fctrlrx = 1<<18,
-
-	/* feat[1] bits */
-	Fversion1 = 1<<(32-32),
-
-	/* vring used flags */
-	Unonotify = 1,
-	/* vring avail flags */
-	Rnointerrupt = 1,
-
-	/* descriptor flags */
-	Dnext = 1,
-	Dwrite = 2,
-	Dindirect = 4,
-
-	/* struct sizes */
-	VringSize = 4,
-	VdescSize = 16,
-	VusedSize = 8,
-	VheaderSize = 12,
-
-	Vrxq	= 0,
-	Vtxq	= 1,
-	Vctlq	= 2,
-
-	/* class/cmd for Vctlq */
-	CtrlRx	= 0x00,
-		CmdPromisc	= 0x00,
-		CmdAllmulti	= 0x01,
-	CtrlMac	= 0x01,
-		CmdMacTableSet	= 0x00,
-	CtrlVlan= 0x02,
-		CmdVlanAdd	= 0x00,
-		CmdVlanDel	= 0x01,
-};
-
-struct Vconfig {
-	u32int	devfeatsel;
-	u32int	devfeat;
-	u32int	drvfeatsel;
-	u32int	drvfeat;
-
-	u16int	msixcfg;
-	u16int	nqueues;
-
-	u8int	status;
-	u8int	cfggen;
-	u16int	queuesel;
-
-	u16int	queuesize;
-	u16int	queuemsixvect;
-
-	u16int	queueenable;
-	u16int	queuenotifyoff;
-
-	u64int	queuedesc;
-	u64int	queueavail;
-	u64int	queueused;
-};
-
-struct Vnetcfg
-{
-	u16int	mac0;
-	u16int	mac1;
-	u16int	mac2;
-	u16int	status;
-	u16int	maxqueuepairs;
-	u16int	mtu;
-};
-
-struct Vring
-{
-	u16int	flags;
-	u16int	idx;
-};
-
-struct Vdesc
-{
-	u64int	addr;
-	u32int	len;
-	u16int	flags;
-	u16int	next;
-};
-
-struct Vused
-{
-	u32int	id;
-	u32int	len;
-};
-
-struct Vheader
-{
-	u8int	flags;
-	u8int	segtype;
-	u16int	hlen;
-	u16int	seglen;
-	u16int	csumstart;
-	u16int	csumend;
-};
-
-struct Vqueue
-{
-	Rendez;
-
-	uint	qsize;
-	uint	qmask;
-
-	Vdesc	*desc;
-
-	Vring	*avail;
-	u16int	*availent;
-	u16int	*availevent;
-
-	Vring	*used;
-	Vused	*usedent;
-	u16int	*usedevent;
-	u16int	lastused;
-
-	uint	nintr;
-	uint	nnote;
-
-	/* notify register */
-	void	*notify;
-};
-
-struct Ctlr {
-	Lock;
-
-	QLock	ctllock;
-
-	int	attached;
-
-	/* registers */
-	Vconfig	*cfg;
-	Vnetcfg *dev;
-	u8int	*isr;
-	u8int	*notify;
-	u32int	notifyoffmult;
-
-	uvlong	port;
-	Pcidev	*pcidev;
-	Ctlr	*next;
-	int	active;
-	ulong	feat[2];
-	int	nqueue;
-
-	/* virtioether has 3 queues: rx, tx and ctl */
-	Vqueue	queue[3];
-};
-
-static Ctlr *ctlrhead;
-
-static int
-vhasroom(void *v)
-{
-	Vqueue *q = v;
-	return q->lastused != q->used->idx;
-}
-
-static void
-vqnotify(Ctlr *ctlr, int x)
-{
-	Vqueue *q;
-
-	coherence();
-	q = &ctlr->queue[x];
-	if(q->used->flags & Unonotify)
-		return;
-	q->nnote++;
-	*((u16int*)q->notify) = x;
-}
-
-static void
-txproc(void *v)
-{
-	Vheader *header;
-	Block **blocks;
-	Ether *edev;
-	Ctlr *ctlr;
-	Vqueue *q;
-	Vused *u;
-	Block *b;
-	int i, j;
-
-	edev = v;
-	ctlr = edev->ctlr;
-	q = &ctlr->queue[Vtxq];
-
-	header = smalloc(VheaderSize);
-	blocks = smalloc(sizeof(Block*) * (q->qsize/2));
-
-	for(i = 0; i < q->qsize/2; i++){
-		j = i << 1;
-		q->desc[j].addr = PADDR(header);
-		q->desc[j].len = VheaderSize;
-		q->desc[j].next = j | 1;
-		q->desc[j].flags = Dnext;
-
-		q->availent[i] = q->availent[i + q->qsize/2] = j;
-
-		j |= 1;
-		q->desc[j].next = 0;
-		q->desc[j].flags = 0;
-	}
-
-	q->avail->flags &= ~Rnointerrupt;
-
-	while(waserror())
-		;
-
-	while((b = qbread(edev->oq, 1000000)) != nil){
-		for(;;){
-			/* retire completed packets */
-			while((i = q->lastused) != q->used->idx){
-				u = &q->usedent[i & q->qmask];
-				i = (u->id & q->qmask) >> 1;
-				if(blocks[i] == nil)
-					break;
-				freeb(blocks[i]);
-				blocks[i] = nil;
-				q->lastused++;
-			}
-
-			/* have free slot? */
-			i = q->avail->idx & (q->qmask >> 1);
-			if(blocks[i] == nil)
-				break;
-
-			/* ring full, wait and retry */
-			if(!vhasroom(q))
-				sleep(q, vhasroom, q);
-		}
-
-		/* slot is free, fill in descriptor */
-		blocks[i] = b;
-		j = (i << 1) | 1;
-		q->desc[j].addr = PADDR(b->rp);
-		q->desc[j].len = BLEN(b);
-		coherence();
-		q->avail->idx++;
-		vqnotify(ctlr, Vtxq);
-	}
-
-	pexit("ether out queue closed", 1);
-}
-
-static void
-rxproc(void *v)
-{
-	Vheader *header;
-	Block **blocks;
-	Ether *edev;
-	Ctlr *ctlr;
-	Vqueue *q;
-	Vused *u;
-	Block *b;
-	int i, j;
-
-	edev = v;
-	ctlr = edev->ctlr;
-	q = &ctlr->queue[Vrxq];
-
-	header = smalloc(VheaderSize);
-	blocks = smalloc(sizeof(Block*) * (q->qsize/2));
-
-	for(i = 0; i < q->qsize/2; i++){
-		j = i << 1;
-		q->desc[j].addr = PADDR(header);
-		q->desc[j].len = VheaderSize;
-		q->desc[j].next = j | 1;
-		q->desc[j].flags = Dwrite|Dnext;
-
-		q->availent[i] = q->availent[i + q->qsize/2] = j;
-
-		j |= 1;
-		q->desc[j].next = 0;
-		q->desc[j].flags = Dwrite;
-	}
-
-	q->avail->flags &= ~Rnointerrupt;
-
-	while(waserror())
-		;
-
-	for(;;){
-		/* replenish receive ring */
-		do {
-			i = q->avail->idx & (q->qmask >> 1);
-			if(blocks[i] != nil)
-				break;
-			if((b = iallocb(ETHERMAXTU)) == nil)
-				break;
-			blocks[i] = b;
-			j = (i << 1) | 1;
-			q->desc[j].addr = PADDR(b->rp);
-			q->desc[j].len = BALLOC(b);
-			coherence();
-			q->avail->idx++;
-		} while(q->avail->idx != q->used->idx);
-		vqnotify(ctlr, Vrxq);
-
-		/* wait for any packets to complete */
-		if(!vhasroom(q))
-			sleep(q, vhasroom, q);
-
-		/* retire completed packets */
-		while((i = q->lastused) != q->used->idx) {
-			u = &q->usedent[i & q->qmask];
-			i = (u->id & q->qmask) >> 1;
-			if((b = blocks[i]) == nil)
-				break;
-
-			blocks[i] = nil;
-			b->wp = b->rp + u->len - VheaderSize;
-			etheriq(edev, b);
-			q->lastused++;
-		}
-	}
-}
-
-static int
-vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)
-{
-	uchar hdr[2], ack[1];
-	Ctlr *ctlr;
-	Vqueue *q;
-	Vdesc *d;
-	int i;
-
-	ctlr = edev->ctlr;
-	q = &ctlr->queue[Vctlq];
-	if(q->qsize < 3)
-		return -1;
-
-	qlock(&ctlr->ctllock);
-	while(waserror())
-		;
-
-	ack[0] = 0x55;
-	hdr[0] = class;
-	hdr[1] = cmd;
-
-	d = &q->desc[0];
-	d->addr = PADDR(hdr);
-	d->len = sizeof(hdr);
-	d->next = 1;
-	d->flags = Dnext;
-	d++;
-	d->addr = PADDR(data);
-	d->len = ndata;
-	d->next = 2;
-	d->flags = Dnext;
-	d++;
-	d->addr = PADDR(ack);
-	d->len = sizeof(ack);
-	d->next = 0;
-	d->flags = Dwrite;
-
-	i = q->avail->idx & q->qmask;
-	q->availent[i] = 0;
-	coherence();
-
-	q->avail->flags &= ~Rnointerrupt;
-	q->avail->idx++;
-	vqnotify(ctlr, Vctlq);
-	while(!vhasroom(q))
-		sleep(q, vhasroom, q);
-	q->lastused = q->used->idx;
-	q->avail->flags |= Rnointerrupt;
-
-	qunlock(&ctlr->ctllock);
-	poperror();
-
-	if(ack[0] != 0)
-		print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);
-
-	return ack[0];
-}
-
-static void
-interrupt(Ureg*, void* arg)
-{
-	Ether *edev;
-	Ctlr *ctlr;
-	Vqueue *q;
-	int i;
-
-	edev = arg;
-	ctlr = edev->ctlr;
-	if(*ctlr->isr & 1){
-		for(i = 0; i < ctlr->nqueue; i++){
-			q = &ctlr->queue[i];
-			if(vhasroom(q)){
-				q->nintr++;
-				wakeup(q);
-			}
-		}
-	}
-}
-
-static void
-attach(Ether* edev)
-{
-	char name[KNAMELEN];
-	Ctlr* ctlr;
-	int i;
-
-	ctlr = edev->ctlr;
-	ilock(ctlr);
-	if(ctlr->attached){
-		iunlock(ctlr);
-		return;
-	}
-	ctlr->attached = 1;
-
-	/* enable the queues */
-	for(i = 0; i < ctlr->nqueue; i++){
-		ctlr->cfg->queuesel = i;
-		ctlr->cfg->queueenable = 1;
-	}
-
-	/* driver is ready */
-	ctlr->cfg->status |= Sdriverok;
-
-	iunlock(ctlr);
-
-	/* start kprocs */
-	snprint(name, sizeof name, "#l%drx", edev->ctlrno);
-	kproc(name, rxproc, edev);
-	snprint(name, sizeof name, "#l%dtx", edev->ctlrno);
-	kproc(name, txproc, edev);
-}
-
-static long
-ifstat(Ether *edev, void *a, long n, ulong offset)
-{
-	int i, l;
-	char *p;
-	Ctlr *ctlr;
-	Vqueue *q;
-
-	ctlr = edev->ctlr;
-
-	p = smalloc(READSTR);
-
-	l = snprint(p, READSTR, "devfeat %32.32lub %32.32lub\n", ctlr->feat[1], ctlr->feat[0]);
-	l += snprint(p+l, READSTR-l, "devstatus %8.8ub\n", ctlr->cfg->status);
-
-	for(i = 0; i < ctlr->nqueue; i++){
-		q = &ctlr->queue[i];
-		l += snprint(p+l, READSTR-l,
-			"vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
-			i, q, q->qsize, q->avail->idx, q->used->idx, q->lastused, q->nintr, q->nnote);
-	}
-
-	n = readstr(offset, a, n, p);
-	free(p);
-
-	return n;
-}
-
-static void
-shutdown(Ether* edev)
-{
-	Ctlr *ctlr = edev->ctlr;
-
-	coherence();
-	ctlr->cfg->status = 0;
-	coherence();
-
-	pciclrbme(ctlr->pcidev);
-}
-
-static void
-promiscuous(void *arg, int on)
-{
-	Ether *edev = arg;
-	uchar b[1];
-
-	b[0] = on != 0;
-	vctlcmd(edev, CtrlRx, CmdPromisc, b, sizeof(b));
-}
-
-static void
-multicast(void *arg, uchar*, int)
-{
-	Ether *edev = arg;
-	uchar b[1];
-
-	b[0] = edev->nmaddr > 0;
-	vctlcmd(edev, CtrlRx, CmdAllmulti, b, sizeof(b));
-}
-
-static int
-initqueue(Vqueue *q, int size)
-{
-	uchar *p;
-
-	q->desc = mallocalign(VdescSize*size, 16, 0, 0);
-	if(q->desc == nil)
-		return -1;
-	p = mallocalign(VringSize + 2*size + 2, 2, 0, 0);
-	if(p == nil){
-FreeDesc:
-		free(q->desc);
-		q->desc = nil;
-		return -1;
-	}
-	q->avail = (void*)p;
-	p += VringSize;
-	q->availent = (void*)p;
-	p += sizeof(u16int)*size;
-	q->availevent = (void*)p;
-	p = mallocalign(VringSize + VusedSize*size + 2, 4, 0, 0);
-	if(p == nil){
-		free(q->avail);
-		q->avail = nil;
-		goto FreeDesc;
-	}
-	q->used = (void*)p;
-	p += VringSize;
-	q->usedent = (void*)p;
-	p += VusedSize*size;
-	q->usedevent = (void*)p;
-
-	q->qsize = size;
-	q->qmask = q->qsize - 1;
-
-	q->lastused = q->avail->idx = q->used->idx = 0;
-
-	q->avail->flags |= Rnointerrupt;
-
-	return 0;
-}
-
-static int
-matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
-{
-	int bar;
-
-	if(cap != 9 || pcicfgr8(p, off+3) != typ)
-		return 1;
-
-	/* skip invalid or non memory bars */
-	bar = pcicfgr8(p, off+4);
-	if(bar < 0 || bar >= nelem(p->mem) 
-	|| p->mem[bar].size == 0
-	|| (p->mem[bar].bar & 3) != 0)
-		return 1;
-
-	return 0;
-}
-
-static int
-virtiocap(Pcidev *p, int typ)
-{
-	return pcienumcaps(p, matchvirtiocfgcap, typ);
-}
-
-static void*
-virtiomapregs(Pcidev *p, int cap, int size)
-{
-	int bar, len;
-	uvlong addr;
-
-	if(cap < 0)
-		return nil;
-	bar = pcicfgr8(p, cap+4) % nelem(p->mem);
-	addr = pcicfgr32(p, cap+8);
-	len = pcicfgr32(p, cap+12);
-	if(size <= 0)
-		size = len;
-	else if(len < size)
-		return nil;
-	if(addr+len > p->mem[bar].size)
-		return nil;
-	addr += p->mem[bar].bar & ~0xFULL;
-	return vmap(addr, size);
-}
-
-static Ctlr*
-pciprobe(void)
-{
-	Ctlr *c, *h, *t;
-	Pcidev *p;
-	Vconfig *cfg;
-	int bar, cap, n, i;
-
-	h = t = nil;
-
-	/* §4.1.2 PCI Device Discovery */
-	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1041);){
-		/* non-transitional devices will have a revision > 0 */
-		if(p->rid == 0)
-			continue;
-		if((cap = virtiocap(p, 1)) < 0)
-			continue;
-		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
-		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
-		if(cfg == nil)
-			continue;
-		if((c = mallocz(sizeof(Ctlr), 1)) == nil){
-			print("ethervirtio: no memory for Ctlr\n");
-			break;
-		}
-		c->cfg = cfg;
-		c->pcidev = p;
-		c->port = p->mem[bar].bar & ~0xFULL;
-
-		pcienable(p);
-		c->dev = virtiomapregs(p, virtiocap(p, 4), sizeof(Vnetcfg));
-		if(c->dev == nil)
-			goto Baddev;
-		c->isr = virtiomapregs(p, virtiocap(p, 3), 0);
-		if(c->isr == nil)
-			goto Baddev;
-		cap = virtiocap(p, 2);
-		c->notify = virtiomapregs(p, cap, 0);
-		if(c->notify == nil)
-			goto Baddev;
-		c->notifyoffmult = pcicfgr32(p, cap+16);
-
-		/* device reset */
-		coherence();
-		cfg->status = 0;
-		while(cfg->status != 0)
-			delay(1);
-		cfg->status = Sacknowledge|Sdriver;
-
-		/* negotiate feature bits */
-		cfg->devfeatsel = 1;
-		c->feat[1] = cfg->devfeat;
-
-		cfg->devfeatsel = 0;
-		c->feat[0] = cfg->devfeat;
-
-		cfg->drvfeatsel = 1;
-		cfg->drvfeat = c->feat[1] & Fversion1;
-
-		cfg->drvfeatsel = 0;
-		cfg->drvfeat = c->feat[0] & (Fmac|Fctrlvq|Fctrlrx);
-
-		cfg->status |= Sfeaturesok;
-
-		for(i=0; i<nelem(c->queue); i++){
-			cfg->queuesel = i;
-			n = cfg->queuesize;
-			if(n == 0 || (n & (n-1)) != 0){
-				if(i < 2)
-					print("ethervirtio: queue %d has invalid size %d\n", i, n);
-				break;
-			}
-			if(initqueue(&c->queue[i], n) < 0)
-				break;
-			c->queue[i].notify = c->notify + c->notifyoffmult * cfg->queuenotifyoff;
-			coherence();
-			cfg->queuedesc = PADDR(c->queue[i].desc);
-			cfg->queueavail = PADDR(c->queue[i].avail);
-			cfg->queueused = PADDR(c->queue[i].used);
-		}
-		if(i < 2){
-			print("ethervirtio: no queues\n");
-Baddev:
-			pcidisable(p);
-			/* TODO, vunmap */
-			free(c);
-			continue;
-		}
-		c->nqueue = i;		
-
-		if(h == nil)
-			h = c;
-		else
-			t->next = c;
-		t = c;
-	}
-
-	return h;
-}
-
-
-static int
-reset(Ether* edev)
-{
-	static uchar zeros[Eaddrlen];
-	Ctlr *ctlr;
-	int i;
-
-	if(ctlrhead == nil)
-		ctlrhead = pciprobe();
-
-	for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
-		if(ctlr->active)
-			continue;
-		if(edev->port == 0 || edev->port == ctlr->port){
-			ctlr->active = 1;
-			break;
-		}
-	}
-
-	if(ctlr == nil)
-		return -1;
-
-	edev->ctlr = ctlr;
-	edev->port = ctlr->port;
-	edev->irq = ctlr->pcidev->intl;
-	edev->tbdf = ctlr->pcidev->tbdf;
-	edev->mbps = 1000;
-	edev->link = 1;
-
-	if((ctlr->feat[0] & Fmac) != 0 && memcmp(edev->ea, zeros, Eaddrlen) == 0){
-		for(i = 0; i < Eaddrlen; i++)
-			edev->ea[i] = ((uchar*)ctlr->dev)[i];
-	} else {
-		for(i = 0; i < Eaddrlen; i++)
-			((uchar*)ctlr->dev)[i] = edev->ea[i];
-	}
-
-	edev->arg = edev;
-
-	edev->attach = attach;
-	edev->shutdown = shutdown;
-	edev->ifstat = ifstat;
-
-	if((ctlr->feat[0] & (Fctrlvq|Fctrlrx)) == (Fctrlvq|Fctrlrx)){
-		edev->multicast = multicast;
-		edev->promiscuous = promiscuous;
-	}
-
-	pcisetbme(ctlr->pcidev);
-	intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);
-
-	return 0;
-}
-
-void
-ethervirtio10link(void)
-{
-	addethercard("virtio10", reset);
-}
--- a/sys/src/9/pc/sdvirtio10.c
+++ /dev/null
@@ -1,810 +1,0 @@
-/*
- * virtio 1.0 disk driver
- * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
- *
- * In contrast to sdvirtio.c, this driver handles the non-legacy
- * interface for virtio disk which uses mmio for all register accesses
- * and requires a laborate pci capability structure dance to get working.
- *
- * It is kind of pointless as it is most likely slower than
- * port i/o (harder to emulate on the pc platform).
- * 
- * The reason why this driver is needed it is that vultr set the
- * disable-legacy=on option in the -device parameter for qemu
- * on their hypervisor.
- */
-#include "u.h"
-#include "../port/lib.h"
-#include "mem.h"
-#include "dat.h"
-#include "fns.h"
-#include "io.h"
-#include "../port/pci.h"
-#include "ureg.h"
-#include "../port/error.h"
-
-#include "../port/sd.h"
-
-typedef struct Vscsidev Vscsidev;
-typedef struct Vblkdev Vblkdev;
-
-typedef struct Vconfig Vconfig;
-typedef struct Vring Vring;
-typedef struct Vdesc Vdesc;
-typedef struct Vused Vused;
-typedef struct Vqueue Vqueue;
-typedef struct Vdev Vdev;
-
-
-/* device types */
-enum {
-	TypBlk	= 2,
-	TypSCSI	= 8,
-};
-
-/* status flags */
-enum {
-	Acknowledge = 1,
-	Driver = 2,
-	FeaturesOk = 8,
-	DriverOk = 4,
-	Failed = 0x80,
-};
-
-/* descriptor flags */
-enum {
-	Next = 1,
-	Write = 2,
-	Indirect = 4,
-};
-
-/* struct sizes */
-enum {
-	VringSize = 4,
-};
-
-enum {
-	CDBSIZE		= 32,
-	SENSESIZE	= 96,
-};
-
-	
-struct Vscsidev
-{
-	u32int	num_queues;
-	u32int	seg_max;
-	u32int	max_sectors;
-	u32int	cmd_per_lun;
-	u32int	event_info_size;
-	u32int	sense_size;
-	u32int	cdb_size;
-	u16int	max_channel;
-	u16int	max_target;
-	u32int	max_lun;
-};
-
-struct Vblkdev
-{
-	u64int	capacity;
-};
-
-struct Vconfig {
-	u32int	devfeatsel;
-	u32int	devfeat;
-	u32int	drvfeatsel;
-	u32int	drvfeat;
-
-	u16int	msixcfg;
-	u16int	nqueues;
-
-	u8int	status;
-	u8int	cfggen;
-	u16int	queuesel;
-
-	u16int	queuesize;
-	u16int	queuemsixvect;
-
-	u16int	queueenable;
-	u16int	queuenotifyoff;
-
-	u64int	queuedesc;
-	u64int	queueavail;
-	u64int	queueused;
-};
-
-struct Vring
-{
-	u16int	flags;
-	u16int	idx;
-};
-
-struct Vdesc
-{
-	u64int	addr;
-	u32int	len;
-	u16int	flags;
-	u16int	next;
-};
-
-struct Vused
-{
-	u32int	id;
-	u32int	len;
-};
-
-struct Vqueue
-{
-	Lock;
-
-	Vdev	*dev;
-	void	*notify;
-	int	idx;
-
-	int	size;
-
-	int	free;
-	int	nfree;
-
-	Vdesc	*desc;
-
-	Vring	*avail;
-	u16int	*availent;
-	u16int	*availevent;
-
-	Vring	*used;
-	Vused	*usedent;
-	u16int	*usedevent;
-	u16int	lastused;
-
-	void	*rock[];
-};
-
-struct Vdev
-{
-	int	typ;
-
-	Pcidev	*pci;
-
-	uvlong	port;
-	ulong	feat[2];
-
-	int	nqueue;
-	Vqueue	*queue[16];
-
-	void	*dev;	/* device specific config (for scsi) */
-
-	/* registers */
-	Vconfig	*cfg;
-	u8int	*isr;
-	u8int	*notify;
-	u32int	notifyoffmult;
-
-	Vdev	*next;
-};
-
-static Vqueue*
-mkvqueue(int size)
-{
-	Vqueue *q;
-	uchar *p;
-	int i;
-
-	q = malloc(sizeof(*q) + sizeof(void*)*size);
-	p = mallocalign(
-		PGROUND(sizeof(Vdesc)*size + 
-			VringSize + 
-			sizeof(u16int)*size + 
-			sizeof(u16int)) +
-		PGROUND(VringSize + 
-			sizeof(Vused)*size + 
-			sizeof(u16int)), 
-		BY2PG, 0, 0);
-	if(p == nil || q == nil){
-		print("virtio: no memory for Vqueue\n");
-		free(p);
-		free(q);
-		return nil;
-	}
-
-	q->desc = (void*)p;
-	p += sizeof(Vdesc)*size;
-	q->avail = (void*)p;
-	p += VringSize;
-	q->availent = (void*)p;
-	p += sizeof(u16int)*size;
-	q->availevent = (void*)p;
-	p += sizeof(u16int);
-
-	p = (uchar*)PGROUND((uintptr)p);
-	q->used = (void*)p;
-	p += VringSize;
-	q->usedent = (void*)p;
-	p += sizeof(Vused)*size;
-	q->usedevent = (void*)p;
-
-	q->free = -1;
-	q->nfree = q->size = size;
-	for(i=0; i<size; i++){
-		q->desc[i].next = q->free;
-		q->free = i;
-	}
-
-	return q;
-}
-
-static int
-matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
-{
-	int bar;
-
-	if(cap != 9 || pcicfgr8(p, off+3) != typ)
-		return 1;
-
-	/* skip invalid or non memory bars */
-	bar = pcicfgr8(p, off+4);
-	if(bar < 0 || bar >= nelem(p->mem) 
-	|| p->mem[bar].size == 0
-	|| (p->mem[bar].bar & 3) != 0)
-		return 1;
-
-	return 0;
-}
-
-static int
-virtiocap(Pcidev *p, int typ)
-{
-	return pcienumcaps(p, matchvirtiocfgcap, typ);
-}
-
-static void*
-virtiomapregs(Pcidev *p, int cap, int size)
-{
-	int bar, len;
-	uvlong addr;
-
-	if(cap < 0)
-		return nil;
-	bar = pcicfgr8(p, cap+4) % nelem(p->mem);
-	addr = pcicfgr32(p, cap+8);
-	len = pcicfgr32(p, cap+12);
-	if(size <= 0)
-		size = len;
-	else if(len < size)
-		return nil;
-	if(addr+len > p->mem[bar].size)
-		return nil;
-	addr += p->mem[bar].bar & ~0xFULL;
-	return vmap(addr, size);
-}
-
-static Vdev*
-viopnpdevs(int typ)
-{
-	Vdev *vd, *h, *t;
-	Vconfig *cfg;
-	Vqueue *q;
-	Pcidev *p;
-	int cap, bar;
-	int n, i;
-
-	h = t = nil;
-	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
-		if(p->rid == 0)
-			continue;
-		if((cap = virtiocap(p, 1)) < 0)
-			continue;
-		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
-		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
-		if(cfg == nil)
-			continue;
-		if((vd = malloc(sizeof(*vd))) == nil){
-			print("virtio: no memory for Vdev\n");
-			break;
-		}
-		vd->port = p->mem[bar].bar & ~0xFULL;
-		vd->typ = typ;
-		vd->pci = p;
-		vd->cfg = cfg;
-		pcienable(p);
-
-		vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);
-		if(vd->isr == nil){
-Baddev:
-			pcidisable(p);
-			/* TODO: vunmap */
-			free(vd);
-			continue;
-		}
-		cap = virtiocap(p, 2);
-		vd->notify = virtiomapregs(p, cap, 0);
-		if(vd->notify == nil)
-			goto Baddev;
-		vd->notifyoffmult = pcicfgr32(p, cap+16);
-
-		/* reset */
-		cfg->status = 0;
-		while(cfg->status != 0)
-			delay(1);
-		cfg->status = Acknowledge|Driver;
-
-		/* negotiate feature bits */
-		cfg->devfeatsel = 1;
-		vd->feat[1] = cfg->devfeat;
-		cfg->devfeatsel = 0;
-		vd->feat[0] = cfg->devfeat;
-		cfg->drvfeatsel = 1;
-		cfg->drvfeat = vd->feat[1] & 1;
-		cfg->drvfeatsel = 0;
-		cfg->drvfeat = 0;
-		cfg->status |= FeaturesOk;
-
-		for(i=0; i<nelem(vd->queue); i++){
-			cfg->queuesel = i;
-			n = cfg->queuesize;
-			if(n == 0 || (n & (n-1)) != 0)
-				break;
-			if((q = mkvqueue(n)) == nil)
-				break;
-			q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
-			q->dev = vd;
-			q->idx = i;
-			vd->queue[i] = q;
-			coherence();
-			cfg->queuedesc = PADDR(q->desc);
-			cfg->queueavail = PADDR(q->avail);
-			cfg->queueused = PADDR(q->used);
-		}
-		vd->nqueue = i;
-	
-		if(h == nil)
-			h = vd;
-		else
-			t->next = vd;
-		t = vd;
-	}
-
-	return h;
-}
-
-struct Rock {
-	int done;
-	Rendez *sleep;
-};
-
-static void
-vqinterrupt(Vqueue *q)
-{
-	int id, free, m;
-	struct Rock *r;
-	Rendez *z;
-
-	m = q->size-1;
-
-	ilock(q);
-	while((q->lastused ^ q->used->idx) & m){
-		id = q->usedent[q->lastused++ & m].id;
-		if(r = q->rock[id]){
-			q->rock[id] = nil;
-			z = r->sleep;
-			r->done = 1;	/* hands off */
-			if(z != nil)
-				wakeup(z);
-		}
-		do {
-			free = id;
-			id = q->desc[free].next;
-			q->desc[free].next = q->free;
-			q->free = free;
-			q->nfree++;
-		} while(q->desc[free].flags & Next);
-	}
-	iunlock(q);
-}
-
-static void
-viointerrupt(Ureg *, void *arg)
-{
-	Vdev *vd = arg;
-
-	if(vd->isr[0] & 1)
-		vqinterrupt(vd->queue[vd->typ == TypSCSI ? 2 : 0]);
-}
-
-static int
-viodone(void *arg)
-{
-	return ((struct Rock*)arg)->done;
-}
-
-static void
-vqio(Vqueue *q, int head)
-{
-	struct Rock rock;
-
-	rock.done = 0;
-	rock.sleep = &up->sleep;
-	q->rock[head] = &rock;
-	q->availent[q->avail->idx & (q->size-1)] = head;
-	coherence();
-	q->avail->idx++;
-	iunlock(q);
-	if((q->used->flags & 1) == 0)
-		*((u16int*)q->notify) = q->idx;
-	while(!rock.done){
-		while(waserror())
-			;
-		tsleep(rock.sleep, viodone, &rock, 1000);
-		poperror();
-
-		if(!rock.done)
-			vqinterrupt(q);
-	}
-}
-
-static int
-vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)
-{
-	int need, free, head;
-	Vqueue *q;
-	Vdesc *d;
-
-	u8int status;
-	struct Vioblkreqhdr {
-		u32int	typ;
-		u32int	prio;
-		u64int	lba;
-	} req;
-
-	need = 2;
-	if(a != nil)
-		need = 3;
-
-	status = -1;
-	req.typ = typ;
-	req.prio = 0;
-	req.lba = lba;
-
-	q = vd->queue[0];
-	ilock(q);
-	while(q->nfree < need){
-		iunlock(q);
-
-		if(!waserror())
-			tsleep(&up->sleep, return0, 0, 500);
-		poperror();
-
-		ilock(q);
-	}
-
-	head = free = q->free;
-
-	d = &q->desc[free]; free = d->next;
-	d->addr = PADDR(&req);
-	d->len = sizeof(req);
-	d->flags = Next;
-
-	if(a != nil){
-		d = &q->desc[free]; free = d->next;
-		d->addr = PADDR(a);
-		d->len = secsize*count;
-		d->flags = typ ? Next : (Write|Next);
-	}
-
-	d = &q->desc[free]; free = d->next;
-	d->addr = PADDR(&status);
-	d->len = sizeof(status);
-	d->flags = Write;
-
-	q->free = free;
-	q->nfree -= need;
-
-	/* queue io, unlock and wait for completion */
-	vqio(q, head);
-
-	return status;
-}
-
-static int
-vioscsireq(SDreq *r)
-{
-	u8int resp[4+4+2+2+SENSESIZE];
-	u8int req[8+8+3+CDBSIZE];
-	int free, head;
-	u32int len;
-	Vqueue *q;
-	Vdesc *d;
-	Vdev *vd;
-	SDunit *u;
-	Vscsidev *scsi;
-
-	u = r->unit;
-	vd = u->dev->ctlr;
-	scsi = vd->dev;
-
-	memset(resp, 0, sizeof(resp));
-	memset(req, 0, sizeof(req));
-	req[0] = 1;
-	req[1] = u->subno;
-	req[2] = r->lun>>8;
-	req[3] = r->lun&0xFF;
-	*(u64int*)(&req[8]) = (uintptr)r;
-
-	memmove(&req[8+8+3], r->cmd, r->clen);
-
-	q = vd->queue[2];
-	ilock(q);
-	while(q->nfree < 3){
-		iunlock(q);
-
-		if(!waserror())
-			tsleep(&up->sleep, return0, 0, 500);
-		poperror();
-
-		ilock(q);
-	}
-
-	head = free = q->free;
-
-	d = &q->desc[free]; free = d->next;
-	d->addr = PADDR(req);
-	d->len = 8+8+3+scsi->cdb_size;
-	d->flags = Next;
-
-	if(r->write && r->dlen > 0){
-		d = &q->desc[free]; free = d->next;
-		d->addr = PADDR(r->data);
-		d->len = r->dlen;
-		d->flags = Next;
-	}
-
-	d = &q->desc[free]; free = d->next;
-	d->addr = PADDR(resp);
-	d->len = 4+4+2+2+scsi->sense_size;
-	d->flags = Write;
-
-	if(!r->write && r->dlen > 0){
-		d->flags |= Next;
-
-		d = &q->desc[free]; free = d->next;
-		d->addr = PADDR(r->data);
-		d->len = r->dlen;
-		d->flags = Write;
-	}
-	
-	q->free = free;
-	q->nfree -= 2 + (r->dlen > 0);
-
-	/* queue io, unlock and wait for completion */
-	vqio(q, head);
-
-	/* response+status */
-	r->status = resp[10];
-	if(resp[11] != 0)
-		r->status = SDcheck;
-
-	/* sense_len */
-	len = *((u32int*)&resp[0]);
-	if(len > 0){
-		if(len > sizeof(r->sense))
-			len = sizeof(r->sense);
-		memmove(r->sense, &resp[4+4+2+2], len);
-		r->flags |= SDvalidsense;
-	}
-
-	/* data residue */
-	len = *((u32int*)&resp[4]);
-	if(len > r->dlen)
-		r->rlen = 0;
-	else
-		r->rlen = r->dlen - len;
-
-	return r->status;
-
-}
-
-static long
-viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
-{
-	long ss, cc, max, ret;
-	Vdev *vd;
-
-	vd = u->dev->ctlr;
-	if(vd->typ == TypSCSI)
-		return scsibio(u, lun, write, a, count, lba);
-
-	max = 32;
-	ss = u->secsize;
-	ret = 0;
-	while(count > 0){
-		if((cc = count) > max)
-			cc = max;
-		if(vioblkreq(vd, write != 0, (uchar*)a + ret, cc, ss, lba) != 0)
-			error(Eio);
-		ret += cc*ss;
-		count -= cc;
-		lba += cc;
-	}
-	return ret;
-}
-
-static int
-viorio(SDreq *r)
-{
-	int i, count, rw;
-	uvlong lba;
-	SDunit *u;
-	Vdev *vd;
-
-	u = r->unit;
-	vd = u->dev->ctlr;
-	if(vd->typ == TypSCSI)
-		return vioscsireq(r);
-	if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91){
-		if(vioblkreq(vd, 4, nil, 0, 0, 0) != 0)
-			return sdsetsense(r, SDcheck, 3, 0xc, 2);
-		return sdsetsense(r, SDok, 0, 0, 0);
-	}
-	if((i = sdfakescsi(r)) != SDnostatus)
-		return r->status = i;
-	if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus)
-		return i;
-	r->rlen = viobio(u, r->lun, rw == SDwrite, r->data, count, lba);
-	return r->status = SDok;
-}
-
-static int
-vioonline(SDunit *u)
-{
-	Vdev *vd;
-	Vblkdev *blk;
-	uvlong cap;
-
-	vd = u->dev->ctlr;
-	if(vd->typ == TypSCSI)
-		return scsionline(u);
-
-	blk = vd->dev;
-	cap = blk->capacity;
-	if(u->sectors != cap){
-		u->sectors = cap;
-		u->secsize = 512;
-		return 2;
-	}
-	return 1;
-}
-
-static int
-vioverify(SDunit *u)
-{
-	Vdev *vd;
-
-	vd = u->dev->ctlr;
-	if(vd->typ == TypSCSI)
-		return scsiverify(u);
-
-	return 1;
-}
-
-SDifc sdvirtio10ifc;
-
-static int
-vioenable(SDev *sd)
-{
-	char name[32];
-	Vdev *vd;
-	int i;
-
-	vd = sd->ctlr;
-	pcisetbme(vd->pci);
-	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
-	intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
-	coherence();
-
-	for(i = 0; i < vd->nqueue; i++){
-		vd->cfg->queuesel = i;
-		vd->cfg->queueenable = 1;
-	}
-	vd->cfg->status |= DriverOk;
-
-	return 1;
-}
-
-static int
-viodisable(SDev *sd)
-{
-	char name[32];
-	Vdev *vd;
-
-	vd = sd->ctlr;
-	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
-	intrdisable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
-	pciclrbme(vd->pci);
-	return 1;
-}
-
-static SDev*
-viopnp(void)
-{
-	SDev *s, *h, *t;
-	Vdev *vd;
-	int id;
-
-	h = t = nil;
-
-	id = 'F';
-	for(vd =  viopnpdevs(TypBlk); vd; vd = vd->next){
-		if(vd->nqueue == 0)
-			continue;
-
-		if((vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev))) == nil)
-			break;
-		if((s = malloc(sizeof(*s))) == nil)
-			break;
-		s->ctlr = vd;
-		s->idno = id++;
-		s->ifc = &sdvirtio10ifc;
-		s->nunit = 1;
-		if(h)
-			t->next = s;
-		else
-			h = s;
-		t = s;
-	}
-
-	id = '0';
-	for(vd = viopnpdevs(TypSCSI); vd; vd = vd->next){
-		Vscsidev *scsi;
-
-		if(vd->nqueue < 3)
-			continue;
-
-		if((scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev))) == nil)
-			break;
-		if(scsi->max_target == 0){
-			vunmap(scsi, sizeof(Vscsidev));
-			continue;
-		}
-		if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){
-			print("sdvirtio: cdb %ud or sense size %ud too big\n",
-				scsi->cdb_size, scsi->sense_size);
-			vunmap(scsi, sizeof(Vscsidev));
-			continue;
-		}
-		vd->dev = scsi;
-
-		if((s = malloc(sizeof(*s))) == nil)
-			break;
-		s->ctlr = vd;
-		s->idno = id++;
-		s->ifc = &sdvirtio10ifc;
-		s->nunit = scsi->max_target;
-
-		if(h)
-			t->next = s;
-		else
-			h = s;
-		t = s;
-	}
-	return h;
-}
-
-SDifc sdvirtio10ifc = {
-	"virtio10",			/* name */
-
-	viopnp,				/* pnp */
-	nil,				/* legacy */
-	vioenable,			/* enable */
-	viodisable,			/* disable */
-
-	vioverify,			/* verify */
-	vioonline,			/* online */
-	viorio,				/* rio */
-	nil,				/* rctl */
-	nil,				/* wctl */
-
-	viobio,				/* bio */
-	nil,				/* probe */
-	nil,				/* clear */
-	nil,				/* rtopctl */
-	nil,				/* wtopctl */
-};
--- /dev/null
+++ b/sys/src/9/port/ethervirtio10.c
@@ -1,0 +1,793 @@
+/*
+ * virtio 1.0 ethernet driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to ethervirtio.c, this driver handles the non-legacy
+ * interface for virtio ethernet which uses mmio for all register accesses
+ * and requires an elaborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ * 
+ * The reason why this driver is needed is that vultr sets the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "../port/error.h"
+#include "../port/netif.h"
+#include "../port/etherif.h"
+
+typedef struct Vconfig Vconfig;
+typedef struct Vnetcfg Vnetcfg;
+
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vheader Vheader;
+typedef struct Vqueue Vqueue;
+
+typedef struct Ctlr Ctlr;
+
+enum {
+	/* §2.1 Device Status Field */
+	Sacknowledge = 1,	/* written together with Sdriver right after the reset in pciprobe() */
+	Sdriver = 2,
+	Sdriverok = 4,	/* set by attach() once the queues are enabled */
+	Sfeaturesok = 8,	/* set by pciprobe() after the driver feature words are written */
+	Sfailed = 128,	/* not used by this driver */
+
+	/* flags in Qnetstatus */
+	Nlinkup = (1<<0),
+	Nannounce = (1<<1),
+
+	/* feat[0] bits */
+	Fmac = 1<<5,	/* reset() takes the MAC from Vnetcfg only when this is offered */
+	Fstatus = 1<<16,	/* never requested: pciprobe() only accepts Fmac, Fctrlvq and Fctrlrx */
+	Fctrlvq = 1<<17,	/* together with Fctrlrx required for the multicast/promiscuous hooks */
+	Fctrlrx = 1<<18,
+
+	/* feat[1] bits */
+	Fversion1 = 1<<(32-32),	/* feature bit 32, i.e. bit 0 of the high feature word */
+
+	/* vring used flags */
+	Unonotify = 1,	/* checked by vqnotify(): when set in used->flags the notify write is skipped */
+	/* vring avail flags */
+	Rnointerrupt = 1,	/* set by initqueue(), cleared by whoever starts waiting on the queue */
+
+	/* descriptor flags */
+	Dnext = 1,	/* chain continues at Vdesc.next */
+	Dwrite = 2,	/* buffer is filled in by the device (rx buffers, the vctlcmd ack byte) */
+	Dindirect = 4,	/* not used by this driver */
+
+	/* struct sizes */
+	VringSize = 4,
+	VdescSize = 16,
+	VusedSize = 8,
+	VheaderSize = 12,	/* NOTE(review): the fields declared in Vheader add up to 10 bytes; the 2 extra bytes are presumably the virtio 1.0 num_buffers field - confirm against the spec */
+
+	Vrxq	= 0,	/* indices into Ctlr.queue */
+	Vtxq	= 1,
+	Vctlq	= 2,
+
+	/* class/cmd for Vctlq */
+	CtrlRx	= 0x00,
+		CmdPromisc	= 0x00,
+		CmdAllmulti	= 0x01,
+	CtrlMac	= 0x01,
+		CmdMacTableSet	= 0x00,
+	CtrlVlan= 0x02,
+		CmdVlanAdd	= 0x00,
+		CmdVlanDel	= 0x01,
+};
+
+struct Vconfig {	/* common configuration registers, mapped by pciprobe() from the cfg type 1 capability */
+	u32int	devfeatsel;	/* selects which 32-bit feature word devfeat shows (0 = low, 1 = high) */
+	u32int	devfeat;
+	u32int	drvfeatsel;	/* same selection for the features the driver accepts */
+	u32int	drvfeat;
+
+	u16int	msixcfg;
+	u16int	nqueues;
+
+	u8int	status;	/* device status, S* bits; writing 0 resets the device */
+	u8int	cfggen;
+	u16int	queuesel;	/* selects the queue the queue* fields below refer to */
+
+	u16int	queuesize;	/* pciprobe() requires a non-zero power of two */
+	u16int	queuemsixvect;
+
+	u16int	queueenable;	/* attach() writes 1 for every queue */
+	u16int	queuenotifyoff;	/* scaled by notifyoffmult to locate the queue's notify register */
+
+	u64int	queuedesc;	/* physical addresses of the three ring areas */
+	u64int	queueavail;
+	u64int	queueused;
+};
+
+struct Vnetcfg	/* device specific configuration, mapped from the cfg type 4 capability */
+{
+	u16int	mac0;	/* reset() accesses mac0..mac2 as Eaddrlen raw bytes */
+	u16int	mac1;
+	u16int	mac2;
+	u16int	status;	/* presumably the N* link flags; never read in this file */
+	u16int	maxqueuepairs;
+	u16int	mtu;
+};
+
+struct Vring	/* header shared by the avail and used rings (VringSize bytes) */
+{
+	u16int	flags;	/* Rnointerrupt in the avail ring, Unonotify in the used ring */
+	u16int	idx;	/* free-running index of the next entry the producer will write */
+};
+
+struct Vdesc	/* one descriptor table entry (VdescSize bytes) */
+{
+	u64int	addr;	/* physical address of the buffer */
+	u32int	len;
+	u16int	flags;	/* D* bits */
+	u16int	next;	/* index of the next descriptor when Dnext is set */
+};
+
+struct Vused	/* one used ring entry (VusedSize bytes) */
+{
+	u32int	id;	/* head descriptor index of the completed chain */
+	u32int	len;	/* bytes written by the device; rxproc subtracts VheaderSize from it */
+};
+
+struct Vheader	/* per-packet header; only ever allocated (VheaderSize bytes), its fields are not accessed in this file */
+{
+	u8int	flags;
+	u8int	segtype;
+	u16int	hlen;
+	u16int	seglen;
+	u16int	csumstart;
+	u16int	csumend;
+};
+
+struct Vqueue	/* driver-side state of one virtqueue */
+{
+	Rendez;	/* the queue's kproc (or vctlcmd) sleeps here; interrupt() wakes it */
+
+	uint	qsize;	/* number of descriptors, a power of two */
+	uint	qmask;	/* qsize-1, used to wrap ring indices */
+
+	Vdesc	*desc;	/* descriptor table */
+
+	Vring	*avail;	/* ring written by the driver */
+	u16int	*availent;
+	u16int	*availevent;
+
+	Vring	*used;	/* ring written by the device */
+	Vused	*usedent;
+	u16int	*usedevent;
+	u16int	lastused;	/* used->idx value up to which entries have been consumed */
+
+	uint	nintr;	/* counters reported by ifstat(): wakeups from interrupt() ... */
+	uint	nnote;	/* ... and notifications sent by vqnotify() */
+
+	/* notify register */
+	void	*notify;
+};
+
+struct Ctlr {	/* per-device state, allocated zeroed by pciprobe() */
+	Lock;	/* guards the attached flag in attach() */
+
+	QLock	ctllock;	/* serializes vctlcmd() callers on the control queue */
+
+	int	attached;	/* set once attach() has enabled the queues */
+
+	/* registers */
+	Vconfig	*cfg;	/* common configuration */
+	Vnetcfg *dev;	/* device specific configuration */
+	u8int	*isr;	/* interrupt status byte */
+	u8int	*notify;	/* base of the notification area */
+	u32int	notifyoffmult;	/* multiplier applied to each queue's queuenotifyoff */
+
+	uvlong	port;	/* bar base address, matched against edev->port in reset() */
+	Pcidev	*pcidev;
+	Ctlr	*next;
+	int	active;	/* claimed by an Ether in reset() */
+	ulong	feat[2];	/* features offered by the device: [0] bits 0-31, [1] bits 32-63 */
+	int	nqueue;	/* queues set up by pciprobe(): 2 or 3 */
+
+	/* virtioether has 3 queues: rx, tx and ctl */
+	Vqueue	queue[3];
+};
+
+static Ctlr *ctlrhead;	/* controllers found by pciprobe(), linked through Ctlr.next; filled in lazily by reset() */
+
+static int
+vhasroom(void *v)	/* sleep condition: true when the used ring holds entries not yet consumed */
+{
+	Vqueue *vq = v;
+	return vq->used->idx != vq->lastused;
+}
+
+static void
+vqnotify(Ctlr *ctlr, int x)	/* tell the device that queue x has new avail entries, unless it asked not to be told */
+{
+	Vqueue *vq;
+
+	vq = &ctlr->queue[x];
+	coherence();	/* ring updates must be visible before the flag check and the notify write */
+	if((vq->used->flags & Unonotify) == 0){
+		vq->nnote++;
+		*((u16int*)vq->notify) = x;
+	}
+}
+
+static void
+txproc(void *v)	/* transmit kproc: moves blocks from edev->oq onto the tx virtqueue until the queue is closed */
+{
+	Vheader *header;
+	Block **blocks;
+	Ether *edev;
+	Ctlr *ctlr;
+	Vqueue *q;
+	Vused *u;
+	Block *b;
+	int i, j;
+
+	edev = v;
+	ctlr = edev->ctlr;
+	q = &ctlr->queue[Vtxq];
+
+	header = smalloc(VheaderSize);	/* a single header buffer shared by every packet */
+	blocks = smalloc(sizeof(Block*) * (q->qsize/2));	/* block in flight for each descriptor pair */
+
+	for(i = 0; i < q->qsize/2; i++){	/* pair i: descriptor 2i = header, 2i+1 = packet data */
+		j = i << 1;
+		q->desc[j].addr = PADDR(header);
+		q->desc[j].len = VheaderSize;
+		q->desc[j].next = j | 1;
+		q->desc[j].flags = Dnext;
+
+		q->availent[i] = q->availent[i + q->qsize/2] = j;	/* avail slots are fixed: slot k always names pair k mod qsize/2 */
+
+		j |= 1;
+		q->desc[j].next = 0;
+		q->desc[j].flags = 0;
+	}
+
+	q->avail->flags &= ~Rnointerrupt;	/* allow completion interrupts from now on */
+
+	while(waserror())	/* an error() raised in the loop below comes back here and the loop resumes */
+		;
+
+	while((b = qbread(edev->oq, 1000000)) != nil){
+		for(;;){
+			/* retire completed packets */
+			while((i = q->lastused) != q->used->idx){
+				u = &q->usedent[i & q->qmask];
+				i = (u->id & q->qmask) >> 1;	/* used id is the head descriptor; halve it to get the pair */
+				if(blocks[i] == nil)
+					break;
+				freeb(blocks[i]);
+				blocks[i] = nil;
+				q->lastused++;
+			}
+
+			/* have free slot? */
+			i = q->avail->idx & (q->qmask >> 1);	/* pair that the next avail slot refers to */
+			if(blocks[i] == nil)
+				break;
+
+			/* ring full, wait and retry */
+			if(!vhasroom(q))
+				sleep(q, vhasroom, q);
+		}
+
+		/* slot is free, fill in descriptor */
+		blocks[i] = b;
+		j = (i << 1) | 1;
+		q->desc[j].addr = PADDR(b->rp);
+		q->desc[j].len = BLEN(b);
+		coherence();	/* descriptor contents must be visible before the index that publishes them */
+		q->avail->idx++;
+		vqnotify(ctlr, Vtxq);
+	}
+
+	pexit("ether out queue closed", 1);
+}
+
+static void
+rxproc(void *v)	/* receive kproc: keeps the rx ring stocked with buffers and hands filled ones to etheriq(); never returns */
+{
+	Vheader *header;
+	Block **blocks;
+	Ether *edev;
+	Ctlr *ctlr;
+	Vqueue *q;
+	Vused *u;
+	Block *b;
+	int i, j;
+
+	edev = v;
+	ctlr = edev->ctlr;
+	q = &ctlr->queue[Vrxq];
+
+	header = smalloc(VheaderSize);	/* a single header buffer shared by all pairs; its contents are never read here */
+	blocks = smalloc(sizeof(Block*) * (q->qsize/2));	/* buffer currently posted for each descriptor pair */
+
+	for(i = 0; i < q->qsize/2; i++){	/* pair i: descriptor 2i = header, 2i+1 = packet data, both device-writable */
+		j = i << 1;
+		q->desc[j].addr = PADDR(header);
+		q->desc[j].len = VheaderSize;
+		q->desc[j].next = j | 1;
+		q->desc[j].flags = Dwrite|Dnext;
+
+		q->availent[i] = q->availent[i + q->qsize/2] = j;	/* avail slots are fixed: slot k always names pair k mod qsize/2 */
+
+		j |= 1;
+		q->desc[j].next = 0;
+		q->desc[j].flags = Dwrite;
+	}
+
+	q->avail->flags &= ~Rnointerrupt;	/* allow receive interrupts from now on */
+
+	while(waserror())	/* an error() raised in the loop below comes back here and the loop resumes */
+		;
+
+	for(;;){
+		/* replenish receive ring */
+		do {
+			i = q->avail->idx & (q->qmask >> 1);
+			if(blocks[i] != nil)	/* next pair still holds an unconsumed buffer: ring is fully stocked */
+				break;
+			if((b = iallocb(ETHERMAXTU)) == nil)	/* out of blocks: retry on the next pass */
+				break;
+			blocks[i] = b;
+			j = (i << 1) | 1;
+			q->desc[j].addr = PADDR(b->rp);
+			q->desc[j].len = BALLOC(b);
+			coherence();	/* descriptor contents must be visible before the index that publishes them */
+			q->avail->idx++;
+		} while(q->avail->idx != q->used->idx);
+		vqnotify(ctlr, Vrxq);
+
+		/* wait for any packets to complete */
+		if(!vhasroom(q))
+			sleep(q, vhasroom, q);
+
+		/* retire completed packets */
+		while((i = q->lastused) != q->used->idx) {
+			u = &q->usedent[i & q->qmask];
+			i = (u->id & q->qmask) >> 1;	/* used id is the head descriptor; halve it to get the pair */
+			if((b = blocks[i]) == nil)
+				break;
+
+			blocks[i] = nil;
+			b->wp = b->rp + u->len - VheaderSize;	/* used len counts the header too, which is not in the block */
+			etheriq(edev, b);
+			q->lastused++;
+		}
+	}
+}
+
+static int
+vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)	/* run one control-queue command and wait for it; returns the ack byte (0 = accepted) or -1 if there is no usable control queue */
+{
+	uchar hdr[2], ack[1];
+	Ctlr *ctlr;
+	Vqueue *q;
+	Vdesc *d;
+	int i;
+
+	ctlr = edev->ctlr;
+	q = &ctlr->queue[Vctlq];
+	if(q->qsize < 3)	/* also true (qsize == 0) when pciprobe() never set up a third queue */
+		return -1;
+
+	qlock(&ctlr->ctllock);	/* descriptors 0..2 are reused by every command, so one at a time */
+	while(waserror())	/* an error() raised while sleeping below comes back here and the command is rebuilt */
+		;
+
+	ack[0] = 0x55;	/* sentinel: stays non-zero unless the device overwrites it */
+	hdr[0] = class;
+	hdr[1] = cmd;
+
+	d = &q->desc[0];	/* chain: header -> payload -> ack, only the ack is device-writable */
+	d->addr = PADDR(hdr);
+	d->len = sizeof(hdr);
+	d->next = 1;
+	d->flags = Dnext;
+	d++;
+	d->addr = PADDR(data);
+	d->len = ndata;
+	d->next = 2;
+	d->flags = Dnext;
+	d++;
+	d->addr = PADDR(ack);
+	d->len = sizeof(ack);
+	d->next = 0;
+	d->flags = Dwrite;
+
+	i = q->avail->idx & q->qmask;
+	q->availent[i] = 0;	/* the chain always starts at descriptor 0 */
+	coherence();	/* descriptors and avail entry must be visible before the index moves */
+
+	q->avail->flags &= ~Rnointerrupt;	/* interrupts are wanted only while a command is outstanding */
+	q->avail->idx++;
+	vqnotify(ctlr, Vctlq);
+	while(!vhasroom(q))
+		sleep(q, vhasroom, q);
+	q->lastused = q->used->idx;
+	q->avail->flags |= Rnointerrupt;
+
+	qunlock(&ctlr->ctllock);
+	poperror();
+
+	if(ack[0] != 0)
+		print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);
+
+	return ack[0];
+}
+
+static void
+interrupt(Ureg*, void* arg)	/* irq handler: wake the sleeper of every queue that has unconsumed used entries */
+{
+	Ether *ether = arg;
+	Ctlr *c = ether->ctlr;
+	Vqueue *vq, *end;
+
+	/* bit 0 of the isr byte flags queue activity; without it there is nothing to do */
+	if((*c->isr & 1) == 0)
+		return;
+
+	/* the interrupt does not say which queue it was for, so check them all */
+	end = &c->queue[c->nqueue];
+	for(vq = c->queue; vq < end; vq++){
+		if(!vhasroom(vq))
+			continue;
+		vq->nintr++;
+		wakeup(vq);
+	}
+}
+
+static void
+attach(Ether* edev)	/* first call enables the queues, sets DRIVER_OK and starts the rx/tx kprocs; later calls do nothing */
+{
+	char name[KNAMELEN];
+	Ctlr* ctlr;
+	int i;
+
+	ctlr = edev->ctlr;
+	ilock(ctlr);
+	if(ctlr->attached){	/* already attached: nothing to do */
+		iunlock(ctlr);
+		return;
+	}
+	ctlr->attached = 1;
+
+	/* enable the queues */
+	for(i = 0; i < ctlr->nqueue; i++){
+		ctlr->cfg->queuesel = i;	/* select first, then the enable write applies to that queue */
+		ctlr->cfg->queueenable = 1;
+	}
+
+	/* driver is ready */
+	ctlr->cfg->status |= Sdriverok;
+
+	iunlock(ctlr);
+
+	/* start kprocs */
+	snprint(name, sizeof name, "#l%drx", edev->ctlrno);
+	kproc(name, rxproc, edev);
+	snprint(name, sizeof name, "#l%dtx", edev->ctlrno);
+	kproc(name, txproc, edev);
+}
+
+static long
+ifstat(Ether *edev, void *a, long n, ulong offset)	/* format the driver statistics text and copy the requested part of it to a */
+{
+	Ctlr *c = edev->ctlr;
+	Vqueue *vq;
+	char *buf;
+	int qi, used;
+
+	buf = smalloc(READSTR);
+
+	/* device feature words (high word first), then the status register */
+	used = snprint(buf, READSTR, "devfeat %32.32lub %32.32lub\n", c->feat[1], c->feat[0]);
+	used += snprint(buf+used, READSTR-used, "devstatus %8.8ub\n", c->cfg->status);
+
+	/* one line of ring indices and counters for every queue that was set up */
+	for(qi = 0; qi < c->nqueue; qi++){
+		vq = &c->queue[qi];
+		used += snprint(buf+used, READSTR-used,
+			"vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
+			qi, vq, vq->qsize, vq->avail->idx, vq->used->idx, vq->lastused, vq->nintr, vq->nnote);
+	}
+
+	n = readstr(offset, a, n, buf);
+	free(buf);
+
+	return n;
+}
+
+static void
+shutdown(Ether* edev)	/* reset the device and turn off the bus mastering that reset() turned on */
+{
+	Ctlr *ctlr = edev->ctlr;
+
+	coherence();
+	ctlr->cfg->status = 0;	/* the same status = 0 reset that pciprobe() performs */
+	coherence();
+
+	pciclrbme(ctlr->pcidev);	/* counterpart of pcisetbme() in reset() */
+}
+
+static void
+promiscuous(void *arg, int on)	/* switch promiscuous reception on or off through the control queue */
+{
+	uchar flag[1];
+
+	/* the command payload is a single byte: 1 enables, 0 disables */
+	flag[0] = on ? 1 : 0;
+	vctlcmd(arg, CtrlRx, CmdPromisc, flag, sizeof(flag));
+}
+
+static void
+multicast(void *arg, uchar*, int)	/* no per-address filtering: all-multicast is on exactly while edev->nmaddr is positive */
+{
+	Ether *ether = arg;
+	uchar allmulti[1];
+
+	allmulti[0] = ether->nmaddr > 0 ? 1 : 0;
+	vctlcmd(ether, CtrlRx, CmdAllmulti, allmulti, sizeof(allmulti));
+}
+
+static int
+initqueue(Vqueue *q, int size)	/* allocate the three ring areas for a queue of size entries; 0 on success, -1 when out of memory */
+{
+	uchar *ring;
+
+	/* descriptor table: size entries of VdescSize bytes, 16-byte aligned */
+	q->desc = mallocalign(VdescSize*size, 16, 0, 0);
+	if(q->desc == nil)
+		return -1;
+
+	/* avail ring: header, size 16-bit entries, then the 16-bit event word */
+	ring = mallocalign(VringSize + 2*size + 2, 2, 0, 0);
+	if(ring == nil)
+		goto Nodesc;
+	q->avail = (void*)ring;
+	q->availent = (void*)(ring + VringSize);
+	q->availevent = (void*)(ring + VringSize + sizeof(u16int)*size);
+
+	/* used ring: header, size Vused entries, then the 16-bit event word */
+	ring = mallocalign(VringSize + VusedSize*size + 2, 4, 0, 0);
+	if(ring == nil)
+		goto Noavail;
+	q->used = (void*)ring;
+	q->usedent = (void*)(ring + VringSize);
+	q->usedevent = (void*)(ring + VringSize + VusedSize*size);
+
+	q->qsize = size;
+	q->qmask = q->qsize - 1;
+
+	q->lastused = q->avail->idx = q->used->idx = 0;
+	q->avail->flags |= Rnointerrupt;	/* stays set until a kproc or vctlcmd() starts waiting on the queue */
+	return 0;
+
+Noavail:
+	free(q->avail);
+	q->avail = nil;
+Nodesc:
+	free(q->desc);
+	q->desc = nil;
+	return -1;
+}
+
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)	/* pcienumcaps() callback: returns 0 for a usable capability of cfg type typ, 1 otherwise */
+{
+	int barno;
+
+	/* only capabilities with id 9 whose cfg type byte (at +3) equals typ qualify */
+	if(cap != 9)
+		return 1;
+	if(pcicfgr8(p, off+3) != typ)
+		return 1;
+
+	/* skip invalid or non memory bars */
+	barno = pcicfgr8(p, off+4);
+	if(barno < 0 || barno >= nelem(p->mem))
+		return 1;
+	return p->mem[barno].size == 0 || (p->mem[barno].bar & 3) != 0;
+}
+
+static int
+virtiocap(Pcidev *p, int typ)	/* config-space offset of the virtio capability of cfg type typ; callers treat a negative result as "not found" */
+{
+	return pcienumcaps(p, matchvirtiocfgcap, typ);
+}
+
+static void*
+virtiomapregs(Pcidev *p, int cap, int size)	/* map the bar window described by the capability at cap; nil on any failure */
+{
+	int barno, caplen;
+	uvlong base;
+
+	if(cap < 0)
+		return nil;
+	/* capability layout: bar number at +4, offset within the bar at +8, length at +12 */
+	barno = pcicfgr8(p, cap+4) % nelem(p->mem);
+	base = pcicfgr32(p, cap+8);
+	caplen = pcicfgr32(p, cap+12);
+
+	/* size <= 0 asks for the whole window; otherwise the window must be at least size bytes */
+	if(size <= 0)
+		size = caplen;
+	if(caplen < size || base+caplen > p->mem[barno].size)
+		return nil;
+	return vmap(base + (p->mem[barno].bar & ~0xFULL), size);
+}
+
+static Ctlr*
+pciprobe(void)	/* find, reset and set up every virtio 1.0 net device; returns them as a list linked through Ctlr.next */
+{
+	Ctlr *c, *h, *t;
+	Pcidev *p;
+	Vconfig *cfg;
+	int bar, cap, n, i;
+
+	h = t = nil;
+
+	/* §4.1.2 PCI Device Discovery */
+	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1041);){
+		/* non-transitional devices will have a revision > 0 */
+		if(p->rid == 0)
+			continue;
+		if((cap = virtiocap(p, 1)) < 0)	/* cfg type 1: common configuration */
+			continue;
+		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+		if(cfg == nil)
+			continue;
+		if((c = mallocz(sizeof(Ctlr), 1)) == nil){
+			print("ethervirtio: no memory for Ctlr\n");
+			break;
+		}
+		c->cfg = cfg;
+		c->pcidev = p;
+		c->port = p->mem[bar].bar & ~0xFULL;	/* bar base doubles as the port that reset() matches on */
+
+		pcienable(p);
+		c->dev = virtiomapregs(p, virtiocap(p, 4), sizeof(Vnetcfg));	/* cfg type 4: device specific */
+		if(c->dev == nil)
+			goto Baddev;
+		c->isr = virtiomapregs(p, virtiocap(p, 3), 0);	/* cfg type 3: isr status */
+		if(c->isr == nil)
+			goto Baddev;
+		cap = virtiocap(p, 2);	/* cfg type 2: notification area */
+		c->notify = virtiomapregs(p, cap, 0);
+		if(c->notify == nil)
+			goto Baddev;
+		c->notifyoffmult = pcicfgr32(p, cap+16);
+
+		/* device reset */
+		coherence();
+		cfg->status = 0;
+		while(cfg->status != 0)
+			delay(1);
+		cfg->status = Sacknowledge|Sdriver;
+
+		/* negotiate feature bits */
+		cfg->devfeatsel = 1;
+		c->feat[1] = cfg->devfeat;
+		cfg->devfeatsel = 0;
+		c->feat[0] = cfg->devfeat;
+		cfg->drvfeatsel = 1;
+		cfg->drvfeat = c->feat[1] & Fversion1;
+		cfg->drvfeatsel = 0;
+		cfg->drvfeat = c->feat[0] & (Fmac|Fctrlvq|Fctrlrx);
+		cfg->status |= Sfeaturesok;
+		if((cfg->status & Sfeaturesok) == 0){	/* §3.1.1: a device that rejects our subset clears the bit again */
+			print("ethervirtio: device rejected feature set\n");
+			goto Baddev;
+		}
+
+		for(i=0; i<nelem(c->queue); i++){
+			cfg->queuesel = i;
+			n = cfg->queuesize;
+			if(n == 0 || (n & (n-1)) != 0){	/* must be a non-zero power of two */
+				if(i < 2)	/* a missing control queue (i == 2) is not an error */
+					print("ethervirtio: queue %d has invalid size %d\n", i, n);
+				break;
+			}
+			if(initqueue(&c->queue[i], n) < 0)
+				break;
+			c->queue[i].notify = c->notify + c->notifyoffmult * cfg->queuenotifyoff;
+			coherence();
+			cfg->queuedesc = PADDR(c->queue[i].desc);
+			cfg->queueavail = PADDR(c->queue[i].avail);
+			cfg->queueused = PADDR(c->queue[i].used);
+		}
+		if(i < 2){	/* rx and tx are both mandatory */
+			print("ethervirtio: no queues\n");
+Baddev:
+			pcidisable(p);
+			/* TODO, vunmap */
+			free(c);	/* NOTE(review): rings already allocated by initqueue() are not freed here */
+			continue;
+		}
+		c->nqueue = i;
+
+		if(h == nil)
+			h = c;
+		else
+			t->next = c;
+		t = c;
+	}
+
+	return h;
+}
+
+
+static int
+reset(Ether* edev)	/* bind edev to the next unclaimed controller (optionally selected by port); 0 on success, -1 if none is left */
+{
+	static uchar zeros[Eaddrlen];
+	Ctlr *ctlr;
+	int i;
+
+	if(ctlrhead == nil)	/* probe on first use; probed again on later calls if nothing was found */
+		ctlrhead = pciprobe();
+
+	for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
+		if(ctlr->active)
+			continue;
+		if(edev->port == 0 || edev->port == ctlr->port){	/* port 0 means "any device" */
+			ctlr->active = 1;
+			break;
+		}
+	}
+
+	if(ctlr == nil)
+		return -1;
+
+	edev->ctlr = ctlr;
+	edev->port = ctlr->port;
+	edev->irq = ctlr->pcidev->intl;
+	edev->tbdf = ctlr->pcidev->tbdf;
+	edev->mbps = 1000;
+	edev->link = 1;
+
+	if((ctlr->feat[0] & Fmac) != 0 && memcmp(edev->ea, zeros, Eaddrlen) == 0){	/* no address configured: adopt the device's */
+		for(i = 0; i < Eaddrlen; i++)
+			edev->ea[i] = ((uchar*)ctlr->dev)[i];
+	} else {	/* otherwise write the configured address into the device config area */
+		for(i = 0; i < Eaddrlen; i++)
+			((uchar*)ctlr->dev)[i] = edev->ea[i];
+	}
+
+	edev->arg = edev;
+
+	edev->attach = attach;
+	edev->shutdown = shutdown;
+	edev->ifstat = ifstat;
+
+	if((ctlr->feat[0] & (Fctrlvq|Fctrlrx)) == (Fctrlvq|Fctrlrx)){	/* rx-mode commands need both features */
+		edev->multicast = multicast;
+		edev->promiscuous = promiscuous;
+	}
+
+	pcisetbme(ctlr->pcidev);
+	intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);
+
+	return 0;
+}
+
+void
+ethervirtio10link(void)	/* register this driver under the card name "virtio10" with reset() as its probe routine */
+{
+	addethercard("virtio10", reset);
+}
--- /dev/null
+++ b/sys/src/9/port/sdvirtio10.c
@@ -1,0 +1,810 @@
+/*
+ * virtio 1.0 disk driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to sdvirtio.c, this driver handles the non-legacy
+ * interface for virtio disk which uses mmio for all register accesses
+ * and requires an elaborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ * 
+ * The reason why this driver is needed is that vultr sets the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "ureg.h"
+#include "../port/error.h"
+
+#include "../port/sd.h"
+
+typedef struct Vscsidev Vscsidev;
+typedef struct Vblkdev Vblkdev;
+
+typedef struct Vconfig Vconfig;
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vqueue Vqueue;
+typedef struct Vdev Vdev;
+
+
+/* device types */
+enum {
+	TypBlk	= 2,	/* added to 0x1040 to form the pci device id in viopnpdevs() */
+	TypSCSI	= 8,
+};
+
+/* status flags */
+enum {
+	Acknowledge = 1,	/* written together with Driver right after the reset */
+	Driver = 2,
+	FeaturesOk = 8,	/* set once the driver feature words are written */
+	DriverOk = 4,	/* set by vioenable() after the queues are enabled */
+	Failed = 0x80,	/* not used by this driver */
+};
+
+/* descriptor flags */
+enum {
+	Next = 1,	/* chain continues at Vdesc.next */
+	Write = 2,	/* buffer is filled in by the device */
+	Indirect = 4,	/* not used by this driver */
+};
+
+/* struct sizes */
+enum {
+	VringSize = 4,
+};
+
+enum {
+	CDBSIZE		= 32,	/* upper bounds viopnp() enforces on the device's cdb_size ... */
+	SENSESIZE	= 96,	/* ... and sense_size; they size the buffers in vioscsireq() */
+};
+
+	
+struct Vscsidev	/* scsi host device configuration, mapped by viopnp() from the cfg type 4 capability */
+{
+	u32int	num_queues;
+	u32int	seg_max;
+	u32int	max_sectors;
+	u32int	cmd_per_lun;
+	u32int	event_info_size;
+	u32int	sense_size;	/* sense bytes in each response; must not exceed SENSESIZE */
+	u32int	cdb_size;	/* cdb bytes in each request; must not exceed CDBSIZE */
+	u16int	max_channel;
+	u16int	max_target;	/* becomes the SDev's nunit; devices reporting 0 are skipped */
+	u32int	max_lun;
+};
+
+struct Vblkdev	/* block device configuration; only the leading field is used */
+{
+	u64int	capacity;	/* vioonline() uses it as the sector count with 512-byte sectors */
+};
+
+struct Vconfig {	/* common configuration registers, mapped by viopnpdevs() from the cfg type 1 capability */
+	u32int	devfeatsel;	/* selects which 32-bit feature word devfeat shows (0 = low, 1 = high) */
+	u32int	devfeat;
+	u32int	drvfeatsel;	/* same selection for the features the driver accepts */
+	u32int	drvfeat;
+
+	u16int	msixcfg;
+	u16int	nqueues;
+
+	u8int	status;	/* device status flags; writing 0 resets the device */
+	u8int	cfggen;
+	u16int	queuesel;	/* selects the queue the queue* fields below refer to */
+
+	u16int	queuesize;	/* viopnpdevs() stops at the first size that is not a non-zero power of two */
+	u16int	queuemsixvect;
+
+	u16int	queueenable;	/* vioenable() writes 1 for every queue */
+	u16int	queuenotifyoff;	/* scaled by notifyoffmult to locate the queue's notify register */
+
+	u64int	queuedesc;	/* physical addresses of the three ring areas */
+	u64int	queueavail;
+	u64int	queueused;
+};
+
+struct Vring	/* header shared by the avail and used rings (VringSize bytes) */
+{
+	u16int	flags;	/* vqio() skips the notify write when bit 0 is set in the used ring */
+	u16int	idx;	/* free-running index of the next entry the producer will write */
+};
+
+struct Vdesc	/* one descriptor table entry */
+{
+	u64int	addr;	/* physical address of the buffer */
+	u32int	len;
+	u16int	flags;	/* Next, Write, Indirect */
+	u16int	next;	/* next descriptor of the chain; also links the free list */
+};
+
+struct Vused	/* one used ring entry */
+{
+	u32int	id;	/* head descriptor index of the completed chain */
+	u32int	len;
+};
+
+struct Vqueue	/* driver-side state of one virtqueue, allocated by mkvqueue() */
+{
+	Lock;	/* guards the free list, the avail ring and rock[] */
+
+	Vdev	*dev;
+	void	*notify;	/* this queue's notify register */
+	int	idx;	/* queue number, the value written to notify */
+
+	int	size;	/* number of descriptors, a power of two */
+
+	int	free;	/* head of the free descriptor list, linked through Vdesc.next */
+	int	nfree;	/* number of descriptors on that list */
+
+	Vdesc	*desc;
+
+	Vring	*avail;	/* ring written by the driver */
+	u16int	*availent;
+	u16int	*availevent;
+
+	Vring	*used;	/* ring written by the device */
+	Vused	*usedent;
+	u16int	*usedevent;
+	u16int	lastused;	/* used->idx value up to which entries have been reaped */
+
+	void	*rock[];	/* per head descriptor: the waiter's struct Rock, nil when idle */
+};
+
+struct Vdev	/* one virtio block or scsi device found by viopnpdevs() */
+{
+	int	typ;	/* TypBlk or TypSCSI */
+
+	Pcidev	*pci;
+
+	uvlong	port;	/* base address of the bar holding the common configuration */
+	ulong	feat[2];	/* features offered by the device: [0] bits 0-31, [1] bits 32-63 */
+
+	int	nqueue;	/* queues actually set up, at most nelem(queue) */
+	Vqueue	*queue[16];
+
+	void	*dev;	/* device specific config (for scsi) */
+
+	/* registers */
+	Vconfig	*cfg;	/* common configuration */
+	u8int	*isr;	/* interrupt status byte */
+	u8int	*notify;	/* base of the notification area */
+	u32int	notifyoffmult;	/* multiplier applied to each queue's queuenotifyoff */
+
+	Vdev	*next;
+};
+
+static Vqueue*
+mkvqueue(int size)	/* allocate a queue of size descriptors, all of them on the free list; nil when out of memory */
+{
+	uintptr drvsz, devsz;
+	Vqueue *vq;
+	uchar *mem;
+	int i;
+
+	/* first region: descriptor table, avail header, avail entries, event word */
+	drvsz = sizeof(Vdesc)*size
+		+ VringSize
+		+ sizeof(u16int)*size
+		+ sizeof(u16int);
+
+	/* second region: used header, used entries, event word */
+	devsz = VringSize
+		+ sizeof(Vused)*size
+		+ sizeof(u16int);
+
+	/* the trailing rock[] array gets one slot per descriptor */
+	vq = malloc(sizeof(*vq) + sizeof(void*)*size);
+	mem = mallocalign(PGROUND(drvsz) + PGROUND(devsz), BY2PG, 0, 0);
+	if(vq == nil || mem == nil){
+		print("virtio: no memory for Vqueue\n");
+		free(mem);
+		free(vq);
+		return nil;
+	}
+
+	/* carve up the first region */
+	vq->desc = (void*)mem;
+	vq->avail = (void*)(mem + sizeof(Vdesc)*size);
+	vq->availent = (void*)((uchar*)vq->avail + VringSize);
+	vq->availevent = (void*)((uchar*)vq->availent + sizeof(u16int)*size);
+
+	/* the used ring starts on the next page boundary past the event word */
+	mem = (uchar*)PGROUND((uintptr)vq->availevent + sizeof(u16int));
+	vq->used = (void*)mem;
+	vq->usedent = (void*)(mem + VringSize);
+	vq->usedevent = (void*)(mem + VringSize + sizeof(Vused)*size);
+
+	/* free list: descriptor i links to i-1, descriptor 0 terminates it with -1 */
+	vq->nfree = vq->size = size;
+	for(i = 0; i < size; i++)
+		vq->desc[i].next = i - 1;
+	vq->free = size - 1;
+
+	return vq;
+}
+
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)	/* pcienumcaps() callback: returns 0 for a usable capability of cfg type typ, 1 otherwise */
+{
+	int barno;
+
+	/* only capabilities with id 9 whose cfg type byte (at +3) equals typ qualify */
+	if(cap != 9)
+		return 1;
+	if(pcicfgr8(p, off+3) != typ)
+		return 1;
+
+	/* skip invalid or non memory bars */
+	barno = pcicfgr8(p, off+4);
+	if(barno < 0 || barno >= nelem(p->mem))
+		return 1;
+	return p->mem[barno].size == 0 || (p->mem[barno].bar & 3) != 0;
+}
+
+static int
+virtiocap(Pcidev *p, int typ)	/* config-space offset of the virtio capability of cfg type typ; callers treat a negative result as "not found" */
+{
+	return pcienumcaps(p, matchvirtiocfgcap, typ);
+}
+
+static void*
+virtiomapregs(Pcidev *p, int cap, int size)	/* map the bar window described by the capability at cap; nil on any failure */
+{
+	int barno, caplen;
+	uvlong base;
+
+	if(cap < 0)
+		return nil;
+	/* capability layout: bar number at +4, offset within the bar at +8, length at +12 */
+	barno = pcicfgr8(p, cap+4) % nelem(p->mem);
+	base = pcicfgr32(p, cap+8);
+	caplen = pcicfgr32(p, cap+12);
+
+	/* size <= 0 asks for the whole window; otherwise the window must be at least size bytes */
+	if(size <= 0)
+		size = caplen;
+	if(caplen < size || base+caplen > p->mem[barno].size)
+		return nil;
+	return vmap(base + (p->mem[barno].bar & ~0xFULL), size);
+}
+
+static Vdev*
+viopnpdevs(int typ)	/* find, reset and set up every virtio 1.0 device of type typ; returns them as a list linked through Vdev.next */
+{
+	Vdev *vd, *h, *t;
+	Vconfig *cfg;
+	Vqueue *q;
+	Pcidev *p;
+	int cap, bar, n, i;
+
+	h = t = nil;
+	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
+		if(p->rid == 0)
+			continue;
+		if((cap = virtiocap(p, 1)) < 0)	/* cfg type 1: common configuration */
+			continue;
+		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+		if(cfg == nil)
+			continue;
+		if((vd = malloc(sizeof(*vd))) == nil){
+			print("virtio: no memory for Vdev\n");
+			break;
+		}
+		vd->port = p->mem[bar].bar & ~0xFULL;
+		vd->typ = typ;
+		vd->pci = p;
+		vd->cfg = cfg;
+		pcienable(p);
+		vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);	/* cfg type 3: isr status */
+		if(vd->isr == nil){
+Baddev:
+			pcidisable(p);
+			/* TODO: vunmap */
+			free(vd);
+			continue;
+		}
+		cap = virtiocap(p, 2);	/* cfg type 2: notification area */
+		vd->notify = virtiomapregs(p, cap, 0);
+		if(vd->notify == nil)
+			goto Baddev;
+		vd->notifyoffmult = pcicfgr32(p, cap+16);
+		/* reset */
+		cfg->status = 0;
+		while(cfg->status != 0)
+			delay(1);
+		cfg->status = Acknowledge|Driver;
+		/* negotiate feature bits */
+		cfg->devfeatsel = 1;
+		vd->feat[1] = cfg->devfeat;
+		cfg->devfeatsel = 0;
+		vd->feat[0] = cfg->devfeat;
+		cfg->drvfeatsel = 1;
+		cfg->drvfeat = vd->feat[1] & 1;	/* accept only feature bit 32 (version 1) from the high word */
+		cfg->drvfeatsel = 0;
+		cfg->drvfeat = 0;
+		cfg->status |= FeaturesOk;
+		if((cfg->status & FeaturesOk) == 0){	/* §3.1.1: a device that rejects our subset clears the bit again */
+			print("virtio: device rejected feature set\n");
+			goto Baddev;
+		}
+
+		for(i=0; i<nelem(vd->queue); i++){
+			cfg->queuesel = i;
+			n = cfg->queuesize;
+			if(n == 0 || (n & (n-1)) != 0)	/* stop at the first queue without a power-of-two size */
+				break;
+			if((q = mkvqueue(n)) == nil)
+				break;
+			q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
+			q->dev = vd;
+			q->idx = i;
+			vd->queue[i] = q;
+			coherence();
+			cfg->queuedesc = PADDR(q->desc);
+			cfg->queueavail = PADDR(q->avail);
+			cfg->queueused = PADDR(q->used);
+		}
+		vd->nqueue = i;	/* may be 0; viopnp() skips devices with too few queues */
+
+		if(h == nil)
+			h = vd;
+		else
+			t->next = vd;
+		t = vd;
+	}
+
+	return h;
+}
+
+struct Rock {	/* completion record of one request; lives on the stack of vqio() */
+	int done;	/* set by vqinterrupt() when the request's chain shows up in the used ring */
+	Rendez *sleep;	/* where the submitting process waits */
+};
+
+static void
+vqinterrupt(Vqueue *q)	/* reap the used ring: complete waiting requests and return their descriptors to the free list */
+{
+	int id, free, m;
+	struct Rock *r;
+	Rendez *z;
+
+	m = q->size-1;
+
+	ilock(q);
+	while((q->lastused ^ q->used->idx) & m){	/* NOTE(review): only the low bits are compared, so a backlog of exactly size entries would look empty */
+		id = q->usedent[q->lastused++ & m].id;
+		if(r = q->rock[id]){
+			q->rock[id] = nil;
+			z = r->sleep;	/* fetched before done is set: r is on the waiter's stack */
+			r->done = 1;	/* hands off */
+			if(z != nil)
+				wakeup(z);
+		}
+		do {	/* walk the completed chain, pushing each descriptor onto the free list */
+			free = id;
+			id = q->desc[free].next;
+			q->desc[free].next = q->free;
+			q->free = free;
+			q->nfree++;
+		} while(q->desc[free].flags & Next);
+	}
+	iunlock(q);
+}
+
+static void
+viointerrupt(Ureg *, void *arg)	/* irq handler: reap completions from the queue requests are submitted on */
+{
+	Vdev *dev = arg;
+	int qno = dev->typ == TypSCSI ? 2 : 0;	/* vioscsireq() uses queue 2, vioblkreq() queue 0 */
+	if(dev->isr[0] & 1)
+		vqinterrupt(dev->queue[qno]);
+}
+
+static int
+viodone(void *arg)	/* tsleep condition used by vqio(): true once the request's Rock is marked done */
+{
+	return ((struct Rock*)arg)->done;
+}
+
+static void
+vqio(Vqueue *q, int head)	/* publish the chain starting at head and wait for it to complete; called with q locked, returns with q unlocked */
+{
+	struct Rock rock;
+
+	rock.done = 0;
+	rock.sleep = &up->sleep;
+	q->rock[head] = &rock;
+	q->availent[q->avail->idx & (q->size-1)] = head;
+	coherence();	/* descriptors and avail entry must be visible before the index moves */
+	q->avail->idx++;
+	iunlock(q);
+	if((q->used->flags & 1) == 0)	/* bit 0 set: the device does not want to be notified */
+		*((u16int*)q->notify) = q->idx;
+	while(!rock.done){
+		while(waserror())	/* an interrupted sleep comes back here and is simply retried */
+			;
+		tsleep(rock.sleep, viodone, &rock, 1000);
+		poperror();
+
+		if(!rock.done)	/* nothing after one second: poll the used ring ourselves */
+			vqinterrupt(q);
+	}
+}
+
+static int
+vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)	/* issue one block request of type typ on queue 0 and wait for it; returns the device's status byte (0 = ok) */
+{
+	int need, free, head;
+	Vqueue *q;
+	Vdesc *d;
+
+	u8int status;
+	struct Vioblkreqhdr {
+		u32int	typ;
+		u32int	prio;
+		u64int	lba;
+	} req;
+
+	need = 2;	/* header + status, plus one descriptor for the data buffer if there is one */
+	if(a != nil)
+		need = 3;
+
+	status = -1;	/* stays 0xFF unless the device overwrites it */
+	req.typ = typ;
+	req.prio = 0;
+	req.lba = lba;
+
+	q = vd->queue[0];
+	ilock(q);
+	while(q->nfree < need){	/* out of descriptors: drop the lock and look again in half a second */
+		iunlock(q);
+		if(!waserror()){
+			tsleep(&up->sleep, return0, 0, 500);
+			poperror();	/* only on the normal path: error() has already popped the label */
+		}
+
+		ilock(q);
+	}
+
+	head = free = q->free;	/* the chain reuses the next links the free list already holds */
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(&req);
+	d->len = sizeof(req);
+	d->flags = Next;
+
+	if(a != nil){
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(a);
+		d->len = secsize*count;
+		d->flags = typ ? Next : (Write|Next);	/* typ 0 transfers into a, so the device must be allowed to write it */
+	}
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(&status);
+	d->len = sizeof(status);
+	d->flags = Write;
+
+	q->free = free;
+	q->nfree -= need;
+
+	/* queue io, unlock and wait for completion */
+	vqio(q, head);
+
+	return status;
+}
+
+static int
+vioscsireq(SDreq *r)	/* send r to the scsi host on queue 2, wait for it and fill in r->status, r->sense and r->rlen; returns r->status */
+{
+	u8int resp[4+4+2+2+SENSESIZE];	/* sense length, residue, 2 bytes not examined here, status, response, sense data */
+	u8int req[8+8+3+CDBSIZE];	/* 8 bytes lun, 8 bytes tag, 3 bytes left zero, then the cdb */
+	int free, head;
+	u32int len;
+	Vqueue *q;
+	Vdesc *d;
+	Vdev *vd;
+	SDunit *u;
+	Vscsidev *scsi;
+
+	u = r->unit;
+	vd = u->dev->ctlr;
+	scsi = vd->dev;
+
+	memset(resp, 0, sizeof(resp));
+	memset(req, 0, sizeof(req));
+	req[0] = 1;
+	req[1] = u->subno;	/* target */
+	req[2] = r->lun>>8;
+	req[3] = r->lun&0xFF;
+	*(u64int*)(&req[8]) = (uintptr)r;	/* tag: the request pointer */
+
+	memmove(&req[8+8+3], r->cmd, r->clen);
+
+	q = vd->queue[2];
+	ilock(q);
+	while(q->nfree < 3){	/* out of descriptors: drop the lock and look again in half a second */
+		iunlock(q);
+		if(!waserror()){
+			tsleep(&up->sleep, return0, 0, 500);
+			poperror();	/* only on the normal path: error() has already popped the label */
+		}
+
+		ilock(q);
+	}
+
+	head = free = q->free;	/* the chain reuses the next links the free list already holds */
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(req);
+	d->len = 8+8+3+scsi->cdb_size;
+	d->flags = Next;
+
+	if(r->write && r->dlen > 0){	/* data going to the device sits between request and response */
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(r->data);
+		d->len = r->dlen;
+		d->flags = Next;
+	}
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(resp);
+	d->len = 4+4+2+2+scsi->sense_size;
+	d->flags = Write;
+
+	if(!r->write && r->dlen > 0){	/* data coming from the device follows the response */
+		d->flags |= Next;
+
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(r->data);
+		d->len = r->dlen;
+		d->flags = Write;
+	}
+	
+	q->free = free;
+	q->nfree -= 2 + (r->dlen > 0);
+
+	/* queue io, unlock and wait for completion */
+	vqio(q, head);
+
+	/* response+status */
+	r->status = resp[10];
+	if(resp[11] != 0)
+		r->status = SDcheck;
+
+	/* sense_len */
+	len = *((u32int*)&resp[0]);
+	if(len > 0){
+		if(len > sizeof(r->sense))	/* NOTE(review): resp holds only SENSESIZE sense bytes; confirm sizeof(r->sense) does not exceed that */
+			len = sizeof(r->sense);
+		memmove(r->sense, &resp[4+4+2+2], len);
+		r->flags |= SDvalidsense;
+	}
+
+	/* data residue */
+	len = *((u32int*)&resp[4]);
+	if(len > r->dlen)
+		r->rlen = 0;
+	else
+		r->rlen = r->dlen - len;
+
+	return r->status;
+
+}
+
+static long
+viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)	/* transfer count sectors starting at lba; returns the bytes moved, raises Eio on a failed request */
+{
+	Vdev *dev = u->dev->ctlr;
+	long secsize, chunk, done;
+	uchar *buf = a;
+
+	if(dev->typ == TypSCSI)
+		return scsibio(u, lun, write, a, count, lba);
+
+	/* split the transfer into requests of at most 32 sectors each */
+	secsize = u->secsize;
+	done = 0;
+	while(count > 0){
+		chunk = count > 32 ? 32 : count;
+		if(vioblkreq(dev, write != 0, buf + done, chunk, secsize, lba) != 0)
+			error(Eio);
+		done += chunk*secsize;
+		count -= chunk;
+		lba += chunk;
+	}
+
+	return done;
+}
+
+static int
+viorio(SDreq *r)	/* run a scsi-style request: passed through on scsi hosts, emulated on block devices */
+{
+	SDunit *unit = r->unit;
+	Vdev *dev = unit->dev->ctlr;
+	int st, nsect, dir;
+	uvlong lba;
+
+	if(dev->typ == TypSCSI)
+		return vioscsireq(r);
+	switch(r->cmd[0]){
+	case 0x35:	/* both opcodes map to a data-less block request of type 4 */
+	case 0x91:
+		if(vioblkreq(dev, 4, nil, 0, 0, 0) != 0)
+			return sdsetsense(r, SDcheck, 3, 0xc, 2);
+		return sdsetsense(r, SDok, 0, 0, 0);
+	}
+	if((st = sdfakescsi(r)) != SDnostatus)
+		return r->status = st;
+	if((st = sdfakescsirw(r, &lba, &nsect, &dir)) != SDnostatus)
+		return st;
+	r->rlen = viobio(unit, r->lun, dir == SDwrite, r->data, nsect, lba);
+	return r->status = SDok;
+}
+
+static int
+vioonline(SDunit *u)	/* refresh the unit's geometry; returns 2 when the sector count changed, 1 when it did not */
+{
+	Vdev *dev = u->dev->ctlr;
+	Vblkdev *cfg;
+	uvlong nsect;
+
+	if(dev->typ == TypSCSI)
+		return scsionline(u);
+
+	/* block devices report their size in the device config area */
+	cfg = dev->dev;
+	nsect = cfg->capacity;
+	if(u->sectors == nsect)
+		return 1;
+
+	u->sectors = nsect;
+	u->secsize = 512;
+	return 2;
+}
+
+static int
+vioverify(SDunit *u)	/* unit probe: delegated to scsiverify() on scsi hosts, always 1 on block devices */
+{
+	Vdev *dev = u->dev->ctlr;
+
+	/* a block device needs no probing */
+	if(dev->typ != TypSCSI)
+		return 1;
+
+	return scsiverify(u);
+}
+
+SDifc sdvirtio10ifc;
+
+static int
+vioenable(SDev *sd)	/* turn the device on: bus mastering, interrupt handler, queues, then DRIVER_OK; always returns 1 */
+{
+	char name[32];
+	Vdev *vd;
+	int i;
+
+	vd = sd->ctlr;
+	pcisetbme(vd->pci);
+	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+	intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
+	coherence();
+
+	for(i = 0; i < vd->nqueue; i++){
+		vd->cfg->queuesel = i;	/* select first, then the enable write applies to that queue */
+		vd->cfg->queueenable = 1;
+	}
+	vd->cfg->status |= DriverOk;
+
+	return 1;
+}
+
+static int
+viodisable(SDev *sd)	/* undo the interrupt and bus-master setup of vioenable(); the device status is left untouched; always returns 1 */
+{
+	char name[32];
+	Vdev *vd;
+
+	vd = sd->ctlr;
+	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);	/* same name string vioenable() registered */
+	intrdisable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
+	pciclrbme(vd->pci);
+	return 1;
+}
+
+static SDev*
+viopnp(void)	/* enumerate block devices (ids from 'F') and scsi hosts (ids from '0'); returns them as one SDev list */
+{
+	SDev *s, *h, *t;
+	Vdev *vd;
+	int id;
+
+	h = t = nil;
+
+	id = 'F';
+	for(vd =  viopnpdevs(TypBlk); vd; vd = vd->next){
+		if(vd->nqueue == 0)	/* a block device needs at least queue 0 */
+			continue;
+
+		if((vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev))) == nil)
+			break;	/* NOTE(review): break, not continue - one unmappable device hides all later block devices */
+		if((s = malloc(sizeof(*s))) == nil)
+			break;
+		s->ctlr = vd;
+		s->idno = id++;
+		s->ifc = &sdvirtio10ifc;
+		s->nunit = 1;
+		if(h)
+			t->next = s;
+		else
+			h = s;
+		t = s;
+	}
+
+	id = '0';
+	for(vd = viopnpdevs(TypSCSI); vd; vd = vd->next){
+		Vscsidev *scsi;
+
+		if(vd->nqueue < 3)	/* requests go through queue 2, so three queues are needed */
+			continue;
+
+		if((scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev))) == nil)
+			break;
+		if(scsi->max_target == 0){
+			vunmap(scsi, sizeof(Vscsidev));
+			continue;
+		}
+		if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){	/* vioscsireq() buffers are sized by these limits */
+			print("sdvirtio: cdb %ud or sense size %ud too big\n",
+				scsi->cdb_size, scsi->sense_size);
+			vunmap(scsi, sizeof(Vscsidev));
+			continue;
+		}
+		vd->dev = scsi;
+
+		if((s = malloc(sizeof(*s))) == nil)
+			break;	/* NOTE(review): the scsi config mapping is not released on this path */
+		s->ctlr = vd;
+		s->idno = id++;
+		s->ifc = &sdvirtio10ifc;
+		s->nunit = scsi->max_target;
+
+		if(h)
+			t->next = s;
+		else
+			h = s;
+		t = s;
+	}
+	return h;
+}
+
+SDifc sdvirtio10ifc = {	/* entry points of this driver; nil slots are operations it does not provide */
+	"virtio10",			/* name */
+
+	viopnp,				/* pnp */
+	nil,				/* legacy */
+	vioenable,			/* enable */
+	viodisable,			/* disable */
+
+	vioverify,			/* verify */
+	vioonline,			/* online */
+	viorio,				/* rio */
+	nil,				/* rctl */
+	nil,				/* wctl */
+
+	viobio,				/* bio */
+	nil,				/* probe */
+	nil,				/* clear */
+	nil,				/* rtopctl */
+	nil,				/* wtopctl */
+};