ref: c83d6627b1ecdabdd1864e6c9a524aa90bfd1a9d
dir: /sys/src/9/pc/sdvirtio10.c/
/*
 * virtio 1.0 disk driver
 * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
 *
 * In contrast to sdvirtio.c, this driver handles the non-legacy
 * interface for virtio disk which uses mmio for all register accesses
 * and requires an elaborate pci capability structure dance to get working.
 *
 * It is kind of pointless as it is most likely slower than
 * port i/o (harder to emulate on the pc platform).
 * 
 * The reason why this driver is needed is that vultr set the
 * disable-legacy=on option in the -device parameter for qemu
 * on their hypervisor.
 */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/pci.h"
#include "ureg.h"
#include "../port/error.h"
#include "../port/sd.h"
/* type names for the structures defined further down in this file */
typedef struct Vscsidev Vscsidev;
typedef struct Vblkdev Vblkdev;
typedef struct Vconfig Vconfig;
typedef struct Vring Vring;
typedef struct Vdesc Vdesc;
typedef struct Vused Vused;
typedef struct Vqueue Vqueue;
typedef struct Vdev Vdev;
/*
 * device types; viopnpdevs() adds the type to 0x1040 to form
 * the pci device id it searches for.
 */
enum {
	TypBlk	= 2,
	TypSCSI	= 8,
};
/*
 * status flags: bits of Vconfig.status.
 * viopnpdevs() sets Acknowledge|Driver after the reset and FeaturesOk
 * after writing the driver features; vioenable() sets DriverOk.
 */
enum {
	Acknowledge = 1,
	Driver = 2,
	FeaturesOk = 8,
	DriverOk = 4,
	Failed = 0x80,
};
/* descriptor flags: bits of Vdesc.flags */
enum {
	Next = 1,	/* the chain continues with the descriptor named by Vdesc.next */
	Write = 2,	/* set on buffers that receive data from the device (status, response, read data) */
	Indirect = 4,	/* not used by the code in this file */
};
/* struct sizes */
enum {
	VringSize = 4,	/* bytes taken by a Vring header (two u16int) when laying out the rings in mkvqueue() */
};
/*
 * largest cdb and sense sizes handled by this driver: they size the
 * request and response buffers in vioscsireq(), and viopnp() skips
 * scsi devices whose configured sizes are larger.
 */
enum {
	CDBSIZE		= 32,
	SENSESIZE	= 96,
};
	
/*
 * Device specific configuration registers of a scsi device.
 * viopnp() maps them through the virtio pci capability of type 4;
 * only cdb_size, sense_size and max_target are used in this file.
 */
struct Vscsidev
{
	u32int	num_queues;
	u32int	seg_max;
	u32int	max_sectors;
	u32int	cmd_per_lun;
	u32int	event_info_size;
	u32int	sense_size;	/* added to the fixed response header length in vioscsireq() */
	u32int	cdb_size;	/* added to the fixed request header length in vioscsireq() */
	u16int	max_channel;
	u16int	max_target;	/* becomes the number of units of the SDev in viopnp() */
	u32int	max_lun;
};
/*
 * Device specific configuration registers of a block device,
 * mapped by viopnp() through the virtio pci capability of type 4.
 */
struct Vblkdev
{
	u64int	capacity;	/* vioonline() uses this as the sector count, with 512 byte sectors */
};
/*
 * Common configuration registers, mapped by viopnpdevs() through the
 * virtio pci capability of type 1. The queue* registers refer to the
 * queue last written to queuesel.
 */
struct Vconfig {
	u32int	devfeatsel;	/* selects which 32 bit word of device features devfeat shows */
	u32int	devfeat;
	u32int	drvfeatsel;	/* selects which 32 bit word of driver features drvfeat writes */
	u32int	drvfeat;
	u16int	msixcfg;
	u16int	nqueues;
	u8int	status;		/* status flags; writing 0 resets the device */
	u8int	cfggen;
	u16int	queuesel;
	u16int	queuesize;
	u16int	queuemsixvect;
	u16int	queueenable;
	u16int	queuenotifyoff;	/* multiplied by Vdev.notifyoffmult to locate the notify register */
	u64int	queuedesc;	/* physical address of the descriptor table */
	u64int	queueavail;	/* physical address of the available ring */
	u64int	queueused;	/* physical address of the used ring */
};
/* header shared by the available and the used ring; the entries follow it in memory */
struct Vring
{
	u16int	flags;
	u16int	idx;	/* running count of entries added to the ring, masked with size-1 to index it */
};
/* one entry of the descriptor table: describes a single buffer of a request */
struct Vdesc
{
	u64int	addr;	/* physical address of the buffer */
	u32int	len;	/* length of the buffer in bytes */
	u16int	flags;	/* descriptor flags (Next, Write) */
	u16int	next;	/* next descriptor of the chain; also links the free list */
};
/* one entry of the used ring */
struct Vused
{
	u32int	id;	/* head descriptor index of the completed chain */
	u32int	len;	/* not read by this driver */
};
/*
 * Driver side state of one virtqueue, created by mkvqueue().
 * The embedded Lock protects the free list, the rings and rock[].
 */
struct Vqueue
{
	Lock;
	Vdev	*dev;
	void	*notify;	/* notify register of this queue */
	int	idx;		/* queue number, the value written to notify */
	int	size;		/* number of descriptors (a power of two) */
	int	free;		/* head of the free descriptor list, linked through Vdesc.next */
	int	nfree;		/* number of descriptors on the free list */
	Vdesc	*desc;
	Vring	*avail;
	u16int	*availent;
	u16int	*availevent;
	Vring	*used;
	Vused	*usedent;
	u16int	*usedevent;
	u16int	lastused;	/* count of used entries already processed by vqinterrupt() */
	void	*rock[];	/* per head descriptor: struct Rock of the waiting request, or nil */
};
/* one virtio pci device found by viopnpdevs() */
struct Vdev
{
	int	typ;		/* TypBlk or TypSCSI */
	Pcidev	*pci;
	uvlong	port;		/* base address of the bar holding the common configuration */
	ulong	feat[2];	/* device feature words 0 and 1 as read during negotiation */
	int	nqueue;		/* number of valid entries in queue[] */
	Vqueue	*queue[16];
	void	*dev;	/* device specific config (Vblkdev or Vscsidev, set by viopnp) */
	/* registers */
	Vconfig	*cfg;
	u8int	*isr;
	u8int	*notify;
	u32int	notifyoffmult;
	Vdev	*next;
};
static Vqueue*
mkvqueue(int size)
{
	Vqueue *q;
	uchar *p;
	int i;
	q = malloc(sizeof(*q) + sizeof(void*)*size);
	p = mallocalign(
		PGROUND(sizeof(Vdesc)*size + 
			VringSize + 
			sizeof(u16int)*size + 
			sizeof(u16int)) +
		PGROUND(VringSize + 
			sizeof(Vused)*size + 
			sizeof(u16int)), 
		BY2PG, 0, 0);
	if(p == nil || q == nil){
		print("virtio: no memory for Vqueue\n");
		free(p);
		free(q);
		return nil;
	}
	q->desc = (void*)p;
	p += sizeof(Vdesc)*size;
	q->avail = (void*)p;
	p += VringSize;
	q->availent = (void*)p;
	p += sizeof(u16int)*size;
	q->availevent = (void*)p;
	p += sizeof(u16int);
	p = (uchar*)PGROUND((uintptr)p);
	q->used = (void*)p;
	p += VringSize;
	q->usedent = (void*)p;
	p += sizeof(Vused)*size;
	q->usedevent = (void*)p;
	q->free = -1;
	q->nfree = q->size = size;
	for(i=0; i<size; i++){
		q->desc[i].next = q->free;
		q->free = i;
	}
	return q;
}
/*
 * Match function handed to pcienumcaps() by virtiocap().
 *
 * Accepts the capability at config space offset off when its id cap
 * is 9, the byte at off+3 equals typ and the bar number at off+4
 * names a non-empty bar whose low two bits are clear.
 * Returns 0 for an acceptable capability and 1 otherwise.
 */
static int
matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
{
	int bar;

	if(cap != 9)
		return 1;
	if(pcicfgr8(p, off+3) != typ)
		return 1;

	/* skip invalid or non memory bars */
	bar = pcicfgr8(p, off+4);
	if(bar < 0 || bar >= nelem(p->mem))
		return 1;
	if(p->mem[bar].size == 0)
		return 1;
	if((p->mem[bar].bar & 3) != 0)
		return 1;
	return 0;
}
/*
 * Search device p for the virtio pci capability of type typ using
 * matchvirtiocfgcap(). The result of pcienumcaps() is returned as is;
 * the callers in this file treat a negative value as "not found" and
 * anything else as the config space offset of the capability.
 */
static int
virtiocap(Pcidev *p, int typ)
{
	int off;

	off = pcienumcaps(p, matchvirtiocfgcap, typ);
	return off;
}
/*
 * Map the register window described by the virtio pci capability at
 * config space offset cap (as returned by virtiocap()).
 *
 * size is the number of bytes the caller needs; with size <= 0 the
 * whole window length given by the capability is mapped.
 * Returns nil when cap is negative, when the window is empty, shorter
 * than size or does not fit into its bar, or when vmap() fails.
 */
static void*
virtiomapregs(Pcidev *p, int cap, int size)
{
	int bar, len;
	uvlong addr;

	if(cap < 0)
		return nil;
	bar = pcicfgr8(p, cap+4) % nelem(p->mem);
	addr = pcicfgr32(p, cap+8);
	len = pcicfgr32(p, cap+12);
	/*
	 * len is a signed int: a length with the top bit set would be
	 * sign extended in the bounds check below and wrap it around,
	 * and an empty window leaves nothing to map.
	 */
	if(len <= 0)
		return nil;
	if(size <= 0)
		size = len;
	else if(len < size)
		return nil;
	if(addr+len > p->mem[bar].size)
		return nil;
	/* the low four bits of the bar are flags, not part of the address */
	addr += p->mem[bar].bar & ~0xFULL;
	return vmap(addr, size);
}
static Vdev*
viopnpdevs(int typ)
{
	Vdev *vd, *h, *t;
	Vconfig *cfg;
	Vqueue *q;
	Pcidev *p;
	int cap, bar;
	int n, i;
	h = t = nil;
	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
		if(p->rid == 0)
			continue;
		if((cap = virtiocap(p, 1)) < 0)
			continue;
		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
		if(cfg == nil)
			continue;
		if((vd = malloc(sizeof(*vd))) == nil){
			print("virtio: no memory for Vdev\n");
			break;
		}
		vd->port = p->mem[bar].bar & ~0xFULL;
		vd->typ = typ;
		vd->pci = p;
		vd->cfg = cfg;
		pcienable(p);
		vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);
		if(vd->isr == nil){
Baddev:
			pcidisable(p);
			/* TODO: vunmap */
			free(vd);
			continue;
		}
		cap = virtiocap(p, 2);
		vd->notify = virtiomapregs(p, cap, 0);
		if(vd->notify == nil)
			goto Baddev;
		vd->notifyoffmult = pcicfgr32(p, cap+16);
		/* reset */
		cfg->status = 0;
		while(cfg->status != 0)
			delay(1);
		cfg->status = Acknowledge|Driver;
		/* negotiate feature bits */
		cfg->devfeatsel = 1;
		vd->feat[1] = cfg->devfeat;
		cfg->devfeatsel = 0;
		vd->feat[0] = cfg->devfeat;
		cfg->drvfeatsel = 1;
		cfg->drvfeat = vd->feat[1] & 1;
		cfg->drvfeatsel = 0;
		cfg->drvfeat = 0;
		cfg->status |= FeaturesOk;
		for(i=0; i<nelem(vd->queue); i++){
			cfg->queuesel = i;
			n = cfg->queuesize;
			if(n == 0 || (n & (n-1)) != 0)
				break;
			if((q = mkvqueue(n)) == nil)
				break;
			q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
			q->dev = vd;
			q->idx = i;
			vd->queue[i] = q;
			coherence();
			cfg->queuedesc = PADDR(q->desc);
			cfg->queueavail = PADDR(q->avail);
			cfg->queueused = PADDR(q->used);
		}
		vd->nqueue = i;
	
		if(h == nil)
			h = vd;
		else
			t->next = vd;
		t = vd;
	}
	return h;
}
/*
 * Completion record of one request in flight. vqio() keeps it on its
 * stack and stores its address in Vqueue.rock[] at the index of the
 * head descriptor.
 */
struct Rock {
	int done;	/* set to 1 by vqinterrupt() when the request shows up in the used ring */
	Rendez *sleep;	/* woken up by vqinterrupt() after done was set */
};
/*
 * Process the used ring of q: for every entry not yet seen, mark the
 * waiting request as done, wake up its process and return its
 * descriptor chain to the free list.
 * Called from viointerrupt() and, as a poll, from vqio().
 */
static void
vqinterrupt(Vqueue *q)
{
	int id, free, m;
	struct Rock *r;
	Rendez *z;
	m = q->size-1;	/* index mask; viopnpdevs() only accepts power of two sizes */
	ilock(q);
	/* run until lastused has caught up with the index of the used ring */
	while((q->lastused ^ q->used->idx) & m){
		id = q->usedent[q->lastused++ & m].id;
		if(r = q->rock[id]){
			q->rock[id] = nil;
			/* fetch the rendez first: the rock is on the stack of vqio(), which stops waiting once done is set */
			z = r->sleep;
			r->done = 1;	/* hands off */
			if(z != nil)
				wakeup(z);
		}
		/* walk the chain starting at the head and push each descriptor on the free list */
		do {
			free = id;
			id = q->desc[free].next;
			q->desc[free].next = q->free;
			q->free = free;
			q->nfree++;
		} while(q->desc[free].flags & Next);
	}
	iunlock(q);
}
/*
 * Interrupt handler registered by vioenable(); arg is the Vdev.
 * When bit 0 of the isr register is set, the request queue is
 * processed: queue 2 for scsi devices, queue 0 otherwise.
 */
static void
viointerrupt(Ureg *, void *arg)
{
	Vdev *vd;
	int qi;

	vd = arg;
	if((vd->isr[0] & 1) == 0)
		return;
	qi = 0;
	if(vd->typ == TypSCSI)
		qi = 2;
	vqinterrupt(vd->queue[qi]);
}
/*
 * Sleep condition used by vqio(): arg points to the struct Rock of
 * the request; returns its done flag.
 */
static int
viodone(void *arg)
{
	struct Rock *r;

	r = arg;
	return r->done;
}
/*
 * Hand the descriptor chain starting at head to the device and wait
 * until vqinterrupt() reports it as done.
 * The callers in this file enter with q locked (ilock); the lock is
 * released here once the request is in the available ring.
 */
static void
vqio(Vqueue *q, int head)
{
	struct Rock rock;
	rock.done = 0;
	rock.sleep = &up->sleep;
	q->rock[head] = &rock;
	/* store head in the next available slot; the index is only bumped after coherence() */
	q->availent[q->avail->idx & (q->size-1)] = head;
	coherence();
	q->avail->idx++;
	iunlock(q);
	/* write our queue number to the notify register unless bit 0 of the used ring flags is set */
	if((q->used->flags & 1) == 0)
		*((u16int*)q->notify) = q->idx;
	while(!rock.done){
		/* an error raised while sleeping lands here; just sleep again, rock must stay valid until done */
		while(waserror())
			;
		tsleep(rock.sleep, viodone, &rock, 1000);
		poperror();
		/* woke up without completion (such as after the 1000ms timeout): poll the used ring ourselves */
		if(!rock.done)
			vqinterrupt(q);
	}
}
/*
 * Perform one request on queue 0 of the block device vd and wait for
 * its completion.
 *
 * typ is stored in the request header. a is the data buffer of
 * count*secsize bytes, or nil for a request without data, and lba the
 * starting sector. With typ == 0 the data buffer is flagged Write so
 * the device fills it in; with any other typ the device only reads it.
 *
 * Returns the status byte stored by the device. It is preset to
 * (u8int)-1; the callers treat every non-zero value as failure.
 */
static int
vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)
{
	int need, free, head;
	Vqueue *q;
	Vdesc *d;
	u8int status;
	struct Vioblkreqhdr {
		u32int	typ;
		u32int	prio;
		u64int	lba;
	} req;

	/* header and status, plus one descriptor for the data buffer */
	need = 2;
	if(a != nil)
		need = 3;

	status = -1;
	req.typ = typ;
	req.prio = 0;
	req.lba = lba;

	q = vd->queue[0];
	ilock(q);
	while(q->nfree < need){
		iunlock(q);
		/*
		 * wait for descriptors to be freed. when tsleep() is
		 * interrupted, the error path has already popped the
		 * label pushed by waserror(), so poperror() may only
		 * run after a normal return.
		 */
		if(!waserror()){
			tsleep(&up->sleep, return0, 0, 500);
			poperror();
		}
		ilock(q);
	}

	/* take the descriptors from the free list, remembering the head */
	head = free = q->free;

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(&req);
	d->len = sizeof(req);
	d->flags = Next;

	if(a != nil){
		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(a);
		d->len = secsize*count;
		d->flags = typ ? Next : (Write|Next);
	}

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(&status);
	d->len = sizeof(status);
	d->flags = Write;

	q->free = free;
	q->nfree -= need;

	/* queue io, unlock and wait for completion */
	vqio(q, head);
	return status;
}
/*
 * Send the scsi request r to the device through queue 2 and wait for
 * its completion.
 *
 * The request buffer holds: byte 0 = 1, byte 1 = unit number,
 * bytes 2-3 = lun, bytes 8-15 = the address of r as an identifier and
 * the cdb from byte 19 on. The response buffer holds the sense length
 * at offset 0, the data residue at offset 4, the status at byte 10,
 * a response code at byte 11 and the sense data from byte 12 on.
 *
 * Fills in r->status, r->sense, r->flags and r->rlen and returns
 * r->status.
 */
static int
vioscsireq(SDreq *r)
{
	u8int resp[4+4+2+2+SENSESIZE];
	u8int req[8+8+3+CDBSIZE];
	int free, head;
	u32int len;
	Vqueue *q;
	Vdesc *d;
	Vdev *vd;
	SDunit *u;
	Vscsidev *scsi;

	u = r->unit;
	vd = u->dev->ctlr;
	scsi = vd->dev;

	memset(resp, 0, sizeof(resp));
	memset(req, 0, sizeof(req));
	req[0] = 1;
	req[1] = u->subno;
	req[2] = r->lun>>8;
	req[3] = r->lun&0xFF;
	*(u64int*)(&req[8]) = (uintptr)r;
	memmove(&req[8+8+3], r->cmd, r->clen);

	q = vd->queue[2];
	ilock(q);
	while(q->nfree < 3){
		iunlock(q);
		/*
		 * wait for descriptors to be freed. when tsleep() is
		 * interrupted, the error path has already popped the
		 * label pushed by waserror(), so poperror() may only
		 * run after a normal return.
		 */
		if(!waserror()){
			tsleep(&up->sleep, return0, 0, 500);
			poperror();
		}
		ilock(q);
	}

	/* take the descriptors from the free list, remembering the head */
	head = free = q->free;

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(req);
	d->len = 8+8+3+scsi->cdb_size;
	d->flags = Next;

	/* data going to the device sits between request and response */
	if(r->write && r->dlen > 0){
		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(r->data);
		d->len = r->dlen;
		d->flags = Next;
	}

	d = &q->desc[free]; free = d->next;
	d->addr = PADDR(resp);
	d->len = 4+4+2+2+scsi->sense_size;
	d->flags = Write;

	/* data coming from the device follows the response */
	if(!r->write && r->dlen > 0){
		d->flags |= Next;

		d = &q->desc[free]; free = d->next;
		d->addr = PADDR(r->data);
		d->len = r->dlen;
		d->flags = Write;
	}

	q->free = free;
	q->nfree -= 2 + (r->dlen > 0);

	/* queue io, unlock and wait for completion */
	vqio(q, head);

	/* response+status */
	r->status = resp[10];
	if(resp[11] != 0)
		r->status = SDcheck;

	/* sense_len */
	len = *((u32int*)&resp[0]);
	if(len > 0){
		if(len > sizeof(r->sense))
			len = sizeof(r->sense);
		memmove(r->sense, &resp[4+4+2+2], len);
		r->flags |= SDvalidsense;
	}

	/* data residue */
	len = *((u32int*)&resp[4]);
	if(len > r->dlen)
		r->rlen = 0;
	else
		r->rlen = r->dlen - len;
	return r->status;
}
/*
 * Block i/o entry point of the SDifc.
 *
 * Scsi devices are handed to scsibio(). For block devices count
 * sectors starting at lba are transferred to or from a in requests of
 * at most 32 sectors each; a failed request raises Eio.
 * Returns the number of bytes transferred.
 */
static long
viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
{
	Vdev *vd;
	uchar *buf;
	long secsize, nsec, done;

	vd = u->dev->ctlr;
	if(vd->typ == TypSCSI)
		return scsibio(u, lun, write, a, count, lba);

	buf = a;
	secsize = u->secsize;
	done = 0;
	while(count > 0){
		/* at most 32 sectors per request */
		nsec = count;
		if(nsec > 32)
			nsec = 32;
		if(vioblkreq(vd, write != 0, buf + done, nsec, secsize, lba) != 0)
			error(Eio);
		lba += nsec;
		count -= nsec;
		done += nsec*secsize;
	}
	return done;
}
/*
 * Raw scsi command entry point of the SDifc.
 *
 * Scsi devices get the request passed on by vioscsireq(). For block
 * devices the commands 0x35 and 0x91 are turned into a request of type
 * 4 without data (presumably synchronize cache mapped to a flush -
 * confirm against the scsi and virtio specifications); everything else
 * is emulated with sdfakescsi(), sdfakescsirw() and viobio().
 */
static int
viorio(SDreq *r)
{
	Vdev *vd;
	uvlong lba;
	int st, count, rw;

	vd = r->unit->dev->ctlr;
	if(vd->typ == TypSCSI)
		return vioscsireq(r);

	switch(r->cmd[0]){
	case 0x35:
	case 0x91:
		if(vioblkreq(vd, 4, nil, 0, 0, 0) == 0)
			return sdsetsense(r, SDok, 0, 0, 0);
		return sdsetsense(r, SDcheck, 3, 0xc, 2);
	}

	st = sdfakescsi(r);
	if(st != SDnostatus){
		r->status = st;
		return r->status;
	}
	st = sdfakescsirw(r, &lba, &count, &rw);
	if(st != SDnostatus)
		return st;

	r->rlen = viobio(r->unit, r->lun, rw == SDwrite, r->data, count, lba);
	r->status = SDok;
	return r->status;
}
/*
 * Online entry point of the SDifc.
 *
 * Scsi devices are handled by scsionline(). For block devices the
 * capacity from the device configuration is compared with the sector
 * count of the unit: when it differs, the unit is updated (with 512
 * byte sectors) and 2 is returned, otherwise 1.
 */
static int
vioonline(SDunit *u)
{
	Vdev *vd;
	Vblkdev *blk;
	uvlong nsec;

	vd = u->dev->ctlr;
	if(vd->typ == TypSCSI)
		return scsionline(u);

	blk = vd->dev;
	nsec = blk->capacity;
	if(u->sectors == nsec)
		return 1;
	u->sectors = nsec;
	u->secsize = 512;
	return 2;
}
/*
 * Verify entry point of the SDifc: block devices always verify
 * (returns 1), scsi devices are checked by scsiverify().
 */
static int
vioverify(SDunit *u)
{
	Vdev *vd;

	vd = u->dev->ctlr;
	if(vd->typ != TypSCSI)
		return 1;
	return scsiverify(u);
}
/* initialized at the end of the file; declared here because viopnp() stores its address in each SDev */
SDifc sdvirtio10ifc;
/*
 * Enable entry point of the SDifc: turn on bus mastering, install the
 * interrupt handler, enable all queues set up by viopnpdevs() and
 * finally set DriverOk in the device status. Always returns 1.
 */
static int
vioenable(SDev *sd)
{
	char name[32];
	Vconfig *cfg;
	Vdev *vd;
	int qi;

	vd = sd->ctlr;
	cfg = vd->cfg;
	pcisetbme(vd->pci);
	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
	intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
	coherence();

	/* each queue is selected first, then enabled */
	qi = 0;
	while(qi < vd->nqueue){
		cfg->queuesel = qi;
		cfg->queueenable = 1;
		qi++;
	}
	cfg->status |= DriverOk;
	return 1;
}
/*
 * Disable entry point of the SDifc: remove the interrupt handler
 * installed by vioenable() (using the same name) and turn bus
 * mastering off again. Always returns 1.
 */
static int
viodisable(SDev *sd)
{
	char name[32];
	Pcidev *pci;
	Vdev *vd;

	vd = sd->ctlr;
	pci = vd->pci;
	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
	intrdisable(pci->intl, viointerrupt, vd, pci->tbdf, name);
	pciclrbme(pci);
	return 1;
}
/*
 * Pnp entry point of the SDifc: build the list of SDev controllers.
 *
 * Block devices come first, with id letters counting up from 'F' and
 * one unit each; they need at least one queue. Scsi devices follow,
 * with ids counting up from '0' and max_target units; they need at
 * least three queues and cdb and sense sizes that fit CDBSIZE and
 * SENSESIZE. A failure to map the device configuration or to allocate
 * the SDev ends the scan of that device type.
 */
static SDev*
viopnp(void)
{
	SDev *sd, *head, *tail;
	Vscsidev *scsi;
	Vdev *vd;
	int idno;

	head = nil;
	tail = nil;

	idno = 'F';
	for(vd =  viopnpdevs(TypBlk); vd != nil; vd = vd->next){
		if(vd->nqueue == 0)
			continue;

		vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev));
		if(vd->dev == nil)
			break;
		sd = malloc(sizeof(*sd));
		if(sd == nil)
			break;
		sd->ctlr = vd;
		sd->idno = idno++;
		sd->ifc = &sdvirtio10ifc;
		sd->nunit = 1;

		if(head == nil)
			head = sd;
		else
			tail->next = sd;
		tail = sd;
	}

	idno = '0';
	for(vd = viopnpdevs(TypSCSI); vd != nil; vd = vd->next){
		if(vd->nqueue < 3)
			continue;

		scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev));
		if(scsi == nil)
			break;
		/* nothing to attach without targets */
		if(scsi->max_target == 0){
			vunmap(scsi, sizeof(Vscsidev));
			continue;
		}
		/* the buffers in vioscsireq() cannot hold more than this */
		if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){
			print("sdvirtio: cdb %ud or sense size %ud too big\n",
				scsi->cdb_size, scsi->sense_size);
			vunmap(scsi, sizeof(Vscsidev));
			continue;
		}
		vd->dev = scsi;

		sd = malloc(sizeof(*sd));
		if(sd == nil)
			break;
		sd->ctlr = vd;
		sd->idno = idno++;
		sd->ifc = &sdvirtio10ifc;
		sd->nunit = scsi->max_target;

		if(head == nil)
			head = sd;
		else
			tail->next = sd;
		tail = sd;
	}
	return head;
}
/*
 * Interface table of this driver; entries left nil are not
 * implemented here.
 */
SDifc sdvirtio10ifc = {
	"virtio10",			/* name */
	viopnp,				/* pnp */
	nil,				/* legacy */
	vioenable,			/* enable */
	viodisable,			/* disable */
	vioverify,			/* verify */
	vioonline,			/* online */
	viorio,				/* rio */
	nil,				/* rctl */
	nil,				/* wctl */
	viobio,				/* bio */
	nil,				/* probe */
	nil,				/* clear */
	nil,				/* rtopctl */
	nil,				/* wtopctl */
};