git: 9front

ref: 30999a82bb72c1870a800bcb1fc7b4558dd93b8c
dir: /sys/src/9/port/devfs.c/

View raw version
/*
 * File system devices.
 * Follows device config in Ken's file server.
 * Builds mirrors, concatenations, interleavings, and partitions
 * of devices out of other (inner) devices.
 * It is ok if inner devices are provided by this driver.
 *
 * Built files are grouped on different directories
 * (called trees, and used to represent disks).
 * The "#k/fs" tree is always available and never goes away.
 * Configuration changes happen only while no I/O is in progress.
 *
 * Default sector size is one byte unless changed by the "disk" ctl.
 */

#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "ureg.h"
#include "../port/error.h"
#include "../port/sd.h"
#include <libsec.h>

/*
 * Device/command types, default sizes, qid layout and tunables.
 * The F* values are used both as Fsdev.type and as the index
 * of the ctl commands in configs[].
 */
enum
{
	Fnone,			/* zero value: no type assigned */
	Fmirror,		/* mirror of others */
	Fcat,			/* catenation of others */
	Finter,			/* interleaving of others */
	Fpart,			/* part of other */
	Fclear,			/* start over */
	Fdel,			/* delete a configured device */
	Fdisk,			/* set default tree and sector size */
	Fcrypt,			/* encrypted device */

	Sectorsz = 1,		/* default sector size, in bytes */
	Blksize	= 8*1024,	/* for Finter only */
	Cryptsectsz = 512,	/* for Fcrypt only */

	Incr = 5,		/* Increments for the dev array */

	/*
	 * All qids are decorated with the tree number.
	 * #k/fs is tree number 0, is automatically added and
	 * its first qid is for the ctl file. It never goes away.
	 */
	Qtop	= 0,		/* #k */
	Qdir,			/* directory (#k/fs) */
	Qctl,			/* ctl, only for #k/fs/ctl */
	Qfirst,			/* first qid assigned for device */

	/* values for the isread argument of the i/o helpers */
	Iswrite = 0,
	Isread,

	/* values for the mustexist argument of gettree/getdev */
	Optional = 0,
	Mustexist,

	/* tunable parameters */
	Maxconf	= 4*1024,	/* max length for config */
	Ndevs	= 32,		/* max. inner devs per command */
	Ntrees	= 128,		/* max. number of trees */
	Maxretries = 3,		/* max. retries of i/o errors */
	Retrypause = 5000,	/* ms. to pause between retries */
};

typedef struct Inner Inner;
typedef struct Fsdev Fsdev;
typedef struct Tree Tree;
typedef struct Key Key;

/*
 * One inner (underlying) device of a configured device.
 */
struct Inner
{
	char	*iname;		/* inner device name */
	vlong	isize;		/* size of inner device */
	Chan	*idev;		/* inner device */
};

/*
 * A configured device: a file inside one of the trees.
 */
struct Fsdev
{
	Ref;			/* one per Chan doing I/O */
	int	gone;		/* true if removed */
	int	vers;		/* qid version for this device */
	int	type;		/* Fnone, Fmirror, ... */
	char	*name;		/* name for this fsdev */
	Tree*	tree;		/* where the device is kept */
	vlong	size;		/* size in bytes; computed per type by setdsize() */
	vlong	start;		/* start address (for Fpart and Fcrypt) */
	uint	ndevs;		/* number of inner devices */
	Inner	*inner[Ndevs];	/* inner devices */
	Key	*key;		/* crypt key */
};

/*
 * A directory of configured devices (#k/<name>).
 */
struct Tree
{
	char	*name;		/* name for #k/<name> */
	Fsdev	**devs;		/* devices in dir. (slots may be nil) */
	uint	ndevs;		/* number of devices */
	uint	nadevs;		/* number of allocated devices in devs */
};

/*
 * AES states used by Fcrypt devices with aes_xts_encrypt/decrypt:
 * one set up from the first half of the key, one from the second.
 */
struct Key {
	AESstate tweak, ecb;
};

/*
 * Debug print. Expands to a bare "if(debug)print", so it must not be
 * used as the body of an if that has an else.
 */
#define dprint if(debug)print

extern Dev fsdevtab;		/* forward */

static RWlock lck;		/* r: use devices; w: change config  */
static Tree fstree;		/* The main "fs" tree. Never goes away */
static Tree *trees[Ntrees];	/* internal representation of config */
static int ntrees;		/* slots of trees[] in use (high-water mark; entries may be nil) */
static int qidvers;		/* source of Fsdev.vers values; bumped on create and delete */
static char *disk;		/* default tree name used */
static char *source;		/* default inner device used */
static int sectorsz = Sectorsz;	/* default sector size */
static char *confstr;		/* textual configuration */

static int debug;		/* enables dprint output when non-zero */

static Qid tqid = {Qtop, 0, QTDIR};	/* qid of #k itself */
static Qid cqid = {Qctl, 0, 0};		/* qid of #k/fs/ctl */

/* configuration keyword for each device type, as printed by seprintdev() */
static char* tnames[] = {
	[Fmirror]	"mirror",
	[Fcat]		"cat",
	[Finter]	"inter",
	[Fpart]		"part",
	[Fcrypt]	"crypt",
};

/*
 * ctl commands, looked up by lookupcmd() in parseconfig().
 * NOTE(review): the third field is presumably the argument count
 * enforced by lookupcmd (0 meaning "not checked there"); the
 * remaining checks are done by parseconfig(). Confirm against Cmdtab.
 */
static Cmdtab configs[] = {
	Fmirror,"mirror",	0,
	Fcat,	"cat",		0,
	Finter,	"inter",	0,
	Fpart,	"part",		0,
	Fclear,	"clear",	1,
	Fdel,	"del",		2,
	Fdisk,	"disk",		0,
	Fcrypt,	"crypt",	0,
};

static char Egone[] = "device is gone";		/* file has been removed */

/*
 * Append the configuration line describing mp to the buffer [s, e).
 * The line is "<type> [<tree>/]<name> <inner>..." followed, for
 * partitions, by the start and size.  Returns the new end of string.
 */
static char*
seprintdev(char *s, char *e, Fsdev *mp)
{
	int n;
	char *kind;
	Inner **inp;

	if(mp == nil)
		return seprint(s, e, "<null Fsdev>");

	/* only types that have a keyword in tnames[] can be printed */
	kind = nil;
	if(mp->type >= 0 && mp->type < nelem(tnames))
		kind = tnames[mp->type];
	if(kind == nil)
		return seprint(s, e, "bad device type %d\n", mp->type);

	s = strecpy(s, e, kind);

	/* devices in the main "fs" tree are printed without a tree prefix */
	if(mp->tree == &fstree)
		s = seprint(s, e, " %s", mp->name);
	else
		s = seprint(s, e, " %s/%s", mp->tree->name, mp->name);

	inp = mp->inner;
	for(n = mp->ndevs; n > 0; n--){
		s = seprint(s, e, " %s", (*inp)->iname);
		inp++;
	}

	switch(mp->type){
	case Fpart:
		s = seprint(s, e, " %ulld %ulld\n", mp->start, mp->size);
		break;
	case Fcrypt:
	case Finter:
	case Fcat:
	case Fmirror:
		s = strecpy(s, e, "\n");
		break;
	default:
		panic("#k: seprintdev bug");
	}
	return s;
}

static char*
seprintconf(char *s, char *e)
{
	int	i, j;
	Tree	*t;

	*s = 0;
	for(i = 0; i < ntrees; i++){
		t = trees[i];
		if(t != nil)
			for(j = 0; j < t->nadevs; j++)
				if(t->devs[j] != nil)
					s = seprintdev(s, e, t->devs[j]);
	}
	return s;
}

/*
 * Regenerate the textual configuration kept in confstr,
 * allocating its buffer on first use.
 * Called with lck held for writing.
 */
static void
setconfstr(void)
{
	char *buf;

	buf = confstr != nil ? confstr : smalloc(Maxconf);
	seprintconf(buf, buf + Maxconf);
	/* publish only once the text has been produced */
	confstr = buf;
}

/*
 * Build a qid path: tree number in bits 16-31,
 * device (or Qdir/Qctl) number in bits 0-15.
 */
static vlong
mkpath(int tree, int devno)
{
	int hi, lo;

	hi = (tree & 0xFFFF) << 16;
	lo = devno & 0xFFFF;
	return hi | lo;
}

/*
 * Extract the tree number (bits 16-31) from a qid path.
 */
static int
path2treeno(int q)
{
	int treeno;

	treeno = q >> 16;
	return treeno & 0xFFFF;
}

/*
 * Extract the device number (bits 0-15) from a qid path.
 */
static int
path2devno(int q)
{
	enum { Devmask = 0xFFFF };

	return q & Devmask;
}

/*
 * Return tree number i, or nil if there is none.
 * With mustexist set, a missing tree raises Enonexist instead.
 * A negative index is a driver bug and panics.
 */
static Tree*
gettree(int i, int mustexist)
{
	Tree *t;

	dprint("gettree %d\n", i);
	if(i < 0)
		panic("#k: bug: bad tree index %d in gettree", i);

	t = nil;
	if(i < ntrees)
		t = trees[i];
	if(t == nil && mustexist)
		error(Enonexist);
	return t;
}

/*
 * Return device number i of tree t, or nil if there is none.
 * With mustexist set, a missing device raises Enonexist instead.
 * A negative index is a driver bug and panics.
 */
static Fsdev*
getdev(Tree *t, int i, int mustexist)
{
	Fsdev *mp;

	dprint("getdev %d\n", i);
	if(i < 0)
		panic("#k: bug: bad dev index %d in getdev", i);

	mp = nil;
	if(i < t->nadevs)
		mp = t->devs[i];
	if(mp == nil && mustexist)
		error(Enonexist);
	return mp;
}

/*
 * Map a qid path to its device.
 * Raises Enonexist if the tree or the device does not exist.
 */
static Fsdev*
path2dev(int q)
{
	int	treeno, devno;

	dprint("path2dev %ux\n", q);
	treeno = path2treeno(q);
	devno = path2devno(q) - Qfirst;	/* device qids start at Qfirst */
	return getdev(gettree(treeno, Mustexist), devno, Mustexist);
}

/*
 * Create a tree called name in the first free slot of trees[].
 * Returns nil if all slots are taken; raises Enomem if the
 * tree itself cannot be allocated.
 */
static Tree*
treealloc(char *name)
{
	Tree	**slot, **end, *t;

	dprint("treealloc %s\n", name);
	end = trees + nelem(trees);
	slot = trees;
	while(slot < end && *slot != nil)
		slot++;
	if(slot == end)
		return nil;

	t = mallocz(sizeof(Tree), 1);
	*slot = t;
	if(t == nil)
		error(Enomem);
	/* extend the used part of trees[] when taking a fresh slot */
	if(slot - trees == ntrees)
		ntrees++;
	kstrdup(&t->name, name);
	return t;
}

static Tree*
lookuptree(char *name)
{
	int i;

	dprint("lookuptree %s\n", name);
	for(i = 0; i < ntrees; i++)
		if(trees[i] != nil && strcmp(trees[i]->name, name) == 0)
			return trees[i];
	return nil;
}

/*
 * Create an empty device called name in tree t, reusing a free
 * slot of t->devs or growing the array by Incr entries.
 * Returns nil if memory cannot be allocated.
 */
static Fsdev*
devalloc(Tree *t, char *name)
{
	int	slot;
	Fsdev	*mp, **grown;

	dprint("devalloc %s %s\n", t->name, name);
	mp = mallocz(sizeof(Fsdev), 1);
	if(mp == nil)
		return nil;

	/* look for a slot left by a deleted device */
	slot = 0;
	while(slot < t->nadevs && t->devs[slot] != nil)
		slot++;

	if(slot >= t->nadevs){
		/* none free: append, growing the array when it is full */
		if(t->nadevs % Incr == 0){
			grown = realloc(t->devs, (t->nadevs + Incr) * sizeof(Fsdev*));
			if(grown == nil){
				free(mp);
				return nil;
			}
			t->devs = grown;
		}
		t->devs[t->nadevs] = nil;
		t->nadevs++;
	}

	kstrdup(&mp->name, name);
	mp->vers = ++qidvers;
	mp->tree = t;
	t->devs[slot] = mp;
	t->ndevs++;
	return mp;
}

/*
 * Release tree t and its slot in trees[].
 * Tree 0 ("fs") is never released.  Panics if t is not in trees[].
 */
static void
deltree(Tree *t)
{
	int slot;

	dprint("deltree %s\n", t->name);
	slot = 0;
	while(slot < ntrees && trees[slot] != t)
		slot++;
	if(slot >= ntrees)
		panic("#k: deltree: bug: tree not found");
	if(slot == 0)
		return;		/* "fs" never goes away */

	free(t->name);
	free(t->devs);
	free(t);
	trees[slot] = nil;
}

/*
 * Destroy a device once all its users are gone.
 * The device is unlinked from its tree under the write lock; the
 * tree is released too when this was its last device ("fs" is
 * never released).  The inner devices are closed after dropping
 * the lock, so that they may be devices of this driver.
 */
static void
mdeldev(Fsdev *mp)
{
	int	slot;
	Inner	**inp, **einp, *in;
	Tree	*t;

	dprint("deldev %s gone %d ref %uld\n", mp->name, mp->gone, mp->ref);

	mp->gone = 1;
	mp->vers = ++qidvers;

	wlock(&lck);
	t = mp->tree;
	for(slot = 0; slot < t->nadevs; slot++){
		if(t->devs[slot] != mp)
			continue;
		t->devs[slot] = nil;
		t->ndevs--;
		if(t->ndevs == 0)
			deltree(t);
		break;
	}
	wunlock(&lck);

	secfree(mp->key);
	free(mp->name);
	einp = mp->inner + mp->ndevs;
	for(inp = mp->inner; inp < einp; inp++){
		in = *inp;
		if(in->idev != nil)
			cclose(in->idev);
		free(in->iname);
		free(in);
	}
	free(mp);
}

/*
 * Delete one or all devices in one or all trees.
 *
 * tname and dname select the tree and the device; "*" matches all.
 * Every matching device is marked gone.  Those with no users are
 * destroyed right away; the others are destroyed by mclose() when
 * their last user closes them.
 * Raises Enonexist if a specific tree was named and nothing matched.
 */
static void
mdelctl(char *tname, char *dname)
{
	int i, j, alldevs, alltrees, some;
	Fsdev *mp;
	Tree *t;

	dprint("delctl %s\n", dname);
	alldevs = strcmp(dname, "*") == 0;
	alltrees = strcmp(tname, "*") == 0;
	some = 0;
Again:
	wlock(&lck);
	for(i = 0; i < ntrees; i++){
		t = trees[i];
		if(t == nil)
			continue;
		if(alltrees == 0 && strcmp(t->name, tname) != 0)
			continue;
		/*
		 * The device scan needs its own index: sharing i with the
		 * tree scan made the outer loop resume from a device index
		 * and skip or revisit trees.
		 */
		for(j = 0; j < t->nadevs; j++){
			mp = t->devs[j];
			if(mp == nil)
				continue;
			if(alldevs == 0 && strcmp(mp->name, dname) != 0)
				continue;
			/*
			 * Careful: must close outside locks and that
			 * may change the file tree we are looking at.
			 */
			some++;
			mp->gone = 1;
			if(mp->ref == 0){
				incref(mp);	/* keep it there */
				wunlock(&lck);
				mdeldev(mp);
				goto Again;	/* tree can change */
			}
		}
	}
	if(some)
		setconfstr();
	wunlock(&lck);
	if(some == 0 && alltrees == 0)
		error(Enonexist);
}

/*
 * Record the sizes of the inner devices (ilen[i] for inner[i])
 * and derive the size of mp from them according to its type.
 * Raises an error if a partition or crypt device starts past
 * the end of its inner device.
 */
static void
setdsize(Fsdev* mp, vlong *ilen)
{
	int	n;
	vlong	len;
	Inner	*in;

	dprint("setdsize %s\n", mp->name);
	for(n = 0; n < mp->ndevs; n++){
		in = mp->inner[n];
		len = ilen[n];
		in->isize = len;
		switch(mp->type){
		case Fcat:
			/* sizes add up */
			mp->size += len;
			break;
		case Finter:
		case Fmirror:
			if(mp->type == Finter){
				/* truncate to multiple of Blksize */
				len &= ~(Blksize-1);
				in->isize = len;
			}
			/* use size of smallest inner device */
			if(mp->size == 0 || len < mp->size)
				mp->size = len;
			break;
		case Fpart:
			if(mp->start > len)
				error("partition starts after device end");
			if(len < mp->start + mp->size){
				print("#k: %s: partition truncated from "
					"%lld to %lld bytes\n", mp->name,
					mp->size, len - mp->start);
				mp->size = len - mp->start;
			}
			break;
		case Fcrypt:
			if(mp->start > len)
				error("crypt starts after device end");
			/* whole crypt sectors after the start offset */
			mp->size = (len - mp->start) & ~((vlong)Cryptsectsz-1);
			break;
		}
	}
	/* an interleaving holds the smallest size on each inner device */
	if(mp->type == Finter)
		mp->size *= mp->ndevs;
}

/*
 * Raise Eexist if tree t already has a device called dname.
 */
static void
validdevname(Tree *t, char *dname)
{
	int n;
	Fsdev *mp;

	for(n = 0; n < t->nadevs; n++){
		mp = t->devs[n];
		if(mp == nil)
			continue;
		if(strcmp(mp->name, dname) == 0)
			error(Eexist);
	}
}

/*
 * Parse one ctl request of n bytes at a.
 * On return *cbp is the parsed command, with the command word
 * already skipped, and *ctp its entry in configs[].
 * *cbp is set as soon as the command is parsed, so the caller can
 * free it if the lookup or the argument checks raise an error.
 */
static void
parseconfig(char *a, long n, Cmdbuf **cbp, Cmdtab **ctp)
{
	int	nargs;
	Cmdbuf	*cb;
	Cmdtab	*ct;

	cb = parsecmd(a, n);
	*cbp = cb;
	ct = lookupcmd(cb, configs, nelem(configs));
	*ctp = ct;

	/* skip command */
	cb->f++;
	cb->nf--;
	nargs = cb->nf;

	switch(ct->index){
	case Fcrypt:
		if(nargs != 3)
			error("ctl usage: crypt newname device keyhex");
		break;
	case Fpart:
		/* the file may be omitted only if a default source is set */
		if(nargs == 4)
			break;
		if(nargs == 3 && source != nil)
			break;
		error("ctl usage: part new [file] off len");
		break;
	case Fdisk:
		if(nargs < 1 || nargs > 3)
			error("ctl usage: disk name [sz dev]");
		break;
	case Fmirror:
	case Fcat:
	case Finter:
		if(nargs < 2)
			error("too few arguments for ctl");
		if(nargs - 1 > Ndevs)
			error("too many devices in ctl");
		break;
	}
}

/*
 * Split name into tree and device names.
 * "tree/dev" is split in place at the slash; a plain "dev" goes
 * to the default tree disk, or to "fs" if disk is nil.
 * Both names are then checked with validname().
 */
static void
parsename(char *name, char *disk, char **tree, char **dev)
{
	char *sep;

	sep = strchr(name, '/');
	if(sep != nil){
		*sep = 0;
		*tree = name;
		*dev = sep + 1;
	}else{
		*tree = disk != nil ? disk : "fs";
		*dev = name;
	}
	validname(*tree, 0);
	validname(*dev, 0);
}

/*
 * Return the length of the file open on c, as reported by the
 * stat entry point of its device.
 * Raises an error if the stat reply cannot be decoded (for
 * instance because it did not fit in the local buffer) instead
 * of returning an uninitialized length.
 */
static vlong
getlen(Chan *c)
{
	uchar	buf[128];	/* old DIRLEN plus a little should be plenty */
	Dir	d;
	long	l;

	l = devtab[c->type]->stat(c, buf, sizeof buf);
	/* convM2D returns 0 when buf does not hold a complete stat message */
	if(l <= 0 || convM2D(buf, l, &d, nil) == 0)
		error("#k: can't stat inner device");
	return d.length;
}

/*
 * Process a single line of configuration,
 * often of the form "cmd newname idev0 idev1".
 * locking is tricky, because we need a write lock to
 * add/remove devices yet adding/removing them may lead
 * to calls to this driver that require a read lock (when
 * inner devices are also provided by us).
 *
 * a points to n bytes of request text.  Errors are raised with
 * error(); everything acquired here is released before they
 * propagate.
 */
static void
mconfig(char* a, long n)
{
	int	i;
	vlong	size, start;
	vlong	*ilen;
	char	*tname, *dname, *fakef[4];
	uchar	key[2*256/8];
	int	keylen;
	Chan	**idev;
	Cmdbuf	*cb;
	Cmdtab	*ct;
	Fsdev	*mp;
	Inner	*inprv;
	Tree	*t;

	/* ignore comments & empty lines */
	if (n < 1 || *a == '\0' || *a == '#' || *a == '\n')
		return;

	/* ignore historical config signature */
	if (n >= 6 && memcmp(a, "fsdev:", 6) == 0)
		return;

	dprint("mconfig\n");
	size = 0;
	start = 0;
	mp = nil;
	cb = nil;
	idev = nil;
	ilen = nil;
	keylen = 0;

	/* until the inner devices are opened, only cb needs cleanup */
	if(waserror()){
		free(cb);
		nexterror();
	}

	parseconfig(a, n, &cb, &ct);
	switch (ct->index) {
	case Fdisk:
		/* disk name [sz dev]: set the defaults for later commands */
		kstrdup(&disk, cb->f[0]);
		if(cb->nf >= 2)
			sectorsz = strtoul(cb->f[1], 0, 0);
		else
			sectorsz = Sectorsz;
		if(cb->nf == 3)
			kstrdup(&source, cb->f[2]);
		else{
			free(source);
			source = nil;
		}
		poperror();
		free(cb);
		return;
	case Fclear:
		poperror();
		free(cb);
		mdelctl("*", "*");		/* del everything */
		return;
	case Fcrypt:
		/*
		 * NOTE(review): parseconfig() rejects crypt requests with
		 * cb->nf != 3, so the cb->nf >= 4 branch looks unreachable
		 * from here and start is always the default.
		 */
		if(cb->nf >= 4) {
			start = strtoul(cb->f[3], 0, 0);
			cb->nf = 3;
		} else
			start = 64*1024;	/* cryptsetup header */
		/* the key is given in hex; only 2x128 and 2x256 bit keys are accepted */
		keylen = dec16(key, sizeof(key), cb->f[2], strlen(cb->f[2]));
		switch(keylen){
		default:
			error("bad hexkey");
		case 2*128/8:
		case 2*256/8:
			break;
		}
		cb->nf -= 1;	/* the key is not an inner device name */
		break;
	case Fpart:
		if(cb->nf == 3){
			/*
			 * got a request in the format of sd(3),
			 * pretend we got one in our format.
			 * later we change end to be len.
			 */
			fakef[0] = cb->f[0];
			fakef[1] = source;
			fakef[2] = cb->f[1];
			fakef[3] = cb->f[2];
			cb->f = fakef;
			cb->nf = 4;
		}
		start = strtoll(cb->f[2], nil, 10);
		size =  strtoll(cb->f[3], nil, 10);
		if(cb->f == fakef)
			size -= start;		/* it was end */
		cb->nf -= 2;	/* the two numbers are not inner device names */
		break;
	}
	/* f[0] is the new device ([tree/]name); f[1..nf-1] are inner device names */
	parsename(cb->f[0], disk, &tname, &dname);
	for(i = 1; i < cb->nf; i++)
		validname(cb->f[i], 1);

	if(ct->index == Fdel){
		mdelctl(tname, dname);
		poperror();
		free(cb);
		return;
	}

	/*
	 * Open all inner devices while we have only a read lock.
	 */
	poperror();
	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		/*
		 * Common cleanup, also entered from the write-locked
		 * section below after it has released its lock.
		 * Channels already handed to mp have been cleared from
		 * idev[] and are closed by mdeldev() instead.
		 */
Fail:
		for(i = 1; i < cb->nf; i++)
			if(idev != nil && idev[i-1] != nil)
				cclose(idev[i-1]);
		if(mp != nil)
			mdeldev(mp);
		free(idev);
		free(ilen);
		free(cb);
		nexterror();
	}
	idev = smalloc(sizeof(Chan*) * Ndevs);
	ilen = smalloc(sizeof(vlong) * Ndevs);
	for(i = 1; i < cb->nf; i++){
		idev[i-1] = namec(cb->f[i], Aopen, ORDWR, 0);
		ilen[i-1] = getlen(idev[i-1]);
	}
	poperror();
	runlock(&lck);

	/*
	 * Get a write lock and add the device if we can.
	 */
	wlock(&lck);
	if(waserror()){
		wunlock(&lck);
		goto Fail;
	}

	/* find the tree, or create it; the device name must be new in it */
	t = lookuptree(tname);
	if(t != nil)
		validdevname(t, dname);
	else{
		t = treealloc(tname);
		if(t == nil)
			error("no more trees");
	}
	mp = devalloc(t, dname);
	if(mp == nil){
		if(t->ndevs == 0)	/* it was created for us */
			deltree(t);	/* but we will not mdeldev() */
		error(Enomem);
	}

	mp->type = ct->index;
	if(mp->type == Fpart){
		/* start and size were given in sectors */
		mp->start = start * sectorsz;
		mp->size = size * sectorsz;
	}
	if(mp->type == Fcrypt) {
		/* first half of the key sets up tweak, second half ecb */
		Key *k = secalloc(sizeof(Key));
		setupAESstate(&k->tweak, &key[0], keylen/2, nil);
		setupAESstate(&k->ecb, &key[keylen/2], keylen/2, nil);
		memset(key, 0, sizeof(key));	/* do not leave the key on the stack */
		mp->key = k;
		mp->start = start;
	}
	/* hand the open channels over to the new device */
	for(i = 1; i < cb->nf; i++){
		inprv = mp->inner[i-1] = mallocz(sizeof(Inner), 1);
		if(inprv == nil)
			error(Enomem);
		mp->ndevs++;
		kstrdup(&inprv->iname, cb->f[i]);
		inprv->idev = idev[i-1];
		idev[i-1] = nil;
	}
	setdsize(mp, ilen);

	setconfstr();
	wunlock(&lck);
	poperror();
	free(idev);
	free(ilen);
	free(cb);
}

/*
 * One-time initialization: install the "fs" tree and process the
 * configuration file, one line at a time, through mconfig().
 * The file is the one named by the "fsconfig" configuration
 * variable, or /dev/sdC0/fscfg if that is not set.  Errors are
 * ignored when using the default file and raised otherwise.
 */
static void
rdconf(void)
{
	char *c, *e, *p, *s;
	Chan *cc;
	int mustrd;

	/* only read config file once */
	if (confstr != nil)
		return;

	/* check again under the lock: somebody may have got here first */
	wlock(&lck);
	if (confstr != nil) {
		wunlock(&lck);
		return;	/* already done */
	}

	/* add the std "fs" tree */
	if(ntrees == 0){
		fstree.name = "fs";
		trees[ntrees++] = &fstree;
	}

	/* this sets confstr, so later calls return early */
	setconfstr();
	wunlock(&lck);

	dprint("rdconf\n");

	/* identify the config file */
	s = getconf("fsconfig");
	if (s == nil){
		mustrd = 0;
		s = "/dev/sdC0/fscfg";
	} else
		mustrd = 1;

	/* NOTE(review): the parsing loop below needs a NUL after the data; presumably smalloc zeroes the buffer — confirm */
	c = smalloc(Maxconf+1);
	if(waserror()){
		free(c);
		/* a missing or bad default config file is not an error */
		if(!mustrd)
			return;
		nexterror();
	}

	/* read it */
	cc = namec(s, Aopen, OREAD, 0);
	if(waserror()){
		cclose(cc);
		nexterror();
	}
	devtab[cc->type]->read(cc, c, Maxconf, 0);
	cclose(cc);
	poperror();

	/* process config copy one line at a time */
	for (p = c; *p != '\0'; p = e){
		e = strchr(p, '\n');
		if (e == nil)
			e = p + strlen(p);
		else
			e++;	/* the newline is part of the line passed on */
		mconfig(p, e - p);
	}

	free(c);
	poperror();	/* c */
}

/*
 * Directory generator for devwalk/devdirread.
 * Fills dp with entry i of the directory c and returns 1,
 * or returns -1 if there is no such entry.
 *	#k		one directory per tree
 *	#k/<tree>	one file per device; "fs" also has ctl first
 * For a device file only ".." (its tree) is generated.
 */
static int
mgen(Chan *c, char*, Dirtab*, int, int i, Dir *dp)
{
	int	attop, isdir, treeno;
	Fsdev	*mp;
	Qid	qid;
	Tree	*t;

	dprint("mgen %#ullx %d\n", c->qid.path, i);
	qid.type = QTDIR;
	qid.vers = 0;

	attop = c->qid.path == Qtop;
	if(attop && i == DEVDOTDOT){
		devdir(c, tqid, "#k", 0, eve, 0775, dp);
		return 1;
	}

	/* at the top, entry i is tree i; below it, the tree comes from the qid */
	if(attop)
		treeno = i;
	else
		treeno = path2treeno(c->qid.path);
	t = gettree(treeno, Optional);
	if(t == nil){
		dprint("no\n");
		return -1;
	}
	if(attop){
		qid.path = mkpath(treeno, Qdir);
		devdir(c, qid, t->name, 0, eve, 0775, dp);
		return 1;
	}

	isdir = (c->qid.type & QTDIR) != 0;
	if(!isdir){
		/* a file: the only thing to generate is its tree */
		if(i != DEVDOTDOT){
			dprint("no\n");
			return -1;
		}
		qid.path = mkpath(treeno, Qdir);
		devdir(c, qid, t->name, 0, eve, 0775, dp);
		return 1;
	}

	/* a tree directory */
	if(i == DEVDOTDOT){
		devdir(c, tqid, "#k", 0, eve, 0775, dp);
		return 1;
	}
	if(treeno == 0){
		/* take care of #k/fs/ctl */
		if(i == 0){
			devdir(c, cqid, "ctl", 0, eve, 0664, dp);
			return 1;
		}
		i--;
	}
	mp = getdev(t, i, Optional);
	if(mp == nil){
		dprint("no\n");
		return -1;
	}
	qid.type = QTFILE;
	qid.vers = mp->vers;
	qid.path = mkpath(treeno, Qfirst+i);
	devdir(c, qid, mp->name, mp->size, eve, 0664, dp);
	return 1;
}

/*
 * Attach to the root of the device (#k).
 */
static Chan*
mattach(char *spec)
{
	Chan *root;

	dprint("mattach\n");
	root = devattach(fsdevtab.dc, spec);
	return root;
}

/*
 * Walk from c along name[0..nname-1] using mgen, holding the
 * read lock so the configuration cannot change meanwhile.
 * The configuration file is read on the first walk.
 */
static Walkqid*
mwalk(Chan *c, Chan *nc, char **name, int nname)
{
	Walkqid *wq;

	/* must be done before taking the read lock: rdconf takes the write lock */
	rdconf();

	dprint("mwalk %llux\n", c->qid.path);
	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}
	wq = devwalk(c, nc, name, nname, 0, 0, mgen);
	poperror();
	runlock(&lck);
	return wq;
}

/*
 * Stat c into the n-byte buffer db.
 * Returns the number of bytes written by convD2M; raises Ebadarg
 * if that is zero and Enonexist if the tree or device is gone.
 */
static int
mstat(Chan *c, uchar *db, int n)
{
	int	p;
	Dir	d;
	Fsdev	*mp;
	Qid	q;
	Tree	*t;

	dprint("mstat %llux\n", c->qid.path);
	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}
	p = c->qid.path;
	memset(&d, 0, sizeof d);
	switch(p){
	case Qtop:
		devdir(c, tqid, "#k", 0, eve, 0775, &d);
		break;
	case Qctl:
		devdir(c, cqid, "ctl", 0, eve, 0664, &d);
		break;
	default:
		/* a tree directory or one of its devices */
		t = gettree(path2treeno(p), Mustexist);
		if(c->qid.type & QTDIR)
			devdir(c, c->qid, t->name, 0, eve, 0775, &d);
		else{
			mp = getdev(t, path2devno(p) - Qfirst, Mustexist);
			q = c->qid;
			q.vers = mp->vers;	/* report the current version */
			devdir(c, q, mp->name, mp->size, eve, 0664, &d);
		}
	}
	n = convD2M(&d, db, n);
	if (n == 0)
		error(Ebadarg);
	poperror();
	runlock(&lck);
	return n;
}

/*
 * Open c with mode omode.
 * Directories may only be opened for reading (Eperm otherwise).
 * Opening a device file takes a reference on the device, released
 * by mclose(); a device already marked gone raises Egone.
 */
static Chan*
mopen(Chan *c, int omode)
{
	int	q;
	Fsdev	*mp;

	dprint("mopen %llux\n", c->qid.path);
	if((c->qid.type & QTDIR) && omode != OREAD)
		error(Eperm);
	/* neither ctl nor a directory: it is a device file */
	if(c->qid.path != Qctl && (c->qid.type&QTDIR) == 0){
		rlock(&lck);
		if(waserror()){
			runlock(&lck);
			nexterror();
		}
		q = c->qid.path;
		mp = path2dev(q);
		if(mp->gone)
			error(Egone);
		incref(mp);
		poperror();
		runlock(&lck);
	}
	/*
	 * Our mgen does not return the info for the qid
	 * but only for its children. Don't use devopen here.
	 */
	c->offset = 0;
	c->mode = openmode(omode);
	c->flag |= COPEN;
	return c;
}

/*
 * Close c.  Nothing is done for directories or unopened channels.
 * Closing ctl restores the defaults set by the "disk" command.
 * Closing a device file drops its reference; the last user of a
 * device that has been marked gone destroys it.
 */
static void
mclose(Chan *c)
{
	int	mustdel, q;
	Fsdev	*mp;

	dprint("mclose %llux\n", c->qid.path);
	if(c->qid.type & QTDIR || !(c->flag & COPEN))
		return;
	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}
	mustdel = 0;
	mp = nil;
	q = c->qid.path;
	if(q == Qctl){
		free(disk);
		disk = nil;	/* restore defaults */
		free(source);
		source = nil;
		sectorsz = Sectorsz;
	}else{
		mp = path2dev(q);
		/* our reference is kept when we are the one to destroy the device */
		if(mp->gone != 0 && mp->ref == 1)
			mustdel = 1;
		else
			decref(mp);
	}
	poperror();
	runlock(&lck);
	/* mdeldev takes the write lock, so call it after releasing ours */
	if(mustdel)
		mdeldev(mp);
}

/*
 * Read (isread != 0) or write l bytes at a, at offset off of
 * inner device in of mp.
 * Returns what the read or write entry point of the inner device
 * returns.  Raises Egone if the inner device has no channel; any
 * error from the inner device is logged and raised again.
 */
static long
io(Fsdev *mp, Inner *in, int isread, void *a, long l, vlong off)
{
	long wl;
	Chan	*mc;

	mc = in->idev;
	if(mc == nil)
		error(Egone);
	if (waserror()) {
		print("#k: %s: byte %,lld count %ld (of #k/%s): %s error: %s\n",
			in->iname, off, l, mp->name, (isread? "read": "write"),
			(up && up->errstr? up->errstr: ""));
		nexterror();
	}
	if (isread)
		wl = devtab[mc->type]->read(mc, a, l, off);
	else
		wl = devtab[mc->type]->write(mc, a, l, off);
	poperror();
	return wl;
}

/*
 * I/O on an Fcrypt device: n bytes at a, at offset off of the
 * inner device.  Both off and n must be multiples of Cryptsectsz
 * (Ebadarg otherwise).  Data is processed one crypt sector at a
 * time with aes_xts_decrypt/aes_xts_encrypt, passing the byte
 * offset of the sector.
 * Reads decrypt in place and return the number of bytes read,
 * rounded down to whole sectors.  Writes encrypt into a temporary
 * buffer of at most SDmaxio bytes and return the number of bytes
 * written; a short write raises Eio.
 */
static long
cryptio(Fsdev *mp, int isread, uchar *a, long n, vlong off)
{
	long l, m, o, nb;
	uchar *b;

	if((((ulong)off|n) & (Cryptsectsz-1)))
		error(Ebadarg);
	if(isread){
		l = io(mp, mp->inner[0], Isread, a, n, off);
		if(l > 0){
			l &= ~(Cryptsectsz-1);
			for(o=0; o<l; o+=Cryptsectsz)
				aes_xts_decrypt(&mp->key->tweak, &mp->key->ecb,
					off+o, a+o, a+o, Cryptsectsz);
		}
		return l;
	}
	nb = n < SDmaxio ? n : SDmaxio;
	/* keep trying until the bounce buffer can be allocated */
	while((b = sdmalloc(nb)) == nil){
		if(!waserror()){
			resrcwait("no memory for cryptio");
			poperror();
		}
	}
	if(waserror()) {
		sdfree(b);
		nexterror();
	}
	/* l counts the bytes written so far; m is the size of this chunk */
	for(l = 0; (m = n - l) > 0; l += m){
		if(m > nb) m = nb;
		for(o=0; o<m; o+=Cryptsectsz)
			aes_xts_encrypt(&mp->key->tweak, &mp->key->ecb,
				off+o, a+o, b+o, Cryptsectsz);
		if(io(mp, mp->inner[0], Iswrite, b, m, off) != m)
			error(Eio);
		off += m;
		a += m;
	}
	sdfree(b);
	poperror();
	return l;
}

/*
 * I/O on a catenation: n bytes at a, at offset off of the
 * catenated device.  The transfer may span several inner devices.
 * Returns the number of bytes transferred; a short transfer on an
 * inner device raises Eio.
 */
static long
catio(Fsdev *mp, int isread, void *a, long n, vlong off)
{
	int	k;
	long	chunk, total;
	char	*p;
	Inner	*in;

	if(debug)
		print("catio %d %p %ld %lld\n", isread, a, n, off);
	total = n;
	p = a;
	for(k = 0; k < mp->ndevs && n > 0; k++){
		in = mp->inner[k];
		if(off >= in->isize){
			/* not there yet */
			off -= in->isize;
			continue;
		}
		/* stop at the end of this inner device */
		chunk = n;
		if(off + n > in->isize)
			chunk = in->isize - off;
		if(debug)
			print("\tdev %d %p %ld %lld\n", k, p, chunk, off);

		if(io(mp, in, isread, p, chunk, off) != chunk)
			error(Eio);

		/* the rest, if any, starts at the beginning of the next one */
		p += chunk;
		n -= chunk;
		off = 0;
	}
	if(debug)
		print("\tres %ld\n", total - n);
	return total - n;
}

/*
 * I/O on an interleaving: n bytes at a, at offset off.
 * Consecutive blocks of Blksize bytes go to consecutive inner
 * devices, round robin.  Returns n; a short transfer on an inner
 * device raises Eio.
 */
static long
interio(Fsdev *mp, int isread, void *a, long n, vlong off)
{
	int	dev;
	long	inblk, room, chunk, total;
	vlong	blk, inoff;
	char	*p;

	total = n;
	p = a;
	blk = off / Blksize;
	inblk = off % Blksize;		/* offset inside the first block */
	room = Blksize - inblk;		/* bytes left in the first block */
	while(n > 0){
		dev = blk % mp->ndevs;
		inoff = (blk / mp->ndevs)*Blksize + inblk;
		chunk = n > room ? room : n;

		if(io(mp, mp->inner[dev], isread, p, chunk, inoff) != chunk)
			error(Eio);

		/* later blocks are used from their start */
		blk++;
		inblk = 0;
		room = Blksize;
		p += chunk;
		n -= chunk;
	}
	return total;
}

/*
 * Read n bytes at offset off from c into a.
 * Directories are generated with mgen and ctl returns the textual
 * configuration.  For devices the request is clipped to the device
 * size and dispatched on the device type.  A mirror is read from
 * the first inner device that succeeds, retrying the whole set up
 * to Maxretries times before raising Eio.
 * Runs with the read lock held.
 */
static long
mread(Chan *c, void *a, long n, vlong off)
{
	int	i, retry;
	long	l, res;
	Fsdev	*mp;
	Tree	*t;

	dprint("mread %llux\n", c->qid.path);
	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}
	res = -1;
	if(c->qid.type & QTDIR){
		res = devdirread(c, a, n, 0, 0, mgen);
		goto Done;
	}
	if(c->qid.path == Qctl){
		res = readstr((long)off, a, n, confstr);
		goto Done;
	}

	t = gettree(path2treeno(c->qid.path), Mustexist);
	mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);

	/* nothing to read at or past the end; clip what crosses it */
	if(off >= mp->size){
		res = 0;
		goto Done;
	}
	if(off + n > mp->size)
		n = mp->size - off;
	if(n == 0){
		res = 0;
		goto Done;
	}

	switch(mp->type){
	case Fcat:
		res = catio(mp, Isread, a, n, off);
		break;
	case Finter:
		res = interio(mp, Isread, a, n, off);
		break;
	case Fpart:
		res = io(mp, mp->inner[0], Isread, a, n, mp->start + off);
		break;
	case Fmirror:
		retry = 0;
		do {
			if (retry > 0) {
				print("#k/%s: retry %d read for byte %,lld "
					"count %ld: %s\n", mp->name, retry, off,
					n, (up && up->errstr? up->errstr: ""));
				/*
				 * pause before retrying in case it's due to
				 * a transient bus or controller problem.
				 */
				tsleep(&up->sleep, return0, 0, Retrypause);
			}
			for (i = 0; i < mp->ndevs; i++){
				/* an error on this copy: go on with the next one */
				if (waserror())
					continue;
				l = io(mp, mp->inner[i], Isread, a, n, off);
				poperror();
				if (l >= 0){
					res = l;
					break;		/* read a good copy */
				}
			}
			/* i == mp->ndevs means that no copy could be read */
		} while (i == mp->ndevs && ++retry <= Maxretries);
		if (retry > Maxretries) {
			/* no mirror had a good copy of the block */
			print("#k/%s: byte %,lld count %ld: CAN'T READ "
				"from mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
			error(Eio);
		} else if (retry > 0)
			print("#k/%s: byte %,lld count %ld: retry read OK "
				"from mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
		break;
	case Fcrypt:
		res = cryptio(mp, Isread, a, n, mp->start + off);
		break;
	}
Done:
	poperror();
	runlock(&lck);
	return res;
}

/*
 * Write n bytes from a at offset off to c.
 * Writing to a directory raises Eisdir; a write to ctl is handed
 * to mconfig() as a configuration request.  For devices the
 * request is clipped to the device size and dispatched on the
 * device type.  A mirror is written to all its inner devices, last
 * one first; the whole set is retried up to Maxretries times while
 * any copy fails, and Eio is raised if no copy was written in the
 * last round.
 * Device writes run with the read lock held.
 */
static long
mwrite(Chan *c, void *a, long n, vlong off)
{
	int	i, allbad, anybad, retry;
	long	l, res;
	Fsdev	*mp;
	Tree	*t;

	dprint("mwrite %llux\n", c->qid.path);
	if (c->qid.type & QTDIR)
		error(Eisdir);
	if (c->qid.path == Qctl){
		/* mconfig does its own locking */
		mconfig(a, n);
		return n;
	}

	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}

	t = gettree(path2treeno(c->qid.path), Mustexist);
	mp = getdev(t, path2devno(c->qid.path) - Qfirst, Mustexist);

	/* nothing can be written at or past the end; clip what crosses it */
	if(off >= mp->size){
		res = 0;
		goto Done;
	}
	if(off + n > mp->size)
		n = mp->size - off;
	if(n == 0){
		res = 0;
		goto Done;
	}
	res = n;
	switch(mp->type){
	case Fcat:
		res = catio(mp, Iswrite, a, n, off);
		break;
	case Finter:
		res = interio(mp, Iswrite, a, n, off);
		break;
	case Fpart:
		res = io(mp, mp->inner[0], Iswrite, a, n, mp->start + off);
		if (res != n)
			error(Eio);
		break;
	case Fmirror:
		retry = 0;
		do {
			if (retry > 0) {
				print("#k/%s: retry %d write for byte %,lld "
					"count %ld: %s\n", mp->name, retry, off,
					n, (up && up->errstr? up->errstr: ""));
				/*
				 * pause before retrying in case it's due to
				 * a transient bus or controller problem.
				 */
				tsleep(&up->sleep, return0, 0, Retrypause);
			}
			allbad = 1;
			anybad = 0;
			for (i = mp->ndevs - 1; i >= 0; i--){
				/* an error on this copy: note it and go on */
				if (waserror()) {
					anybad = 1;
					continue;
				}
				l = io(mp, mp->inner[i], Iswrite, a, n, off);
				poperror();
				if (l == n)
					allbad = 0;	/* wrote a good copy */
				else
					anybad = 1;
			}
		} while (anybad && ++retry <= Maxretries);
		if (allbad) {
			/* no mirror took a good copy of the block */
			print("#k/%s: byte %,lld count %ld: CAN'T WRITE "
				"to mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
			error(Eio);
		} else if (retry > 0)
			print("#k/%s: byte %,lld count %ld: retry wrote OK "
				"to mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));

		break;
	case Fcrypt:
		res = cryptio(mp, Iswrite, a, n, mp->start + off);
		break;
	}
Done:
	poperror();
	runlock(&lck);
	return res;
}

/*
 * Device table entry for #k.  The entries defined in this file
 * are mattach, mwalk, mstat, mopen, mclose, mread and mwrite;
 * the dev* entries are not defined here.
 */
Dev fsdevtab = {
	'k',		/* device character: #k */
	"fs",		/* device name */

	devreset,
	devinit,
	devshutdown,
	mattach,
	mwalk,
	mstat,
	mopen,
	devcreate,
	mclose,
	mread,
	devbread,
	mwrite,
	devbwrite,
	devremove,
	devwstat,
	devpower,
	devconfig,
};