git: 9front

ref: cbe10b1e54da785690055afacda45bc3d56c22ef
dir: /sys/src/cmd/venti/srv/dat.h/

View raw version
/*
 * Forward typedefs for the struct tags used in this header, so that
 * the declarations below can name each other regardless of order.
 * Kept in alphabetical order.
 */
typedef struct AMap AMap;
typedef struct AMapN AMapN;
typedef struct AState AState;
typedef struct ATailStats ATailStats;
typedef struct Arena Arena;
typedef struct ArenaCIG ArenaCIG;
typedef struct ArenaHead ArenaHead;
typedef struct ArenaPart ArenaPart;
typedef struct ArenaTail ArenaTail;
typedef struct Bloom Bloom;
typedef struct CIBlock CIBlock;
typedef struct Clump Clump;
typedef struct ClumpInfo ClumpInfo;
typedef struct Config Config;
typedef struct DBlock DBlock;
typedef struct Graph Graph;
typedef struct IAddr IAddr;
typedef struct IBucket IBucket;
typedef struct IEStream IEStream;
typedef struct IEntry IEntry;
typedef struct IFile IFile;
typedef struct ISect ISect;
typedef struct Index Index;
typedef struct Lump Lump;
typedef struct Part Part;
typedef struct Round Round;
typedef struct Statbin Statbin;
typedef struct Statdesc Statdesc;
typedef struct Stats Stats;
typedef struct ZBlock ZBlock;

/*
 * Plan 9 compiler directive.  Presumably declares that IEStream is
 * deliberately left incomplete in this header -- TODO confirm against
 * the compiler documentation.
 */
#pragma incomplete IEStream

/* every bit set, in each of the fixed-width unsigned integer types */
#define	TWID32	((u32int)~(u32int)0)
#define	TWID64	((u64int)~(u64int)0)
#define	TWID8	((u8int)~(u8int)0)

/*
 * Constants shared across the server: i/o limits, syncarena result
 * bits, error severities, on-disk magic numbers and versions, clump
 * encodings, on-disk structure sizes and dirty-block ordering.
 * Several members are numbered implicitly from the one before them,
 * so the order of entries inside each group is significant.
 */
enum
{
	ABlockLog		= 9,		/* log2(512), the quantum for reading arenas */
	ANameSize		= 64,
	MaxDiskBlock		= 64*1024,	/* max. allowed size for a disk block */
	MaxIoSize		= 64*1024,	/* max. allowed size for a disk io operation */
	PartBlank		= 256*1024,	/* untouched section at beginning of partition */
	HeadSize		= 512,		/* size of a header after PartBlank */
	MinArenaSize		= 1*1024*1024,	/* smallest reasonable arena size */
	IndexBase		= 1024*1024,	/* initial address to use in an index */
	/* NOTE(review): MaxIo has the same value and description as MaxIoSize above */
	MaxIo			= 64*1024,	/* max size of a single read or write operation */
	ICacheBits		= 16,		/* default bits for indexing icache */
	MaxAMap			= 31*1024,	/* max. allowed arenas in an address mapping; must be < 32*1024 */
	Unspecified		= ~0ul,

	/*
	 * return codes from syncarena
	 * (distinct bits, so several can be reported at once)
	 */
	SyncDataErr	= 1 << 0,		/* problem reading the clump data */
	SyncCIErr	= 1 << 1,		/* found erroneous clump directory entries */
	SyncCIZero	= 1 << 2,		/* found unwritten clump directory entries */
	SyncFixErr	= 1 << 3,		/* error writing fixed data */
	SyncHeader	= 1 << 4,		/* altered header fields */

	/*
	 * error severity
	 */
	EOk			= 0,		/* error expected in normal operation */
	EStrange,				/* strange error that should be logged */
	ECorrupt,				/* corrupted data found in arenas */
	EICorrupt,				/* corrupted data found in index */
	EAdmin,					/* should be brought to administrators' attention */
	ECrash,					/* really bad internal error */
	EBug,					/* a limitation which should be fixed */
	EInconsist,				/* inconsistencies between index and arena */
	EMax,

	/*
	 * internal disk formats for the venti archival storage system
	 */
	/*
	 * magic numbers on disk
	 */
	_ClumpMagic		= 0xd15cb10cU,	/* clump header, deprecated */
	ClumpFreeMagic		= 0,		/* free clump; terminates active clump log */

	ArenaPartMagic		= 0xa9e4a5e7U,	/* arena partition header */
	ArenaMagic		= 0xf2a14eadU,	/* arena trailer */
	ArenaHeadMagic		= 0xd15c4eadU,	/* arena header */
	
	BloomMagic		= 0xb1004eadU,	/* bloom filter header */
	/* NOTE(review): same value as MaxBloomHash below; looks like a duplicate */
	BloomMaxHash	= 32,

	ISectMagic		= 0xd15c5ec7U,	/* index header */

	/*
	 * format version numbers
	 */
	ArenaPartVersion	= 3,
	ArenaVersion4		= 4,
	ArenaVersion5		= 5,
	BloomVersion		= 1,
	IndexVersion		= 1,
	ISectVersion1		= 1,
	ISectVersion2		= 2,

	/*
	 * encodings of clumps on disk
	 */
	ClumpEErr		= 0,		/* can't happen */
	ClumpENone,				/* plain */
	ClumpECompress,				/* compressed */
	ClumpEMax,

	/*
	 * sizes in bytes on disk
	 */
	U8Size			= 1,
	U16Size			= 2,
	U32Size			= 4,
	U64Size			= 8,

	ArenaPartSize		= 4 * U32Size,
	ArenaSize4		= 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
	ArenaSize5			= ArenaSize4 + U32Size,
	ArenaSize5a		= ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size,
	ArenaHeadSize4		= U64Size + 3 * U32Size + ANameSize,
	ArenaHeadSize5		= ArenaHeadSize4 + U32Size,
	BloomHeadSize	= 4 * U32Size,
	ISectSize1		= 7 * U32Size + 2 * ANameSize,
	ISectSize2		= ISectSize1 + U32Size,
	ClumpInfoSize		= U8Size + 2 * U16Size + VtScoreSize,
	ClumpSize		= ClumpInfoSize + U8Size + 3 * U32Size,
	MaxBloomSize		= 1<<(32-3),	/* 2^32 bits */
	MaxBloomHash	= 32,		/* bits per score */
	/*
	 * BUG - The various block copies that manipulate entry buckets
	 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
	 * so that everything is word-aligned.  Buildindex is actually cpu-bound
	 * by the (byte at a time) copying in qsort.
	 */
	IBucketSize		= U32Size + U16Size,
	IEntrySize		= U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
	IEntryTypeOff		= VtScoreSize + U32Size + U16Size + U64Size + U16Size,
	IEntryAddrOff		= VtScoreSize + U32Size + U16Size,

	/* VtMaxLumpSize + ClumpSize bytes, rounded up to whole 2^ABlockLog-byte blocks */
	MaxClumpBlocks		=  (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
	
	IcacheFrac		= 1000000,	/* denominator */

	SleepForever		= 1000000000,	/* magic value for sleep time */
	/*
	 * dirty flags - order controls disk write order
	 */
	DirtyArena		= 1,
	DirtyArenaCib,
	DirtyArenaTrailer,
	DirtyMax,
	
	ArenaCIGSize = 10*1024,	// about 0.5 MB worth of IEntry.

	/* placeholder final member; presumably lets every real entry above end with a comma */
	VentiZZZZZZZZ
};

/*
 * Trace strings, one per category; the arrays are defined elsewhere.
 * (Presumably the names passed to the tracing routines -- confirm.)
 */
extern char	TraceBlock[], TraceDisk[], TraceLump[], TraceProc[];
extern char	TraceQuiet[], TraceRpc[], TraceWork[];

/*
 * results of parsing and initializing a config file
 */
struct Config
{
	char		*index;			/* name of the index to initialize */
	int		naparts;		/* arena partitions initialized */
	ArenaPart	**aparts;		/* the naparts arena partitions */
	int		nsects;			/* index sections initialized */
	ISect		**sects;		/* the nsects index sections */
	Bloom	*bloom;		/* bloom filter */
	u32int	bcmem;		/* presumably memory for the disk block cache -- TODO confirm */
	u32int	mem;		/* presumably memory for the lump cache -- TODO confirm */
	u32int	icmem;		/* presumably memory for the index cache -- TODO confirm */
	int		queuewrites;
	char*	haddr;		/* presumably the http listen address -- TODO confirm */
	char*	vaddr;		/* presumably the venti listen address -- TODO confirm */
	char*	webroot;
};

/*
 * a Part is the low level interface to files or disks.
 * there are two main types of partitions
 *	arena partitions, which hold some number of arenas, each in a sub-partition.
 *	index partitions, which only have one subpartition.
 */
struct Part
{
	int		fd;			/* rock for accessing the disk */
	int		mode;
	u64int		offset;
	u64int		size;			/* size of the partition */
	u32int		blocksize;		/* block size for reads and writes */
	u32int		fsblocksize;	/* minimum file system block size */
	char		*name;
	char		*filename;
	Channel		*writechan;		/* chan[dcache.nblock](DBlock*) */
};

/*
 * a cached block from the partition
 * yuck -- most of this is internal structure for the cache
 * all other routines should only use data
 */
struct DBlock
{
	u8int	*data;			/* the block contents; the only field outside code should touch */

	Part	*part;			/* partition in which cached */
	u64int	addr;			/* base address on the partition */
	u32int	size;			/* amount of data available, not amount allocated; should go away */
	u32int	mode;
	u32int	dirty;
	u32int	dirtying;
	DBlock	*next;			/* doubly linked hash chains */
	DBlock	*prev;
	u32int	heap;			/* index in heap table */
	u32int	used;			/* last reference times */
	u32int	used2;
	u32int	ref;			/* reference count */
	RWLock	lock;			/* for access to data only */
	Channel	*writedonechan;	
	void*	chanbuf[1];		/* buffer for the chan! */
};

/*
 * a cached packet (lump), identified by score and type
 * yuck -- most of this is internal structure for the cache
 * all other routines should only use data
 * double yuck -- this is mostly the same as a DBlock
 */
struct Lump
{
	Packet	*data;			/* the cached packet; the only field outside code should touch */

	Part	*part;			/* partition in which cached */
	u8int	score[VtScoreSize];	/* score of packet */
	u8int	type;			/* type of packet */
	u32int	size;			/* amount of data allocated to hold packet */
	Lump	*next;			/* doubly linked hash chains */
	Lump	*prev;
	u32int	heap;			/* index in heap table */
	u32int	used;			/* last reference times */
	u32int	used2;
	u32int	ref;			/* reference count */
	QLock	lock;			/* for access to data only */
};

/*
 * mapping between names and address ranges
 */
struct AMap
{
	u64int		start;			/* first address of the range */
	u64int		stop;			/* end of the range; presumably exclusive -- TODO confirm */
	char		name[ANameSize];	/* name the range is mapped to */
};

/*
 * an AMap along with a length
 */
struct AMapN
{
	int		n;			/* number of entries in map */
	AMap		*map;
};

/*
 * an ArenaPart is a partition made up of Arenas
 * it exists because most os's don't support many partitions,
 * and we want to have many different Arenas
 */
struct ArenaPart
{
	Part		*part;			/* underlying partition */
	u64int		size;			/* size of underlying partition, rounded down to blocks */
	Arena		**arenas;
	u32int		tabbase;		/* base address of arena table on disk */
	u32int		tabsize;		/* max. bytes in arena table */

	/*
	 * fields stored on disk
	 */
	u32int		version;
	u32int		blocksize;		/* "optimal" block size for reads and writes */
	u32int		arenabase;		/* base address of first arena */

	/*
	 * stored in the arena mapping table on disk
	 */
	AMap		*map;
	int		narenas;
};

/*
 * info about one block in the clump info cache
 */
struct CIBlock
{
	u32int		block;			/* blocks in the directory */
	int		offset;			/* offsets of one clump in the data */
	DBlock		*data;			/* cached disk block referred to by this entry */
};

/*
 * Statistics kept in the tail.
 */
struct ATailStats
{
	u32int		clumps;		/* number of clumps */
	u32int		cclumps;		/* number of compressed clumps */
	u64int		used;
	u64int		uncsize;		/* presumably total uncompressed size of the data -- TODO confirm */
	u8int		sealed;
};

/*
 * Arena state - represents a point in the data log
 */
struct AState
{
	Arena		*arena;			/* arena the state refers to */
	u64int		aa;			/* index address */
	ATailStats		stats;		/* tail statistics at that point */
};

/*
 * an Arena is a log of Clumps, preceded by an ArenaHead,
 * and followed by an Arena trailer, each in one disk block.
 * struct on disk is not always up to date, but should be self-consistent.
 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic is found.
 *
 * NOTE(review): the XML-style description below looks machine-readable;
 * confirm nothing parses it before editing.  It refers to s->sealed,
 * s->score, s->clumps, s->cclumps, s->uncsize and s->used, but only score
 * is a direct member here; the others live in the ATailStats members
 * (memstats, diskstats).
 * <struct name="Arena" type="Arena *">
 *	<field name="name" val="s->name" type="AName"/>
 *	<field name="version" val="s->version" type="U32int"/>
 *	<field name="partition" val="s->part->name" type="AName"/>
 *	<field name="blocksize" val="s->blocksize" type="U32int"/>
 *	<field name="start" val="s->base" type="U64int"/>
 *	<field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
 *	<field name="created" val="s->ctime" type="U32int"/>
 *	<field name="modified" val="s->wtime" type="U32int"/>
 *	<field name="sealed" val="s->sealed" type="Sealed"/>
 *	<field name="score" val="s->score" type="Score"/>
 *	<field name="clumps" val="s->clumps" type="U32int"/>
 *	<field name="compressedclumps" val="s->cclumps" type="U32int"/>
 *	<field name="data" val="s->uncsize" type="U64int"/>
 *	<field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
 *	<field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
 * </struct>
 */
struct Arena
{
	QLock		lock;			/* lock for arena fields, writing to disk */
	Part		*part;			/* partition in which arena lives */
	int		blocksize;		/* size of block to read or write */
	u64int		base;			/* base address on disk */
	u64int		size;			/* total space in the arena */
	u8int		score[VtScoreSize];	/* score of the entire sealed & summed arena */

	int		clumpmax;		/* ClumpInfos per block */
	AState		mem;
	int		inqueue;

	/*
	 * fields stored on disk
	 */
	u32int		version;
	char		name[ANameSize];	/* text label */
	ATailStats		memstats;
	ATailStats		diskstats;
	u32int		ctime;			/* first time a block was written */
	u32int		wtime;			/* last time a block was written */
	u32int		clumpmagic;
	
	ArenaCIG	*cig;			/* array of ncig clump info groups */
	int	ncig;
};

/*
 * one clump info group of an arena.
 * NOTE(review): presumably covers ArenaCIGSize clump entries -- TODO confirm.
 */
struct ArenaCIG
{
	u64int	offset;  // from arena base
};

/*
 * redundant storage of some fields at the beginning of each arena
 * (version, name, blocksize, size and clumpmagic also appear in Arena)
 */
struct ArenaHead
{
	u32int		version;
	char		name[ANameSize];
	u32int		blocksize;
	u64int		size;
	u32int		clumpmagic;
};

/*
 * most interesting meta information for a clump.
 * stored in each clump's header and in the Arena's directory,
 * stored in reverse order just prior to the arena trailer.
 * the fields add up to ClumpInfoSize bytes on disk.
 */
struct ClumpInfo
{
	u8int		type;
	u16int		size;			/* size of disk data, not including header */
	u16int		uncsize;		/* size of uncompressed data */
	u8int		score[VtScoreSize];	/* score of the uncompressed data only */
};

/*
 * header for an immutable clump of data.
 * ClumpSize counts one more 32-bit word than the fields here
 * (presumably the clump magic number -- TODO confirm).
 */
struct Clump
{
	ClumpInfo	info;
	u8int		encoding;		/* one of the ClumpE* values, judging by the names -- TODO confirm */
	u32int		creator;		/* initial client which wrote the block */
	u32int		time;			/* creation at gmt seconds since 1/1/1970 */
};

/*
 * index of all clumps according to their score
 * this is just a wrapper to tie together the index sections
 *
 * NOTE(review): the XML-style description below looks machine-readable;
 * confirm nothing parses it before editing.  The bitblocks, maxdepth,
 * bitkeylog and bitkeymask entries all take their value from s->div,
 * which looks copied from the buckdiv entry, and struct Index has no
 * members with those names.
 * <struct name="Index" type="Index *">
 *	<field name="name" val="s->name" type="AName"/>
 *	<field name="version" val="s->version" type="U32int"/>
 *	<field name="blocksize" val="s->blocksize" type="U32int"/>
 *	<field name="tabsize" val="s->tabsize" type="U32int"/>
 *	<field name="buckets" val="s->buckets" type="U32int"/>
 *	<field name="buckdiv" val="s->div" type="U32int"/>
 *	<field name="bitblocks" val="s->div" type="U32int"/>
 *	<field name="maxdepth" val="s->div" type="U32int"/>
 *	<field name="bitkeylog" val="s->div" type="U32int"/>
 *	<field name="bitkeymask" val="s->div" type="U32int"/>
 *	<array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
 *	<array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
 *	<array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
 * </struct>
 * <struct name="Amap" type="AMap *">
 *	<field name="name" val="s->name" type="AName"/>
 *	<field name="start" val="s->start" type="U64int"/>
 *	<field name="stop" val="s->stop" type="U64int"/>
 * </struct>
 */
struct Index
{
	u32int		div;			/* divisor for mapping score to bucket */
	u32int		buckets;		/* last bucket used in disk hash table */
	u32int		blocksize;
	u32int		tabsize;		/* max. bytes in index config */

	int		mapalloc;		/* first arena to check when adding a lump */
	Arena		**arenas;		/* arenas in the mapping */
	ISect		**sects;		/* sections which hold the buckets */
	Bloom		*bloom;	/* bloom filter */

	/*
	 * fields stored in config file
	 */
	u32int		version;
	char		name[ANameSize];	/* text label */
	int		nsects;
	AMap		*smap;			/* mapping of buckets to index sections */
	int		narenas;
	AMap		*amap;			/* mapping from index addresses to arenas */
	
	QLock	writing;
};

/*
 * one part of the bucket storage for an index.
 * the index blocks are sequentially allocated
 * across all of the sections.
 */
struct ISect
{
	Part		*part;			/* partition holding this section */
	int		blocklog;		/* log2(blocksize) */
	int		buckmax;		/* max. entries in a index bucket */
	u32int		tabbase;		/* base address of index config table on disk */
	u32int		tabsize;		/* max. bytes in index config */
	Channel	*writechan;
	Channel	*writedonechan;
	void		*ig;		/* used by buildindex only */
	int		ng;

	/*
	 * fields stored on disk
	 */
	u32int		version;
	u32int		bucketmagic;
	char		name[ANameSize];	/* text label */
	char		index[ANameSize];	/* index owning the section */
	u32int		blocksize;		/* size of hash buckets in index */
	u32int		blockbase;		/* address of start of on disk index table */
	u32int		blocks;			/* total blocks on disk; some may be unused */
	u32int		start;			/* first bucket in this section */
	u32int		stop;			/* limit of buckets in this section */
};

/*
 * externally interesting part of an IEntry:
 * where a block lives and how large it is
 */
struct IAddr
{
	u64int		addr;
	u16int		size;			/* uncompressed size */
	u8int		type;			/* type of block */
	u8int		blocks;			/* arena io quanta for Clump + data */
};

/*
 * entries in the index
 * kept in IBuckets in the disk index table,
 * cached in the memory ICache.
 */
struct IEntry
{
	/* on disk data - 32 bytes */
	u8int	score[VtScoreSize];
	IAddr	ia;
	
	/* the remaining fields are not described as on-disk data; presumably in-memory cache linkage */
	IEntry	*nexthash;
	IEntry	*nextdirty;
	IEntry	*next;
	IEntry	*prev;
	u8int	state;		/* presumably one of the IE* values declared after this struct -- TODO confirm */
};
/*
 * IE* state values.
 * NOTE(review): presumably the values stored in IEntry.state -- confirm.
 */
enum {
	IEClean = 0,
	IEDirty = 1,
	IESummary = 2,
};

/*
 * buckets in the on disk index table
 */
struct IBucket
{
	u16int		n;			/* number of active indices */
	u32int		buck;		/* used by buildindex/checkindex only */
	u8int		*data;			/* presumably the packed index entries -- TODO confirm */
};

/*
 * temporary buffers used by individual threads
 */
struct ZBlock
{
	u32int		len;
	u32int		_size;
	u8int		*data;
	u8int		*free;			/* presumably the pointer to hand back to the allocator -- TODO confirm */
};

/*
 * simple input buffer for a '\0' terminated text file.
 * the whole file is held in b; pos is the read cursor.
 */
struct IFile
{
	char		*name;				/* name of the file */
	ZBlock		*b;				/* entire contents of file */
	u32int		pos;				/* current position in the file */
};

/*
 * description of one statistic; statdesc[] holds one per Stat* index
 */
struct Statdesc
{
	char *name;
	ulong max;
};

/*
 * indices of the individual statistics.
 * keep in sync with stats.c:/statdesc and httpd.c:/graphname
 * the members are numbered implicitly from 0, so inserting or
 * reordering entries renumbers everything after them.
 */
enum
{
	StatRpcTotal,
	StatRpcRead,
	StatRpcReadOk,
	StatRpcReadFail,
	StatRpcReadBytes,
	StatRpcReadTime,
	StatRpcReadCached,
	StatRpcReadCachedTime,
	StatRpcReadUncached,
	StatRpcReadUncachedTime,
	StatRpcWrite,
	StatRpcWriteNew,
	StatRpcWriteOld,
	StatRpcWriteFail,
	StatRpcWriteBytes,
	StatRpcWriteTime,
	StatRpcWriteNewTime,
	StatRpcWriteOldTime,

	StatLcacheHit,
	StatLcacheMiss,
	StatLcacheRead,
	StatLcacheWrite,
	StatLcacheSize,
	StatLcacheStall,
	StatLcacheReadTime,

	StatDcacheHit,
	StatDcacheMiss,
	StatDcacheLookup,
	StatDcacheRead,
	StatDcacheWrite,
	StatDcacheDirty,
	StatDcacheSize,
	StatDcacheFlush,
	StatDcacheStall,
	StatDcacheLookupTime,

	StatDblockStall,
	StatLumpStall,

	StatIcacheHit,
	StatIcacheMiss,
	StatIcacheRead,
	StatIcacheWrite,
	StatIcacheFill,
	StatIcachePrefetch,
	StatIcacheDirty,
	StatIcacheSize,
	StatIcacheFlush,
	StatIcacheStall,
	StatIcacheReadTime,
	StatIcacheLookup,
	StatScacheHit,
	StatScachePrefetch,

	StatBloomHit,
	StatBloomMiss,
	StatBloomFalseMiss,
	StatBloomLookup,
	StatBloomOnes,
	StatBloomBits,

	StatApartRead,
	StatApartReadBytes,
	StatApartWrite,
	StatApartWriteBytes,

	StatIsectRead,
	StatIsectReadBytes,
	StatIsectWrite,
	StatIsectWriteBytes,

	StatSumRead,
	StatSumReadBytes,
	
	StatCigLoad,
	StatCigLoadTime,

	NStat		/* number of statistics; sizes statdesc[] and Stats.n[] */
};

/* one descriptor per statistic, indexed by the Stat* values above */
extern Statdesc statdesc[NStat];

/*
 * statistics about the operation of the server
 * mainly for performance monitoring and profiling.
 */
struct Stats
{
	ulong		now;			/* presumably the time the sample was taken -- TODO confirm */
	ulong		n[NStat];		/* one value per Stat* index */
};

/*
 * summary of a group of samples.
 * NOTE(review): judging by the names, the sample count with the minimum,
 * maximum and average value -- confirm against the code that fills it in.
 */
struct Statbin
{
	uint nsamp;
	uint min;
	uint max;
	uint avg;
};

/*
 * parameters describing one graph.
 * NOTE(review): the meaning of the fields is inferred from their names
 * (time range t0..t1, value range min..max, size wid x ht) -- confirm
 * against the code that draws graphs.
 */
struct Graph
{
	long (*fn)(Stats*, Stats*, void*);	/* callback taking two Stats and an opaque pointer */
	void *arg;				/* presumably handed to fn as its last argument -- TODO confirm */
	long t0;
	long t1;
	long min;
	long max;
	long wid;
	long ht;
	int fill;
};

/*
 * for kicking background processes that run one round after another after another
 */
struct Round
{
	QLock	lock;
	Rendez	start;
	Rendez	finish;
	Rendez	delaywait;
	int		delaytime;
	int		delaykick;
	char*	name;
	/* round counters; presumably last finished, currently running, next to run -- TODO confirm */
	int		last;
	int		current;
	int		next;
	int		doanother;
};

/*
 * Bloom filter of stored block hashes
 *
 * NOTE(review): the lock comments below name nbits, tab, mb and nb,
 * none of which are members of this struct; they presumably correspond
 * to size, bitmask and data -- confirm and update.
 */
struct Bloom
{
	RWLock lk;		/* protects nhash, nbits, tab, mb */
	QLock mod;		/* one marker at a time, protects nb */
	int nhash;
	ulong size;		/* bytes in tab */
	ulong bitmask;		/* to produce bit index */
	u8int *data;
	Part *part;
	Channel *writechan;
	Channel *writedonechan;
};

/*
 * global state shared between the server's source files;
 * declared here, defined elsewhere.
 */
extern	Index		*mainindex;
extern	u32int		maxblocksize;		/* max. block size used by any partition */
extern	int		paranoid;		/* should verify hashes on disk read */
extern	int		queuewrites;		/* put all lump writes on a queue and finish later */
extern	int		readonly;		/* only allowed to read the disk data */
extern	Stats		stats;
extern	u8int		zeroscore[VtScoreSize];
extern	int		compressblocks;
extern	int		writestodevnull;	/* dangerous - for performance debugging */
extern	int		collectstats;
extern	QLock	memdrawlock;
extern	int		icachesleeptime;
extern	int		minicachesleeptime;
extern	int		arenasumsleeptime;
extern	int		manualscheduling;
extern	int		l0quantum;
extern	int		l1quantum;
extern	int		ignorebloom;
extern	int		icacheprefetch;
extern	int		syncwrites;
extern	int		debugarena; /* print in arena error msgs; -1==unknown */

/* history of Stats samples; presumably nstathist entries -- TODO confirm */
extern	Stats	*stathist;
extern	int	nstathist;
extern	ulong	stattime;

/*
 * definitions used only when PLAN9PORT is not defined.
 * the varargck pragma is a Plan 9 compiler directive; presumably it
 * declares that the print verb "V" takes a uchar* argument -- TODO
 * confirm.  ODIRECT is defined as 0 here, so or-ing it into a set of
 * flags changes nothing.
 */
#ifndef PLAN9PORT
#pragma varargck type "V" uchar*
#define ODIRECT 0
#endif