ref: 843b2168c6a41b5fd91a7e9993d14dc0c186d4ca
parent: 90d8e4d9442cd3219211f12a2cc66ee5418c6b57
author: Ori Bernstein <ori@eigenstate.org>
date: Sun May 26 20:42:36 EDT 2024
gefs: tweak block address heuristic to sequentialize allocation When churning through data blocks and writing out of order, we would copy blocks and release them often, leading to gaps at the back of the file. Retrying an allocation could also lead to allocating a block in the same arena as the metadata. This would leave gaps in the allocated files, which would cause seeks when accessing the data later. This set of changes allocates blocks that are expected to be accessed sequentially from the start of the arena, and blocks that are expected to be accessed out of order at the end of the arena. For full data blocks (ie, where the write is at the end of the block), we assume that the accesses will be sequential. For partial data blocks, when the write offset is not at the end of the data block, we assume that future appends are inbound, and we allocate the block in the non-sequential part of the arena. As a result, when we eventually fill the block, we will allocate the sequential block. This doesn't help us too much if we have writes to files interleaved, or we overwrite the same section of a file over and over, but it's better than nothing.
--- a/sys/src/cmd/gefs/blk.c
+++ b/sys/src/cmd/gefs/blk.c
@@ -7,8 +7,8 @@
#include "fns.h"
#include "atomic.h"
-static vlong blkalloc_lk(Arena*);
-static vlong blkalloc(int, uint);
+static vlong blkalloc_lk(Arena*, int);
+static vlong blkalloc(int, uint, int);
static void blkdealloc_lk(Arena*, vlong);
static Blk* initblk(Blk*, vlong, vlong, int);
@@ -286,7 +286,7 @@
* and chaining.
*/
if(lb == nil || lb->logsz >= Logspc - Logslop){
- o = blkalloc_lk(a);
+ o = blkalloc_lk(a, 0);
if(o == -1)
error(Efull);
nl = mklogblk(a, o);
@@ -430,7 +430,7 @@
nexterror();
}
for(i = 0; i < nblks; i++){
- blks[i] = blkalloc_lk(a);
+ blks[i] = blkalloc_lk(a, 1);
if(blks[i] == -1)
error(Efull);
}
@@ -495,14 +495,15 @@
* the alloc log.
*/
static vlong
-blkalloc_lk(Arena *a)
+blkalloc_lk(Arena *a, int seq)
{
- Avltree *t;
Arange *r;
vlong b;
- t = a->free;
- r = (Arange*)t->root;
+ if(seq)
+ r = (Arange*)avlmin(a->free);
+ else
+ r = (Arange*)avlmax(a->free);
if(!usereserve && a->size - a->used <= a->reserve)
return -1;
if(r == nil)
@@ -515,11 +516,16 @@
* the sort order because the tree
* covers disjoint ranges
*/
- b = r->off;
- r->len -= Blksz;
- r->off += Blksz;
+ if(seq){
+ b = r->off;
+ r->len -= Blksz;
+ r->off += Blksz;
+ }else{
+ r->len -= Blksz;
+ b = r->off + r->len;
+ }
if(r->len == 0){
- avldelete(t, r);
+ avldelete(a->free, r);
free(r);
}
a->used += Blksz;
@@ -548,7 +554,7 @@
}
static vlong
-blkalloc(int ty, uint hint)
+blkalloc(int ty, uint hint, int seq)
{
Arena *a;
vlong b;
@@ -575,7 +581,7 @@
qunlock(a);
nexterror();
}
- b = blkalloc_lk(a);
+ b = blkalloc_lk(a, seq);
if(b == -1){
qunlock(a);
poperror();
@@ -633,13 +639,28 @@
}
Blk*
-newblk(Tree *t, int ty, vlong hint)
+newdblk(Tree *t, vlong hint, int seq)
{
vlong bp;
Blk *b;
- bp = blkalloc(ty, hint);
+ bp = blkalloc(Tdat, hint, seq);
b = cachepluck();
+ initblk(b, bp, t->memgen, Tdat);
+ b->alloced = getcallerpc(&t);
+ tracex("newblk" , b->bp, Tdat, -1);
+ return b;
+
+}
+
+Blk*
+newblk(Tree *t, int ty)
+{
+ vlong bp;
+ Blk *b;
+
+ bp = blkalloc(ty, 0, 0);
+ b = cachepluck();
initblk(b, bp, t->memgen, ty);
b->alloced = getcallerpc(&t);
tracex("newblk" , b->bp, ty, -1);
@@ -651,7 +672,7 @@
{
Blk *r;
- if((r = newblk(t, b->type, 0)) == nil)
+ if((r = newblk(t, b->type)) == nil)
return nil;
tracex("dup" , b->bp, b->type, t->gen);
--- a/sys/src/cmd/gefs/fns.h
+++ b/sys/src/cmd/gefs/fns.h
@@ -35,8 +35,9 @@
void* emalloc(usize, int);
-Blk* newblk(Tree *, int, vlong);
-Blk* dupblk(Tree *, Blk*);
+Blk* newdblk(Tree*, vlong, int);
+Blk* newblk(Tree*, int);
+Blk* dupblk(Tree*, Blk*);
Blk* getroot(Tree*, int*);
Blk* getblk(Bptr, int);
Blk* holdblk(Blk*);
--- a/sys/src/cmd/gefs/fs.c
+++ b/sys/src/cmd/gefs/fs.c
@@ -471,6 +471,7 @@
char buf[Kvmax];
vlong fb, fo;
Blk *b, *t;
+ int seq;
Tree *r;
Bptr bp;
Kvp kv;
@@ -482,7 +483,11 @@
PACK64(m->k+1, f->qpath);
PACK64(m->k+9, fb);
- b = newblk(f->mnt->root, Tdat, f->qpath);
+ if(fo+n >= Blksz)
+ seq = 1;
+ else
+ seq = 0;
+ b = newdblk(f->mnt->root, f->qpath, seq);
t = nil;
r = f->mnt->root;
if(btlookup(r, m, &kv, buf, sizeof(buf))){
--- a/sys/src/cmd/gefs/ream.c
+++ b/sys/src/cmd/gefs/ream.c
@@ -286,7 +286,7 @@
loadlog(a, a->loghd);
}
- if((mb = newblk(mnt->root, Tleaf, 0)) == nil)
+ if((mb = newblk(mnt->root, Tleaf)) == nil)
sysfatal("ream: allocate root: %r");
holdblk(mb);
initroot(mb);
@@ -296,9 +296,9 @@
mnt->root->ht = 1;
mnt->root->bp = mb->bp;
- if((ab = newblk(adm->root, Tleaf, 0)) == nil)
+ if((ab = newblk(adm->root, Tleaf)) == nil)
sysfatal("ream: allocate root: %r");
- if((ub = newblk(adm->root, Tdat, 0)) == nil)
+ if((ub = newdblk(adm->root, 0, 1)) == nil)
sysfatal("ream: allocate root: %r");
holdblk(ab);
holdblk(ub);
@@ -322,7 +322,7 @@
* a single snap block that the tree will insert
* into, and take a snapshot as the initial state.
*/
- if((tb = newblk(mnt->root, Tleaf, 0)) == nil)
+ if((tb = newblk(mnt->root, Tleaf)) == nil)
sysfatal("ream: allocate snaps: %r");
holdblk(tb);
initsnap(tb, mb, ab);
--- a/sys/src/cmd/gefs/snap.c
+++ b/sys/src/cmd/gefs/snap.c
@@ -596,7 +596,7 @@
nexterror();
}
if(dl->ins == nil || Logspc - dl->ins->logsz < Logslop){
- b = newblk(&fs->snap, Tdlist, 0);
+ b = newblk(&fs->snap, Tdlist);
if(dl->ins != nil){
enqueue(dl->ins);
dropblk(dl->ins);
--- a/sys/src/cmd/gefs/tree.c
+++ b/sys/src/cmd/gefs/tree.c
@@ -482,7 +482,7 @@
*/
full = 0;
spc = Leafspc - blkfill(b);
- n = newblk(t, b->type, 0);
+ n = newblk(t, b->type);
assert(i >= 0 && j >= 0);
while(i < b->nval || j < up->hi){
if(i >= b->nval)
@@ -573,7 +573,7 @@
Msg m, u;
b = p->b;
- n = newblk(t, b->type, 0);
+ n = newblk(t, b->type);
for(i = 0; i < b->nval; i++){
if(pp != nil && i == p->midx){
copyup(n, pp, nil);
@@ -657,8 +657,8 @@
efreeblk(t, r);
nexterror();
}
- l = newblk(t, b->type, 0);
- r = newblk(t, b->type, 0);
+ l = newblk(t, b->type);
+ r = newblk(t, b->type);
d = l;
i = 0;
@@ -770,8 +770,8 @@
efreeblk(t, r);
nexterror();
}
- l = newblk(t, b->type, 0);
- r = newblk(t, b->type, 0);
+ l = newblk(t, b->type);
+ r = newblk(t, b->type);
d = l;
copied = 0;
halfsz = (2*b->nval + b->valsz)/2;
@@ -820,7 +820,7 @@
Msg m;
int i;
- d = newblk(t, a->type, 0);
+ d = newblk(t, a->type);
for(i = 0; i < a->nval; i++){
getval(a, i, &m);
setval(d, &m);
@@ -904,8 +904,8 @@
efreeblk(t, r);
nexterror();
}
- l = newblk(t, a->type, 0);
- r = newblk(t, a->type, 0);
+ l = newblk(t, a->type);
+ r = newblk(t, a->type);
d = l;
cp = 0;
sp = -1;
@@ -1088,7 +1088,7 @@
}
if(pp->nl != nil && pp->nr != nil){
rp = &path[0];
- rp->nl = newblk(t, Tpivot, 0);
+ rp->nl = newblk(t, Tpivot);
rp->npull = pp->npull;
rp->pullsz = pp->pullsz;
copyup(rp->nl, pp, nil);