ref: a4a975e8ac706cc2d9ef954c250b7ae4dd2e0e08
dir: /sys/src/cmd/venti/srv/fixarenas.c/
/* * Check and fix an arena partition. * * This is a lot grittier than the rest of Venti because * it can't just give up if a byte here or there is wrong. * * The rule here (hopefully followed!) is that block corruption * only ever has a local effect -- there are no blocks that you * can wipe out that will cause large portions of * uncorrupted data blocks to be useless. */ #include "stdinc.h" #include "dat.h" #include "fns.h" #include "whack.h" #define ROUNDUP(x,n) (((x)+(n)-1)&~((n)-1)) #pragma varargck type "Z" uvlong #pragma varargck type "Z" vlong #pragma varargck type "t" uint enum { K = 1024, M = 1024*1024, G = 1024*1024*1024, Block = 4096, }; int debugsha1; int verbose; Part *part; char *file; char *basename; char *dumpbase; int fix; int badreads; int unseal; uchar zero[MaxDiskBlock]; Arena lastarena; ArenaPart ap; uvlong arenasize; int nbadread; int nbad; uvlong partend; void checkarena(vlong, int); void usage(void) { fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n"); threadexitsall(0); } /* * Format number in simplest way that is okay with unittoull. */ static int Zfmt(Fmt *fmt) { vlong x; x = va_arg(fmt->args, vlong); if(x == 0) return fmtstrcpy(fmt, "0"); if(x%G == 0) return fmtprint(fmt, "%lldG", x/G); if(x%M == 0) return fmtprint(fmt, "%lldM", x/M); if(x%K == 0) return fmtprint(fmt, "%lldK", x/K); return fmtprint(fmt, "%lld", x); } /* * Format time like ctime without newline. */ static int tfmt(Fmt *fmt) { uint t; char buf[30]; t = va_arg(fmt->args, uint); strcpy(buf, ctime(t)); buf[28] = 0; return fmtstrcpy(fmt, buf); } /* * Coalesce messages about unreadable sectors into larger ranges. * bad(0, 0) flushes the buffer. */ static void bad(char *msg, vlong o, int len) { static vlong lb0, lb1; static char *lmsg; if(msg == nil) msg = lmsg; if(o == -1){ lmsg = nil; lb0 = 0; lb1 = 0; return; } if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){ if(lb0 != lb1) print("%s %#llux+%#llux (%,lld+%,lld)\n", lmsg, lb0, lb1-lb0, lb0, lb1-lb0); lb0 = o; } lmsg = msg; lb1 = o+len; } /* * Read in the len bytes of data at the offset. If can't for whatever reason, * fill it with garbage but print an error. */ static uchar* readdisk(uchar *buf, vlong offset, int len) { int i, j, k, n; if(offset >= partend){ memset(buf, 0xFB, sizeof buf); return buf; } if(offset+len > partend){ memset(buf, 0xFB, sizeof buf); len = partend - offset; } if(readpart(part, offset, buf, len) >= 0) return buf; /* * The read failed. Clear the buffer to nonsense, and * then try reading in smaller pieces. If that fails, * read in even smaller pieces. And so on down to sectors. */ memset(buf, 0xFD, len); for(i=0; i<len; i+=64*K){ n = 64*K; if(i+n > len) n = len-i; if(readpart(part, offset+i, buf+i, n) >= 0) continue; for(j=i; j<len && j<i+64*K; j+=4*K){ n = 4*K; if(j+n > len) n = len-j; if(readpart(part, offset+j, buf+j, n) >= 0) continue; for(k=j; k<len && k<j+4*K; k+=512){ if(readpart(part, offset+k, buf+k, 512) >= 0) continue; bad("disk read failed at", k, 512); badreads++; } } } bad(nil, 0, 0); return buf; } /* * Buffer to support running SHA1 hash of the disk. */ typedef struct Shabuf Shabuf; struct Shabuf { int fd; vlong offset; DigestState state; int rollback; vlong r0; DigestState *hist; int nhist; }; void sbdebug(Shabuf *sb, char *file) { int fd; if(sb->fd > 0){ close(sb->fd); sb->fd = 0; } if((fd = create(file, OWRITE, 0666)) < 0) return; if(fd == 0){ fd = dup(fd, -1); close(0); } sb->fd = fd; } void sbupdate(Shabuf *sb, uchar *p, vlong offset, int len) { int n, x; vlong o; if(sb->rollback && !sb->hist){ sb->r0 = offset; sb->nhist = 1; sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist); memset(sb->hist, 0, sizeof sb->hist[0]); } if(sb->r0 == 0) sb->r0 = offset; if(sb->offset < offset || sb->offset >= offset+len){ if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n", p, offset, len, sb->offset); return; } x = sb->offset - offset; if(0) print("sbupdate %p %#llux+%d skip %d\n", sb, offset, len, x); if(x){ p += x; offset += x; len -= x; } assert(sb->offset == offset); if(sb->fd > 0) pwrite(sb->fd, p, len, offset - sb->r0); if(!sb->rollback){ sha1(p, len, nil, &sb->state); sb->offset += len; return; } /* save state every 4M so we can roll back quickly */ o = offset - sb->r0; while(len > 0){ n = 4*M - o%(4*M); if(n > len) n = len; sha1(p, n, nil, &sb->state); sb->offset += n; o += n; p += n; len -= n; if(o%(4*M) == 0){ x = o/(4*M); if(x >= sb->nhist){ if(x != sb->nhist) print("oops! x=%d nhist=%d\n", x, sb->nhist); sb->nhist += 32; sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist); } sb->hist[x] = sb->state; } } } void sbdiskhash(Shabuf *sb, vlong eoffset) { static uchar dbuf[4*M]; int n; while(sb->offset < eoffset){ n = sizeof dbuf; if(sb->offset+n > eoffset) n = eoffset - sb->offset; readdisk(dbuf, sb->offset, n); sbupdate(sb, dbuf, sb->offset, n); } } void sbrollback(Shabuf *sb, vlong offset) { int x; vlong o; Dir d; if(!sb->rollback || !sb->r0){ print("cannot rollback sha\n"); return; } if(offset >= sb->offset) return; o = offset - sb->r0; x = o/(4*M); if(x >= sb->nhist){ print("cannot rollback sha\n"); return; } sb->state = sb->hist[x]; sb->offset = sb->r0 + x*4*M; assert(sb->offset <= offset); if(sb->fd > 0){ nulldir(&d); d.length = sb->offset - sb->r0; dirfwstat(sb->fd, &d); } } void sbscore(Shabuf *sb, uchar *score) { if(sb->hist){ free(sb->hist); sb->hist = nil; } sha1(nil, 0, score, &sb->state); } /* * If we're fixing arenas, then editing this memory edits the disk! * It will be written back out as new data is paged in. */ uchar buf[4*M]; uchar sbuf[4*M]; vlong bufoffset; int buflen; static void pageout(void); static uchar* pagein(vlong offset, int len) { pageout(); if(offset >= partend){ memset(buf, 0xFB, sizeof buf); return buf; } if(offset+len > partend){ memset(buf, 0xFB, sizeof buf); len = partend - offset; } bufoffset = offset; buflen = len; readdisk(buf, offset, len); memmove(sbuf, buf, len); return buf; } static void pageout(void) { if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){ buflen = 0; return; } if(writepart(part, bufoffset, buf, buflen) < 0) print("disk write failed at %#llux+%#ux (%,lld+%,d)\n", bufoffset, buflen, bufoffset, buflen); buflen = 0; } static void zerorange(vlong offset, int len) { int i; vlong ooff; int olen; enum { MinBlock = 4*K, MaxBlock = 8*K }; if(0) if(bufoffset <= offset && offset+len <= bufoffset+buflen){ memset(buf+(offset-bufoffset), 0, len); return; } ooff = bufoffset; olen = buflen; i = offset%MinBlock; if(i+len < MaxBlock){ pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1)); memset(buf+i, 0, len); }else{ pagein(offset-i, MaxBlock); memset(buf+i, 0, MaxBlock-i); offset += MaxBlock-i; len -= MaxBlock-i; while(len >= MaxBlock){ pagein(offset, MaxBlock); memset(buf, 0, MaxBlock); offset += MaxBlock; len -= MaxBlock; } pagein(offset, (len+MinBlock-1)&~(MinBlock-1)); memset(buf, 0, len); } pagein(ooff, olen); } /* * read/write integers * static void p16(uchar *p, u16int u) { p[0] = (u>>8) & 0xFF; p[1] = u & 0xFF; } */ static u16int u16(uchar *p) { return (p[0]<<8)|p[1]; } static void p32(uchar *p, u32int u) { p[0] = (u>>24) & 0xFF; p[1] = (u>>16) & 0xFF; p[2] = (u>>8) & 0xFF; p[3] = u & 0xFF; } static u32int u32(uchar *p) { return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3]; } /* static void p64(uchar *p, u64int u) { p32(p, u>>32); p32(p, u); } */ static u64int u64(uchar *p) { return ((u64int)u32(p)<<32) | u32(p+4); } static int vlongcmp(const void *va, const void *vb) { vlong a, b; a = *(vlong*)va; b = *(vlong*)vb; if(a < b) return -1; if(b > a) return 1; return 0; } /* D and S are in draw.h */ #define D VD #define S VS enum { D = 0x10000, Z = 0x20000, S = 0x30000, T = 0x40000, N = 0xFFFF }; typedef struct Info Info; struct Info { int len; char *name; }; Info partinfo[] = { 4, "magic", D|4, "version", Z|4, "blocksize", 4, "arenabase", 0 }; Info headinfo4[] = { 4, "magic", D|4, "version", S|ANameSize, "name", Z|4, "blocksize", Z|8, "size", 0 }; Info headinfo5[] = { 4, "magic", D|4, "version", S|ANameSize, "name", Z|4, "blocksize", Z|8, "size", 4, "clumpmagic", 0 }; Info tailinfo4[] = { 4, "magic", D|4, "version", S|ANameSize, "name", D|4, "clumps", D|4, "cclumps", T|4, "ctime", T|4, "wtime", D|8, "used", D|8, "uncsize", 1, "sealed", 0 }; Info tailinfo4a[] = { /* tailinfo 4 */ 4, "magic", D|4, "version", S|ANameSize, "name", D|4, "clumps", D|4, "cclumps", T|4, "ctime", T|4, "wtime", D|8, "used", D|8, "uncsize", 1, "sealed", /* mem stats */ 1, "extension", D|4, "mem.clumps", D|4, "mem.cclumps", D|8, "mem.used", D|8, "mem.uncsize", 1, "mem.sealed", 0 }; Info tailinfo5[] = { 4, "magic", D|4, "version", S|ANameSize, "name", D|4, "clumps", D|4, "cclumps", T|4, "ctime", T|4, "wtime", 4, "clumpmagic", D|8, "used", D|8, "uncsize", 1, "sealed", 0 }; Info tailinfo5a[] = { /* tailinfo 5 */ 4, "magic", D|4, "version", S|ANameSize, "name", D|4, "clumps", D|4, "cclumps", T|4, "ctime", T|4, "wtime", 4, "clumpmagic", D|8, "used", D|8, "uncsize", 1, "sealed", /* mem stats */ 1, "extension", D|4, "mem.clumps", D|4, "mem.cclumps", D|8, "mem.used", D|8, "mem.uncsize", 1, "mem.sealed", 0 }; void showdiffs(uchar *want, uchar *have, int len, Info *info) { int n; while(len > 0 && (n=info->len&N) > 0){ if(memcmp(have, want, n) != 0){ switch(info->len){ case 1: print("\t%s: correct=%d disk=%d\n", info->name, *want, *have); break; case 4: print("\t%s: correct=%#ux disk=%#ux\n", info->name, u32(want), u32(have)); break; case D|4: print("\t%s: correct=%,ud disk=%,ud\n", info->name, u32(want), u32(have)); break; case T|4: print("\t%s: correct=%t\n\t\tdisk=%t\n", info->name, u32(want), u32(have)); break; case Z|4: print("\t%s: correct=%Z disk=%Z\n", info->name, (uvlong)u32(want), (uvlong)u32(have)); break; case D|8: print("\t%s: correct=%,lld disk=%,lld\n", info->name, u64(want), u64(have)); break; case Z|8: print("\t%s: correct=%Z disk=%Z\n", info->name, u64(want), u64(have)); break; case S|ANameSize: print("\t%s: correct=%s disk=%.*s\n", info->name, (char*)want, utfnlen((char*)have, ANameSize-1), (char*)have); break; default: print("\t%s: correct=%.*H disk=%.*H\n", info->name, n, want, n, have); break; } } have += n; want += n; len -= n; info++; } if(len > 0 && memcmp(have, want, len) != 0){ if(memcmp(want, zero, len) != 0) print("!!\textra want data in showdiffs (bug in fixarenas)\n"); else print("\tnon-zero data on disk after structure\n"); if(verbose > 1){ print("want: %.*H\n", len, want); print("have: %.*H\n", len, have); } } } /* * Does part begin with an arena? */ int isonearena(void) { return u32(pagein(0, Block)) == ArenaHeadMagic; } static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, }; /* * Poke around on the disk to guess what the ArenaPart numbers are. */ void guessgeometry(void) { int i, j, n, bestn, ndiff, nhead, ntail; uchar *p, *ep, *sp; u64int diff[100], head[20], tail[20]; u64int offset, bestdiff; ap.version = ArenaPartVersion; if(arenasize == 0 || ap.blocksize == 0){ /* * The ArenaPart block at offset PartBlank may be corrupt or just wrong. * Instead, look for the individual arena headers and tails, which there * are many of, and once we've seen enough, infer the spacing. * * Of course, nothing in the file format requires that arenas be evenly * spaced, but fmtarenas always does that for us. */ nhead = 0; ntail = 0; for(offset=PartBlank; offset<partend; offset+=4*M){ p = pagein(offset, 4*M); for(sp=p, ep=p+4*M; p<ep; p+=K){ if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){ if(verbose) print("arena head at %#llx\n", offset+(p-sp)); head[nhead++] = offset+(p-sp); } if(u32(p) == ArenaMagic && ntail < nelem(tail)){ tail[ntail++] = offset+(p-sp); if(verbose) print("arena tail at %#llx\n", offset+(p-sp)); } } if(nhead == nelem(head) && ntail == nelem(tail)) break; } if(nhead < 3 && ntail < 3) sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail); /* * Arena size is likely the most common * inter-head or inter-tail spacing. */ ndiff = 0; for(i=1; i<nhead; i++) diff[ndiff++] = head[i] - head[i-1]; for(i=1; i<ntail; i++) diff[ndiff++] = tail[i] - tail[i-1]; qsort(diff, ndiff, sizeof diff[0], vlongcmp); bestn = 0; bestdiff = 0; for(i=1, n=1; i<=ndiff; i++, n++){ if(i==ndiff || diff[i] != diff[i-1]){ if(n > bestn){ bestn = n; bestdiff = diff[i-1]; } n = 0; } } print("arena size likely %Z (%d of %d)\n", bestdiff, bestn, ndiff); if(arenasize != 0 && arenasize != bestdiff) print("using user-specified size %Z instead\n", arenasize); else arenasize = bestdiff; /* * The arena tail for an arena is arenasize-blocksize from the head. */ ndiff = 0; for(i=j=0; i<nhead && j<ntail; ){ if(tail[j] < head[i]){ j++; continue; } if(tail[j] < head[i]+arenasize){ diff[ndiff++] = head[i]+arenasize - tail[j]; j++; continue; } i++; } if(ndiff < 3) sysfatal("too few intact arenas: %d head, tail pairs", ndiff); qsort(diff, ndiff, sizeof diff[0], vlongcmp); bestn = 0; bestdiff = 0; for(i=1, n=1; i<=ndiff; i++, n++){ if(i==ndiff || diff[i] != diff[i-1]){ if(n > bestn){ bestn = n; bestdiff = diff[i-1]; } n = 0; } } print("block size likely %Z (%d of %d)\n", bestdiff, bestn, ndiff); if(ap.blocksize != 0 && ap.blocksize != bestdiff) print("using user-specified size %Z instead\n", (vlong)ap.blocksize); else ap.blocksize = bestdiff; if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1)) sysfatal("block size not a power of two"); if(ap.blocksize > MaxDiskBlock) sysfatal("block size too big (max=%d)", MaxDiskBlock); /* * Use head/tail information to deduce arena base. */ ndiff = 0; for(i=0; i<nhead; i++) diff[ndiff++] = head[i]%arenasize; for(i=0; i<ntail; i++) diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize; qsort(diff, ndiff, sizeof diff[0], vlongcmp); bestn = 0; bestdiff = 0; for(i=1, n=1; i<=ndiff; i++, n++){ if(i==ndiff || diff[i] != diff[i-1]){ if(n > bestn){ bestn = n; bestdiff = diff[i-1]; } n = 0; } } ap.arenabase = bestdiff; } ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize); /* * XXX pick up table, check arenabase. * XXX pick up table, record base name. */ /* * Somewhat standard computation. * Fmtarenas used to use 64k tab, now uses 512k tab. */ if(ap.arenabase == 0){ print("trying standard arena bases...\n"); for(i=0; i<nelem(tabsizes); i++){ ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize); p = pagein(ap.arenabase, Block); if(u32(p) == ArenaHeadMagic) break; } } p = pagein(ap.arenabase, Block); print("arena base likely %Z%s\n", (vlong)ap.arenabase, u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : ""); ap.tabsize = ap.arenabase - ap.tabbase; } /* * Check the arena partition blocks and then the arenas listed in range. */ void checkarenas(char *range) { char *s, *t; int i, lo, hi, narena; uchar dbuf[HeadSize]; uchar *p; guessgeometry(); partend -= partend%ap.blocksize; memset(dbuf, 0, sizeof dbuf); packarenapart(&ap, dbuf); p = pagein(PartBlank, Block); if(memcmp(p, dbuf, HeadSize) != 0){ print("on-disk arena part superblock incorrect\n"); showdiffs(dbuf, p, HeadSize, partinfo); } memmove(p, dbuf, HeadSize); narena = (partend-ap.arenabase + arenasize-1)/arenasize; if(range == nil){ for(i=0; i<narena; i++) checkarena(ap.arenabase+(vlong)i*arenasize, i); }else if(strcmp(range, "none") == 0){ /* nothing */ }else{ /* parse, e.g., -4,8-9,10- */ for(s=range; *s; s=t){ t = strchr(s, ','); if(t) *t++ = 0; else t = s+strlen(s); if(*s == '-') lo = 0; else lo = strtol(s, &s, 0); hi = lo; if(*s == '-'){ s++; if(*s == 0) hi = narena-1; else hi = strtol(s, &s, 0); } if(*s != 0){ print("bad arena range: %s\n", s); continue; } for(i=lo; i<=hi; i++) checkarena(ap.arenabase+(vlong)i*arenasize, i); } } } /* * Is there a clump here at p? */ static int isclump(uchar *p, Clump *cl, u32int *pmagic) { int n; u32int magic; uchar score[VtScoreSize], *bp; Unwhack uw; uchar ubuf[70*1024]; bp = p; magic = u32(p); if(magic == 0) return 0; p += U32Size; cl->info.type = vtfromdisktype(*p); if(cl->info.type == 0xFF) return 0; p++; cl->info.size = u16(p); p += U16Size; cl->info.uncsize = u16(p); if(cl->info.size > cl->info.uncsize) return 0; p += U16Size; scorecp(cl->info.score, p); p += VtScoreSize; cl->encoding = *p; p++; cl->creator = u32(p); p += U32Size; cl->time = u32(p); p += U32Size; switch(cl->encoding){ case ClumpENone: if(cl->info.size != cl->info.uncsize) return 0; scoremem(score, p, cl->info.size); if(scorecmp(score, cl->info.score) != 0) return 0; break; case ClumpECompress: if(cl->info.size >= cl->info.uncsize) return 0; unwhackinit(&uw); n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size); if(n != cl->info.uncsize) return 0; scoremem(score, ubuf, cl->info.uncsize); if(scorecmp(score, cl->info.score) != 0) return 0; break; default: return 0; } p += cl->info.size; /* it all worked out in the end */ *pmagic = magic; return p - bp; } /* * All ClumpInfos seen in this arena. * Kept in binary tree so we can look up by score. */ typedef struct Cit Cit; struct Cit { int left; int right; vlong corrupt; ClumpInfo ci; }; Cit *cibuf; int ciroot; int ncibuf, mcibuf; void resetcibuf(void) { ncibuf = 0; ciroot = -1; } int* ltreewalk(int *p, uchar *score) { int i; for(;;){ if(*p == -1) return p; i = scorecmp(cibuf[*p].ci.score, score); if(i == 0) return p; if(i < 0) p = &cibuf[*p].right; else p = &cibuf[*p].left; } } void addcibuf(ClumpInfo *ci, vlong corrupt) { Cit *cit; if(ncibuf == mcibuf){ mcibuf += 131072; cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]); } cit = &cibuf[ncibuf]; cit->ci = *ci; cit->left = -1; cit->right = -1; cit->corrupt = corrupt; if(!corrupt) *ltreewalk(&ciroot, ci->score) = ncibuf; ncibuf++; } void addcicorrupt(vlong len) { static ClumpInfo zci; addcibuf(&zci, len); } int haveclump(uchar *score) { int i; int p; p = ciroot; for(;;){ if(p == -1) return 0; i = scorecmp(cibuf[p].ci.score, score); if(i == 0) return 1; if(i < 0) p = cibuf[p].right; else p = cibuf[p].left; } } int matchci(ClumpInfo *ci, uchar *p) { if(ci->type != vtfromdisktype(p[0])) return 0; if(ci->size != u16(p+1)) return 0; if(ci->uncsize != u16(p+3)) return 0; if(scorecmp(ci->score, p+5) != 0) return 0; return 1; } int sealedarena(uchar *p, int blocksize) { int v, n; v = u32(p+4); switch(v){ default: return 0; case ArenaVersion4: n = ArenaSize4; break; case ArenaVersion5: n = ArenaSize5; break; } if(p[n-1] != 1){ print("arena tail says not sealed\n"); return 0; } if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){ print("arena tail followed by non-zero data\n"); return 0; } if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){ print("arena score zero\n"); return 0; } return 1; } int okayname(char *name, int n) { char buf[20]; if(nameok(name) < 0) return 0; sprint(buf, "%d", n); if(n == 0) buf[0] = 0; if(strlen(name) < strlen(buf) || strcmp(name+strlen(name)-strlen(buf), buf) != 0) return 0; return 1; } int clumpinfocmp(ClumpInfo *a, ClumpInfo *b) { if(a->type != b->type) return a->type - b->type; if(a->size != b->size) return a->size - b->size; if(a->uncsize != b->uncsize) return a->uncsize - b->uncsize; return scorecmp(a->score, b->score); } ClumpInfo* loadci(vlong offset, Arena *arena, int nci) { int i, j, per; uchar *p, *sp; ClumpInfo *bci, *ci; per = arena->blocksize/ClumpInfoSize; bci = vtmalloc(nci*sizeof bci[0]); ci = bci; offset += arena->size - arena->blocksize; p = sp = nil; for(i=0; i<nci; i+=per){ if(p == sp){ sp = pagein(offset-4*M, 4*M); p = sp+4*M; } p -= arena->blocksize; offset -= arena->blocksize; for(j=0; j<per && i+j<nci; j++) unpackclumpinfo(ci++, p+j*ClumpInfoSize); } return bci; } vlong writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci) { int i, j, per; uchar *p, *sp; per = arena->blocksize/ClumpInfoSize; offset += arena->size - arena->blocksize; p = sp = nil; for(i=0; i<nci; i+=per){ if(p == sp){ sp = pagein(offset-4*M, 4*M); p = sp+4*M; } p -= arena->blocksize; offset -= arena->blocksize; memset(p, 0, arena->blocksize); for(j=0; j<per && i+j<nci; j++) packclumpinfo(ci++, p+j*ClumpInfoSize); } pageout(); return offset; } void loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena) { char dname[ANameSize]; static char lastbase[ANameSize]; uchar *p; Arena oarena; ArenaHead ohead; /* * Fmtarenas makes all arenas the same size * except the last, which may be smaller. * It uses the same block size for arenas as for * the arena partition blocks. */ arena->size = arenasize; if(offset0+arena->size > partend) arena->size = partend - offset0; head->size = arena->size; arena->blocksize = ap.blocksize; head->blocksize = arena->blocksize; /* * Look for clump magic and name in head/tail blocks. * All the other info we will reconstruct just in case. */ p = pagein(offset0, arena->blocksize); memset(&ohead, 0, sizeof ohead); if(unpackarenahead(&ohead, p) >= 0){ head->version = ohead.version; head->clumpmagic = ohead.clumpmagic; if(okayname(ohead.name, anum)) strcpy(head->name, ohead.name); } p = pagein(offset0+arena->size-arena->blocksize, arena->blocksize); memset(&oarena, 0, sizeof oarena); if(unpackarena(&oarena, p) >= 0){ arena->version = oarena.version; arena->clumpmagic = oarena.clumpmagic; if(okayname(oarena.name, anum)) strcpy(arena->name, oarena.name); arena->diskstats.clumps = oarena.diskstats.clumps; print("old arena: sealed=%d\n", oarena.diskstats.sealed); arena->diskstats.sealed = oarena.diskstats.sealed; } /* Head trumps arena. */ if(head->version){ arena->version = head->version; arena->clumpmagic = head->clumpmagic; } if(arena->version == 0) arena->version = ArenaVersion5; if(basename){ if(anum == -1) snprint(arena->name, ANameSize, "%s", basename); else snprint(arena->name, ANameSize, "%s%d", basename, anum); }else if(lastbase[0]) snprint(arena->name, ANameSize, "%s%d", lastbase, anum); else if(head->name[0]) strcpy(arena->name, head->name); else if(arena->name[0] == 0) sysfatal("cannot determine base name for arena; use -n"); strcpy(lastbase, arena->name); sprint(dname, "%d", anum); lastbase[strlen(lastbase)-strlen(dname)] = 0; /* Was working in arena, now copy to head. */ head->version = arena->version; memmove(head->name, arena->name, sizeof head->name); head->blocksize = arena->blocksize; head->size = arena->size; } void shahead(Shabuf *sb, vlong offset0, ArenaHead *head) { uchar headbuf[MaxDiskBlock]; sb->offset = offset0; memset(headbuf, 0, sizeof headbuf); packarenahead(head, headbuf); sbupdate(sb, headbuf, offset0, head->blocksize); } u32int newclumpmagic(int version) { u32int m; if(version == ArenaVersion4) return _ClumpMagic; do{ m = fastrand(); }while(m==0 || m == _ClumpMagic); return m; } /* * Poke around in the arena to find the clump data * and compute the relevant statistics. */ void guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena, uchar *oldscore, uchar *score) { uchar dbuf[MaxDiskBlock]; int needtozero, clumps, nb1, nb2, minclumps; int inbad, n, ncib, printed, sealing, smart; u32int magic; uchar *sp, *ep, *p; vlong boffset, eoffset, lastclumpend, leaked; vlong offset, toffset, totalcorrupt, v; Clump cl; ClumpInfo *bci, *ci, *eci, *xci; Cit *bcit, *cit, *ecit; Shabuf oldsha, newsha; /* * We expect to find an arena, with data, between offset * and offset+arenasize. With any luck, the data starts at * offset+ap.blocksize. The blocks have variable size and * aren't padded at all, which doesn't give us any alignment * constraints. The blocks are compressed or high entropy, * but the headers are pretty low entropy (except the score): * * type[1] (range 0 thru 9, 13) * size[2] * uncsize[2] (<= size) * * so we can look for these. We check the scores as we go, * so we can't make any wrong turns. If we find ourselves * in a dead end, scan forward looking for a new start. */ resetcibuf(); memset(head, 0, sizeof *head); memset(arena, 0, sizeof *arena); memset(oldscore, 0, VtScoreSize); memset(score, 0, VtScoreSize); memset(&oldsha, 0, sizeof oldsha); memset(&newsha, 0, sizeof newsha); newsha.rollback = 1; if(0){ sbdebug(&oldsha, "old.sha"); sbdebug(&newsha, "new.sha"); } loadarenabasics(offset0, anum, head, arena); /* start the clump hunt */ clumps = 0; totalcorrupt = 0; sealing = 1; boffset = offset0 + arena->blocksize; offset = boffset; eoffset = offset0+arena->size - arena->blocksize; toffset = eoffset; sp = pagein(offset0, 4*M); if(arena->diskstats.sealed){ oldsha.offset = offset0; sbupdate(&oldsha, sp, offset0, 4*M); } ep = sp+4*M; p = sp + (boffset - offset0); ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */ lastclumpend = offset; nbad = 0; inbad = 0; needtozero = 0; minclumps = 0; while(offset < eoffset){ /* * Shift buffer if we're running out of room. */ if(p+70*K >= ep){ /* * Start the post SHA1 buffer. By now we should know the * clumpmagic and arena version, so we can create a * correct head block to get things going. */ if(sealing && fix && newsha.offset == 0){ newsha.offset = offset0; if(arena->clumpmagic == 0){ if(arena->version == 0) arena->version = ArenaVersion5; arena->clumpmagic = newclumpmagic(arena->version); } head->clumpmagic = arena->clumpmagic; shahead(&newsha, offset0, head); } n = 4*M-256*K; if(sealing && fix){ sbdiskhash(&newsha, bufoffset); sbupdate(&newsha, buf, bufoffset, 4*M-256*K); } pagein(bufoffset+n, 4*M); p -= n; if(arena->diskstats.sealed) sbupdate(&oldsha, buf, bufoffset, 4*M); } /* * Check for a clump at p, which is at offset in the disk. * Duplicate clumps happen in corrupted disks * (the same pattern gets written many times in a row) * and should never happen during regular use. */ magic = 0; if((n = isclump(p, &cl, &magic)) > 0){ /* * If we were in the middle of some corrupted data, * flush a warning about it and then add any clump * info blocks as necessary. */ if(inbad){ inbad = 0; v = offset-lastclumpend; if(needtozero){ zerorange(lastclumpend, v); sbrollback(&newsha, lastclumpend); print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n", lastclumpend, v, v); } addcicorrupt(v); totalcorrupt += v; nb1 = (minclumps+ncib-1)/ncib; minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize); nb2 = (minclumps+ncib-1)/ncib; eoffset -= (nb2-nb1)*arena->blocksize; } if(haveclump(cl.info.score)) print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n); /* * If clumps use different magic numbers, we don't care. * We'll just use the first one we find and make the others * follow suit. */ if(arena->clumpmagic == 0){ print("clump type %d size %d score %V magic %x\n", cl.info.type, cl.info.size, cl.info.score, magic); arena->clumpmagic = magic; if(magic == _ClumpMagic) arena->version = ArenaVersion4; else arena->version = ArenaVersion5; } if(magic != arena->clumpmagic) p32(p, arena->clumpmagic); if(clumps == 0) arena->ctime = cl.time; /* * Record the clump, update arena stats, * grow clump info blocks if needed. */ if(verbose > 1) print("\tclump %d: %d %V at %#llux+%#ux (%d)\n", clumps, cl.info.type, cl.info.score, offset, n, n); addcibuf(&cl.info, 0); if(minclumps%ncib == 0) eoffset -= arena->blocksize; minclumps++; clumps++; if(cl.encoding != ClumpENone) arena->diskstats.cclumps++; arena->diskstats.uncsize += cl.info.uncsize; arena->wtime = cl.time; /* * Move to next clump. */ offset += n; p += n; lastclumpend = offset; }else{ /* * Overwrite malformed clump data with zeros later. * For now, just record whether it needs to be overwritten. * Bad regions must be of size at least ClumpSize. * Postponing the overwriting keeps us from writing past * the end of the arena data (which might be directory data) * with zeros. */ if(!inbad){ inbad = 1; needtozero = 0; if(memcmp(p, zero, ClumpSize) != 0) needtozero = 1; p += ClumpSize; offset += ClumpSize; nbad++; }else{ if(*p != 0) needtozero = 1; p++; offset++; } } } pageout(); if(verbose) print("readable clumps: %d; min. directory entries: %d\n", clumps, minclumps); arena->diskstats.used = lastclumpend - boffset; leaked = eoffset - lastclumpend; if(verbose) print("used from %#llux to %#llux = %,lld (%,lld unused)\n", boffset, lastclumpend, arena->diskstats.used, leaked); /* * Finish the SHA1 of the old data. */ if(arena->diskstats.sealed){ sbdiskhash(&oldsha, toffset); readdisk(dbuf, toffset, arena->blocksize); scorecp(dbuf+arena->blocksize-VtScoreSize, zero); sbupdate(&oldsha, dbuf, toffset, arena->blocksize); sbscore(&oldsha, oldscore); } /* * If we still don't know the clump magic, the arena * must be empty. It still needs a value, so make * something up. */ if(arena->version == 0) arena->version = ArenaVersion5; if(arena->clumpmagic == 0){ if(arena->version == ArenaVersion4) arena->clumpmagic = _ClumpMagic; else{ do arena->clumpmagic = fastrand(); while(arena->clumpmagic==_ClumpMagic ||arena->clumpmagic==0); } head->clumpmagic = arena->clumpmagic; } /* * Guess at number of clumpinfo blocks to load. * If we guess high, it's no big deal. If we guess low, * we'll be forced into rewriting the whole directory. * Still not such a big deal. */ if(clumps == 0 || arena->diskstats.used == totalcorrupt) goto Nocib; if(clumps < arena->diskstats.clumps) clumps = arena->diskstats.clumps; if(clumps < ncibuf) clumps = ncibuf; clumps += totalcorrupt/ ((arena->diskstats.used - totalcorrupt)/clumps); clumps += totalcorrupt/2000; if(clumps < minclumps) clumps = minclumps; clumps += ncib-1; clumps -= clumps%ncib; /* * Can't write into the actual data. */ v = offset0 + arena->size - arena->blocksize; v -= (clumps+ncib-1)/ncib * arena->blocksize; if(v < lastclumpend){ v = offset0 + arena->size - arena->blocksize; clumps = (v-lastclumpend)/arena->blocksize * ncib; } if(clumps < minclumps) print("cannot happen?\n"); /* * Check clumpinfo blocks against directory we created. * The tricky part is handling the corrupt sections of arena. * If possible, we remark just the affected directory entries * rather than slide everything down. * * Allocate clumps+1 blocks and check that we don't need * the last one at the end. */ bci = loadci(offset0, arena, clumps+1); eci = bci+clumps+1; bcit = cibuf; ecit = cibuf+ncibuf; smart = 0; /* Somehow the smart code doesn't do corrupt clumps right. */ Again: nbad = 0; ci = bci; for(cit=bcit; cit<ecit && ci<eci; cit++){ if(cit->corrupt){ vlong n, m; if(smart){ /* * If we can, just mark existing entries as corrupt. */ n = cit->corrupt; for(xci=ci; n>0 && xci<eci; xci++) n -= ClumpSize+xci->size; if(n > 0 || xci >= eci) goto Dumb; printed = 0; for(; ci<xci; ci++){ if(verbose && ci->type != VtCorruptType){ if(!printed){ print("marking directory %d-%d as corrupt\n", (int)(ci-bci), (int)(xci-bci)); printed = 1; } print("\ttype=%d size=%d uncsize=%d score=%V\n", ci->type, ci->size, ci->uncsize, ci->score); } ci->type = VtCorruptType; } }else{ Dumb: print("\trewriting clump directory\n"); /* * Otherwise, blaze a new trail. */ n = cit->corrupt; while(n > 0 && ci < eci){ if(n < ClumpSize) sysfatal("bad math in clump corrupt"); if(n <= VtMaxLumpSize+ClumpSize) m = n; else{ m = VtMaxLumpSize+ClumpSize; if(n-m < ClumpSize) m -= ClumpSize; } ci->type = VtCorruptType; ci->size = m-ClumpSize; ci->uncsize = m-ClumpSize; memset(ci->score, 0, VtScoreSize); ci++; n -= m; } } continue; } if(clumpinfocmp(&cit->ci, ci) != 0){ if(verbose && (smart || verbose>1)){ print("clumpinfo %d\n", (int)(ci-bci)); print("\twant: %d %d %d %V\n", cit->ci.type, cit->ci.size, cit->ci.uncsize, cit->ci.score); print("\thave: %d %d %d %V\n", ci->type, ci->size, ci->uncsize, ci->score); } *ci = cit->ci; nbad++; } ci++; } if(ci >= eci || cit < ecit){ print("ran out of space editing existing directory; rewriting\n"); print("# eci %zd ci %zd ecit %zd cit %zd\n", eci-bci, ci-bci, ecit-bcit, cit-bcit); assert(smart); /* can't happen second time thru */ smart = 0; goto Again; } assert(ci <= eci); arena->diskstats.clumps = ci-bci; eoffset = writeci(offset0, arena, bci, ci-bci); if(sealing && fix) sbrollback(&newsha, v); print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal); if(lastclumpend > eoffset) print("arena directory overwrote blocks! cannot happen!\n"); free(bci); if(smart && nbad) print("arena directory has %d bad or missing entries\n", nbad); Nocib: if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){ if(arena->diskstats.sealed) print("unsealing arena\n"); sealing = 0; memset(oldscore, 0, VtScoreSize); } /* * Finish the SHA1 of the new data - only meaningful * if we've been writing to disk (`fix'). */ arena->diskstats.sealed = sealing; arena->memstats = arena->diskstats; if(sealing && fix){ uchar tbuf[MaxDiskBlock]; sbdiskhash(&newsha, toffset); memset(tbuf, 0, sizeof tbuf); packarena(arena, tbuf); sbupdate(&newsha, tbuf, toffset, arena->blocksize); sbscore(&newsha, score); } } void dumparena(vlong offset, int anum, Arena *arena) { char buf[1000]; vlong o, e; int fd, n; snprint(buf, sizeof buf, "%s.%d", dumpbase, anum); if((fd = create(buf, OWRITE, 0666)) < 0){ fprint(2, "create %s: %r\n", buf); return; } e = offset+arena->size; for(o=offset; o<e; o+=n){ n = 4*M; if(o+n > e) n = e-o; if(pwrite(fd, pagein(o, n), n, o-offset) != n){ fprint(2, "write %s at %#llux: %r\n", buf, o-offset); return; } } } void checkarena(vlong offset, int anum) { uchar dbuf[MaxDiskBlock]; uchar *p, oldscore[VtScoreSize], score[VtScoreSize]; Arena arena, oarena; ArenaHead head; Info *fmt, *fmta; int sz; print("# arena %d: offset %#llux\n", anum, offset); if(offset >= partend){ print("arena offset out of bounds\n"); return; } guessarena(offset, anum, &head, &arena, oldscore, score); if(verbose){ print("#\tversion=%d name=%s blocksize=%d size=%Z", head.version, head.name, head.blocksize, head.size); if(head.clumpmagic) print(" clumpmagic=%#.8ux", head.clumpmagic); print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n", arena.diskstats.clumps, arena.diskstats.cclumps, arena.diskstats.used, arena.diskstats.uncsize); print("#\tctime=%t\n", arena.ctime); print("#\twtime=%t\n", arena.wtime); if(arena.diskstats.sealed) print("#\tsealed score=%V\n", score); } if(dumpbase){ dumparena(offset, anum, &arena); return; } memset(dbuf, 0, sizeof dbuf); packarenahead(&head, dbuf); p = pagein(offset, arena.blocksize); if(memcmp(dbuf, p, arena.blocksize) != 0){ print("on-disk arena header incorrect\n"); showdiffs(dbuf, p, arena.blocksize, arena.version==ArenaVersion4 ? headinfo4 : headinfo5); } memmove(p, dbuf, arena.blocksize); memset(dbuf, 0, sizeof dbuf); packarena(&arena, dbuf); if(arena.diskstats.sealed) scorecp(dbuf+arena.blocksize-VtScoreSize, score); p = pagein(offset+arena.size-arena.blocksize, arena.blocksize); memset(&oarena, 0, sizeof oarena); unpackarena(&oarena, p); if(arena.version == ArenaVersion4){ sz = ArenaSize4; fmt = tailinfo4; fmta = tailinfo4a; }else{ sz = ArenaSize5; fmt = tailinfo5; fmta = tailinfo5a; } if(p[sz] == 1){ fmt = fmta; if(oarena.diskstats.sealed){ /* * some arenas were sealed with the extension * before we adopted the convention that if it didn't * add new information it gets dropped. */ _packarena(&arena, dbuf, 1); } } if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){ print("on-disk arena tail incorrect\n"); showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt); } if(arena.diskstats.sealed){ if(oarena.diskstats.sealed) if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){ print("on-disk arena seal score incorrect\n"); print("\tcorrect=%V\n", oldscore); print("\t disk=%V\n", p+arena.blocksize-VtScoreSize); } if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){ print("%ssealing arena%s: %V\n", oarena.diskstats.sealed ? "re" : "", scorecmp(oldscore, score) == 0 ? "" : " after changes", score); } } memmove(p, dbuf, arena.blocksize); pageout(); } AMapN* buildamap(void) { uchar *p; vlong o; ArenaHead h; AMapN *an; AMap *m; an = vtmallocz(sizeof *an); for(o=ap.arenabase; o<partend; o+=arenasize){ p = pagein(o, Block); if(unpackarenahead(&h, p) >= 0){ an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]); m = &an->map[an->n++]; m->start = o; m->stop = o+h.size; strcpy(m->name, h.name); } } return an; } void checkmap(void) { char *s; uchar *p; int i, len; AMapN *an; Fmt fmt; an = buildamap(); fmtstrinit(&fmt); fmtprint(&fmt, "%ud\n", an->n); for(i=0; i<an->n; i++) fmtprint(&fmt, "%s\t%lld\t%lld\n", an->map[i].name, an->map[i].start, an->map[i].stop); s = fmtstrflush(&fmt); len = strlen(s); if(len > ap.tabsize){ print("arena partition map too long: need %Z bytes have %Z\n", (vlong)len, (vlong)ap.tabsize); len = ap.tabsize; } if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */ print("arena partition map *way* too long\n"); return; } p = pagein(ap.tabbase, ap.tabsize); if(memcmp(p, s, len) != 0){ print("arena partition map incorrect; rewriting.\n"); memmove(p, s, len); } pageout(); } int mainstacksize = 512*1024; void threadmain(int argc, char **argv) { int mode; mode = OREAD; readonly = 1; ARGBEGIN{ case 'U': unseal = 1; break; case 'a': arenasize = unittoull(EARGF(usage())); break; case 'b': ap.blocksize = unittoull(EARGF(usage())); break; case 'f': fix = 1; mode = ORDWR; readonly = 0; break; case 'n': basename = EARGF(usage()); break; case 'v': verbose++; break; case 'x': dumpbase = EARGF(usage()); break; default: usage(); }ARGEND if(argc != 1 && argc != 2) usage(); file = argv[0]; ventifmtinstall(); fmtinstall('Z', Zfmt); fmtinstall('t', tfmt); quotefmtinstall(); part = initpart(file, mode|ODIRECT); if(part == nil) sysfatal("can't open %s: %r", file); partend = part->size; if(isonearena()){ checkarena(0, -1); threadexitsall(nil); } checkarenas(argc > 1 ? argv[1] : nil); checkmap(); threadexitsall(nil); }