Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/revlog.py @ 40057:324b4b10351e
revlog: rewrite censoring logic
I was able to corrupt a revlog relatively easily with the existing
censoring code. The underlying problem is that the existing code
doesn't fully take delta chains into account. When copying revisions
that occur after the censored revision, the delta base can refer
to a censored revision. Then at read time, things blow up due to the
revision data not being a compressed delta.
This commit rewrites the revlog censoring code to take a higher-level
approach. We now create a new revlog instance pointing at temp files.
We iterate through each revision in the source revlog and insert
those revisions into the new revlog, replacing the censored revision's
data along the way.
The new implementation isn't as efficient as the old one. This is
because every revision is re-inserted into the new revlog, which fully
engages delta computation on insertion. But I don't think it matters.
The new implementation is a bit hacky because it attempts to reload
the revlog instance with a new revlog index/data file. This is fragile.
But this is needed because the index (which could be backed by C) would
otherwise hold a cached copy of the old data, which may since have changed,
and that could lead to problems accessing index or revision data later.
One benefit of the new approach is that we integrate with the
transaction. The old revlog is backed up and if the transaction is
rolled back, the original revlog is restored.
As part of this, we had to teach the transaction about the store
vfs. I'm not super keen about this. But this was the easiest way
to hook things up to the transaction. We /could/ just ignore the
transaction like we were doing before. But any file mutation should
be governed by transaction semantics, including undo during rollback.
Differential Revision: https://phab.mercurial-scm.org/D4869
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Tue, 02 Oct 2018 17:34:34 -0700 |
parents | 0a4625ffd6c0 |
children | adbf8ca239e4 |
comparison
equal
deleted
inserted
replaced
40056:0a4625ffd6c0 | 40057:324b4b10351e |
---|---|
2339 addrevisioncb(self, rev, node) | 2339 addrevisioncb(self, rev, node) |
2340 finally: | 2340 finally: |
2341 destrevlog._lazydeltabase = oldlazydeltabase | 2341 destrevlog._lazydeltabase = oldlazydeltabase |
2342 destrevlog._deltabothparents = oldamd | 2342 destrevlog._deltabothparents = oldamd |
2343 | 2343 |
2344 def censorrevision(self, node, tombstone=b''): | 2344 def censorrevision(self, tr, censornode, tombstone=b''): |
2345 if (self.version & 0xFFFF) == REVLOGV0: | 2345 if (self.version & 0xFFFF) == REVLOGV0: |
2346 raise error.RevlogError(_('cannot censor with version %d revlogs') % | 2346 raise error.RevlogError(_('cannot censor with version %d revlogs') % |
2347 self.version) | 2347 self.version) |
2348 | 2348 |
2349 rev = self.rev(node) | 2349 censorrev = self.rev(censornode) |
2350 tombstone = storageutil.packmeta({b'censored': tombstone}, b'') | 2350 tombstone = storageutil.packmeta({b'censored': tombstone}, b'') |
2351 | 2351 |
2352 if len(tombstone) > self.rawsize(rev): | 2352 if len(tombstone) > self.rawsize(censorrev): |
2353 raise error.Abort(_('censor tombstone must be no longer than ' | 2353 raise error.Abort(_('censor tombstone must be no longer than ' |
2354 'censored data')) | 2354 'censored data')) |
2355 | 2355 |
2356 # Using two files instead of one makes it easy to rewrite entry-by-entry | 2356 # Rewriting the revlog in place is hard. Our strategy for censoring is |
2357 idxread = self.opener(self.indexfile, 'r') | 2357 # to create a new revlog, copy all revisions to it, then replace the |
2358 idxwrite = self.opener(self.indexfile, 'wb', atomictemp=True) | 2358 # revlogs on transaction close. |
2359 if self.version & FLAG_INLINE_DATA: | 2359 |
2360 dataread, datawrite = idxread, idxwrite | 2360 newindexfile = self.indexfile + b'.tmpcensored' |
2361 else: | 2361 newdatafile = self.datafile + b'.tmpcensored' |
2362 dataread = self.opener(self.datafile, 'r') | 2362 |
2363 datawrite = self.opener(self.datafile, 'wb', atomictemp=True) | 2363 # This is a bit dangerous. We could easily have a mismatch of state. |
2364 | 2364 newrl = revlog(self.opener, newindexfile, newdatafile, |
2365 # Copy all revlog data up to the entry to be censored. | 2365 censorable=True) |
2366 offset = self.start(rev) | 2366 newrl.version = self.version |
2367 | 2367 newrl._generaldelta = self._generaldelta |
2368 for chunk in util.filechunkiter(idxread, limit=rev * self._io.size): | 2368 newrl._io = self._io |
2369 idxwrite.write(chunk) | 2369 |
2370 for chunk in util.filechunkiter(dataread, limit=offset): | 2370 for rev in self.revs(): |
2371 datawrite.write(chunk) | 2371 node = self.node(rev) |
2372 | 2372 p1, p2 = self.parents(node) |
2373 def rewriteindex(r, newoffs, newdata=None): | 2373 |
2374 """Rewrite the index entry with a new data offset and new data. | 2374 if rev == censorrev: |
2375 | 2375 newrl.addrawrevision(tombstone, tr, self.linkrev(censorrev), |
2376 The newdata argument, if given, is a tuple of three positive | 2376 p1, p2, censornode, REVIDX_ISCENSORED) |
2377 integers: (new compressed, new uncompressed, added flag bits). | 2377 |
2378 """ | 2378 if newrl.deltaparent(rev) != nullrev: |
2379 offlags, comp, uncomp, base, link, p1, p2, nodeid = self.index[r] | 2379 raise error.Abort(_('censored revision stored as delta; ' |
2380 flags = gettype(offlags) | 2380 'cannot censor'), |
2381 if newdata: | 2381 hint=_('censoring of revlogs is not ' |
2382 comp, uncomp, nflags = newdata | 2382 'fully implemented; please report ' |
2383 flags |= nflags | 2383 'this bug')) |
2384 offlags = offset_type(newoffs, flags) | 2384 continue |
2385 e = (offlags, comp, uncomp, r, link, p1, p2, nodeid) | 2385 |
2386 idxwrite.write(self._io.packentry(e, None, self.version, r)) | 2386 if self.iscensored(rev): |
2387 idxread.seek(self._io.size, 1) | 2387 if self.deltaparent(rev) != nullrev: |
2388 | 2388 raise error.Abort(_('cannot censor due to censored ' |
2389 def rewrite(r, offs, data, nflags=REVIDX_DEFAULT_FLAGS): | 2389 'revision having delta stored')) |
2390 """Write the given fulltext with the given data offset. | 2390 rawtext = self._chunk(rev) |
2391 | |
2392 Returns: | |
2393 The integer number of data bytes written, for tracking data | |
2394 offsets. | |
2395 """ | |
2396 flag, compdata = self.compress(data) | |
2397 newcomp = len(flag) + len(compdata) | |
2398 rewriteindex(r, offs, (newcomp, len(data), nflags)) | |
2399 datawrite.write(flag) | |
2400 datawrite.write(compdata) | |
2401 dataread.seek(self.length(r), 1) | |
2402 return newcomp | |
2403 | |
2404 # Rewrite censored entry with (padded) tombstone data. | |
2405 pad = ' ' * (self.rawsize(rev) - len(tombstone)) | |
2406 offset += rewrite(rev, offset, tombstone + pad, REVIDX_ISCENSORED) | |
2407 | |
2408 # Rewrite all following filelog revisions fixing up offsets and deltas. | |
2409 for srev in pycompat.xrange(rev + 1, len(self)): | |
2410 if rev in self.parentrevs(srev): | |
2411 # Immediate children of censored node must be re-added as | |
2412 # fulltext. | |
2413 try: | |
2414 revdata = self.revision(srev) | |
2415 except error.CensoredNodeError as e: | |
2416 revdata = e.tombstone | |
2417 dlen = rewrite(srev, offset, revdata) | |
2418 else: | 2391 else: |
2419 # Copy any other revision data verbatim after fixing up the | 2392 rawtext = self.revision(rev, raw=True) |
2420 # offset. | 2393 |
2421 rewriteindex(srev, offset) | 2394 newrl.addrawrevision(rawtext, tr, self.linkrev(rev), p1, p2, node, |
2422 dlen = self.length(srev) | 2395 self.flags(rev)) |
2423 for chunk in util.filechunkiter(dataread, limit=dlen): | 2396 |
2424 datawrite.write(chunk) | 2397 tr.addbackup(self.indexfile, location='store') |
2425 offset += dlen | 2398 if not self._inline: |
2426 | 2399 tr.addbackup(self.datafile, location='store') |
2427 idxread.close() | 2400 |
2428 idxwrite.close() | 2401 self.opener.rename(newrl.indexfile, self.indexfile) |
2429 if dataread is not idxread: | 2402 if not self._inline: |
2430 dataread.close() | 2403 self.opener.rename(newrl.datafile, self.datafile) |
2431 datawrite.close() | 2404 |
2405 self.clearcaches() | |
2406 self._loadindex(self.version, None) | |
2432 | 2407 |
2433 def verifyintegrity(self, state): | 2408 def verifyintegrity(self, state): |
2434 """Verifies the integrity of the revlog. | 2409 """Verifies the integrity of the revlog. |
2435 | 2410 |
2436 Yields ``revlogproblem`` instances describing problems that are | 2411 Yields ``revlogproblem`` instances describing problems that are |