comparison mercurial/revlog.py @ 40057:324b4b10351e

revlog: rewrite censoring logic

I was able to corrupt a revlog relatively easily with the existing censoring code. The underlying problem is that the existing code doesn't fully take delta chains into account. When copying revisions that occur after the censored revision, the delta base can refer to a censored revision. Then at read time, things blow up due to the revision data not being a compressed delta. This commit rewrites the revlog censoring code to take a higher-level approach. We now create a new revlog instance pointing at temp files. We iterate through each revision in the source revlog and insert those revisions into the new revlog, replacing the censored revision's data along the way. The new implementation isn't as efficient as the old one. This is because it will fully engage delta computation on insertion. But I don't think it matters. The new implementation is a bit hacky because it attempts to reload the revlog instance with a new revlog index/data file. This is fragile. But this is needed because the index (which could be backed by C) would have a cached copy of the old, possibly changed data and that could lead to problems accessing index or revision data later. One benefit of the new approach is that we integrate with the transaction. The old revlog is backed up and if the transaction is rolled back, the original revlog is restored. As part of this, we had to teach the transaction about the store vfs. I'm not super keen about this. But this was the easiest way to hook things up to the transaction. We /could/ just ignore the transaction like we were doing before. But any file mutation should be governed by transaction semantics, including undo during rollback. Differential Revision: https://phab.mercurial-scm.org/D4869
author Gregory Szorc <gregory.szorc@gmail.com>
date Tue, 02 Oct 2018 17:34:34 -0700
parents 0a4625ffd6c0
children adbf8ca239e4
comparison
equal deleted inserted replaced
40056:0a4625ffd6c0 40057:324b4b10351e
2339 addrevisioncb(self, rev, node) 2339 addrevisioncb(self, rev, node)
2340 finally: 2340 finally:
2341 destrevlog._lazydeltabase = oldlazydeltabase 2341 destrevlog._lazydeltabase = oldlazydeltabase
2342 destrevlog._deltabothparents = oldamd 2342 destrevlog._deltabothparents = oldamd
2343 2343
2344 def censorrevision(self, node, tombstone=b''): 2344 def censorrevision(self, tr, censornode, tombstone=b''):
2345 if (self.version & 0xFFFF) == REVLOGV0: 2345 if (self.version & 0xFFFF) == REVLOGV0:
2346 raise error.RevlogError(_('cannot censor with version %d revlogs') % 2346 raise error.RevlogError(_('cannot censor with version %d revlogs') %
2347 self.version) 2347 self.version)
2348 2348
2349 rev = self.rev(node) 2349 censorrev = self.rev(censornode)
2350 tombstone = storageutil.packmeta({b'censored': tombstone}, b'') 2350 tombstone = storageutil.packmeta({b'censored': tombstone}, b'')
2351 2351
2352 if len(tombstone) > self.rawsize(rev): 2352 if len(tombstone) > self.rawsize(censorrev):
2353 raise error.Abort(_('censor tombstone must be no longer than ' 2353 raise error.Abort(_('censor tombstone must be no longer than '
2354 'censored data')) 2354 'censored data'))
2355 2355
2356 # Using two files instead of one makes it easy to rewrite entry-by-entry 2356 # Rewriting the revlog in place is hard. Our strategy for censoring is
2357 idxread = self.opener(self.indexfile, 'r') 2357 # to create a new revlog, copy all revisions to it, then replace the
2358 idxwrite = self.opener(self.indexfile, 'wb', atomictemp=True) 2358 # revlogs on transaction close.
2359 if self.version & FLAG_INLINE_DATA: 2359
2360 dataread, datawrite = idxread, idxwrite 2360 newindexfile = self.indexfile + b'.tmpcensored'
2361 else: 2361 newdatafile = self.datafile + b'.tmpcensored'
2362 dataread = self.opener(self.datafile, 'r') 2362
2363 datawrite = self.opener(self.datafile, 'wb', atomictemp=True) 2363 # This is a bit dangerous. We could easily have a mismatch of state.
2364 2364 newrl = revlog(self.opener, newindexfile, newdatafile,
2365 # Copy all revlog data up to the entry to be censored. 2365 censorable=True)
2366 offset = self.start(rev) 2366 newrl.version = self.version
2367 2367 newrl._generaldelta = self._generaldelta
2368 for chunk in util.filechunkiter(idxread, limit=rev * self._io.size): 2368 newrl._io = self._io
2369 idxwrite.write(chunk) 2369
2370 for chunk in util.filechunkiter(dataread, limit=offset): 2370 for rev in self.revs():
2371 datawrite.write(chunk) 2371 node = self.node(rev)
2372 2372 p1, p2 = self.parents(node)
2373 def rewriteindex(r, newoffs, newdata=None): 2373
2374 """Rewrite the index entry with a new data offset and new data. 2374 if rev == censorrev:
2375 2375 newrl.addrawrevision(tombstone, tr, self.linkrev(censorrev),
2376 The newdata argument, if given, is a tuple of three positive 2376 p1, p2, censornode, REVIDX_ISCENSORED)
2377 integers: (new compressed, new uncompressed, added flag bits). 2377
2378 """ 2378 if newrl.deltaparent(rev) != nullrev:
2379 offlags, comp, uncomp, base, link, p1, p2, nodeid = self.index[r] 2379 raise error.Abort(_('censored revision stored as delta; '
2380 flags = gettype(offlags) 2380 'cannot censor'),
2381 if newdata: 2381 hint=_('censoring of revlogs is not '
2382 comp, uncomp, nflags = newdata 2382 'fully implemented; please report '
2383 flags |= nflags 2383 'this bug'))
2384 offlags = offset_type(newoffs, flags) 2384 continue
2385 e = (offlags, comp, uncomp, r, link, p1, p2, nodeid) 2385
2386 idxwrite.write(self._io.packentry(e, None, self.version, r)) 2386 if self.iscensored(rev):
2387 idxread.seek(self._io.size, 1) 2387 if self.deltaparent(rev) != nullrev:
2388 2388 raise error.Abort(_('cannot censor due to censored '
2389 def rewrite(r, offs, data, nflags=REVIDX_DEFAULT_FLAGS): 2389 'revision having delta stored'))
2390 """Write the given fulltext with the given data offset. 2390 rawtext = self._chunk(rev)
2391
2392 Returns:
2393 The integer number of data bytes written, for tracking data
2394 offsets.
2395 """
2396 flag, compdata = self.compress(data)
2397 newcomp = len(flag) + len(compdata)
2398 rewriteindex(r, offs, (newcomp, len(data), nflags))
2399 datawrite.write(flag)
2400 datawrite.write(compdata)
2401 dataread.seek(self.length(r), 1)
2402 return newcomp
2403
2404 # Rewrite censored entry with (padded) tombstone data.
2405 pad = ' ' * (self.rawsize(rev) - len(tombstone))
2406 offset += rewrite(rev, offset, tombstone + pad, REVIDX_ISCENSORED)
2407
2408 # Rewrite all following filelog revisions fixing up offsets and deltas.
2409 for srev in pycompat.xrange(rev + 1, len(self)):
2410 if rev in self.parentrevs(srev):
2411 # Immediate children of censored node must be re-added as
2412 # fulltext.
2413 try:
2414 revdata = self.revision(srev)
2415 except error.CensoredNodeError as e:
2416 revdata = e.tombstone
2417 dlen = rewrite(srev, offset, revdata)
2418 else: 2391 else:
2419 # Copy any other revision data verbatim after fixing up the 2392 rawtext = self.revision(rev, raw=True)
2420 # offset. 2393
2421 rewriteindex(srev, offset) 2394 newrl.addrawrevision(rawtext, tr, self.linkrev(rev), p1, p2, node,
2422 dlen = self.length(srev) 2395 self.flags(rev))
2423 for chunk in util.filechunkiter(dataread, limit=dlen): 2396
2424 datawrite.write(chunk) 2397 tr.addbackup(self.indexfile, location='store')
2425 offset += dlen 2398 if not self._inline:
2426 2399 tr.addbackup(self.datafile, location='store')
2427 idxread.close() 2400
2428 idxwrite.close() 2401 self.opener.rename(newrl.indexfile, self.indexfile)
2429 if dataread is not idxread: 2402 if not self._inline:
2430 dataread.close() 2403 self.opener.rename(newrl.datafile, self.datafile)
2431 datawrite.close() 2404
2405 self.clearcaches()
2406 self._loadindex(self.version, None)
2432 2407
2433 def verifyintegrity(self, state): 2408 def verifyintegrity(self, state):
2434 """Verifies the integrity of the revlog. 2409 """Verifies the integrity of the revlog.
2435 2410
2436 Yields ``revlogproblem`` instances describing problems that are 2411 Yields ``revlogproblem`` instances describing problems that are