comparison mercurial/revlog.py @ 30745:c1b7b2285522

revlog: flag processor. Add the ability for revlog objects to process revision flags and apply registered transforms on read/write operations. This patch introduces: - the 'revlog._processflags()' method, which looks at revision flags and applies the flag processors registered on them. Because some operations are non-commutative, flag transforms are applied in a stable order, but that order is reversed between read and write operations. - the 'addflagprocessor()' function, which allows registering processors on flags. A flag processor is defined as a 3-tuple of (read, write, raw) functions, applied depending on the operation being performed. - an update to the behavior of 'revlog.addrevision()'. The current flagprocessor design relies on extensions wrapping 'addrevision()' to set flags on revision data, and on the flagprocessor to perform the actual transformation of its contents. In the lfs case, this means we need to process flags before we reach the 2GB size check, so some operations are performed before that check: - if flags are set on the revision data, we assume some extension might modify the contents through the flag processor next, and we compute the node from the original revision data (still allowing extensions to override the node by wrapping 'addrevision()'). - we then invoke the flag processor to apply the registered transforms (in lfs's case, drastically reducing the size of large blobs). - finally, we proceed with the 2GB size check. Note: if a cachedelta is passed to 'addrevision()' and we detect that the flag processor modified the revision data, we choose to trust the flag processor and drop the cachedelta.
author Remi Chaintron <remi@fb.com>
date Tue, 10 Jan 2017 16:15:21 +0000
parents e12c0fa1f65b
children 9cb0bb0f29f0
comparison
equal deleted inserted replaced
30744:e12c0fa1f65b 30745:c1b7b2285522
53 REVLOGNG_FLAGS = REVLOGNGINLINEDATA | REVLOGGENERALDELTA 53 REVLOGNG_FLAGS = REVLOGNGINLINEDATA | REVLOGGENERALDELTA
54 54
55 # revlog index flags 55 # revlog index flags
56 REVIDX_ISCENSORED = (1 << 15) # revision has censor metadata, must be verified 56 REVIDX_ISCENSORED = (1 << 15) # revision has censor metadata, must be verified
57 REVIDX_DEFAULT_FLAGS = 0 57 REVIDX_DEFAULT_FLAGS = 0
58 REVIDX_KNOWN_FLAGS = REVIDX_ISCENSORED 58 # stable order in which flags need to be processed and their processors applied
59 REVIDX_FLAGS_ORDER = [
60 REVIDX_ISCENSORED,
61 ]
62 REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
59 63
60 # max size of revlog with inline data 64 # max size of revlog with inline data
61 _maxinline = 131072 65 _maxinline = 131072
62 _chunksize = 1048576 66 _chunksize = 1048576
63 67
64 RevlogError = error.RevlogError 68 RevlogError = error.RevlogError
65 LookupError = error.LookupError 69 LookupError = error.LookupError
66 CensoredNodeError = error.CensoredNodeError 70 CensoredNodeError = error.CensoredNodeError
71 ProgrammingError = error.ProgrammingError
72
73 # Store flag processors (cf. 'addflagprocessor()' to register)
74 _flagprocessors = {
75 REVIDX_ISCENSORED: None,
76 }
77
def addflagprocessor(flag, processor):
    """Attach ``processor`` to the revision data flag ``flag``.

    ``flag`` - a single revision flag; it must overlap REVIDX_KNOWN_FLAGS
               and be listed in REVIDX_FLAGS_ORDER
    ``processor`` - expected to be a 3-tuple of callables
                    ``(read, write, raw)``:
                    - read:  f(self, text) -> (newtext, bool)
                    - write: f(self, text) -> (newtext, bool)
                    - raw:   f(self, text) -> bool
                    The bool tells the caller whether the text may be used
                    for hash integrity checking. The shape of the tuple is
                    not verified here.

    The raw transform never rewrites the text; it only reports whether the
    contents are usable for hash checks (used for changegroup generation
    and some debug commands).

    Raises ProgrammingError when the flag is unknown or missing from
    REVIDX_FLAGS_ORDER, and error.Abort when the flag already has an entry
    in the processor registry. Any existing key counts, including one whose
    value is None, so only one processor can ever be tied to a flag.
    """
    # The flag must share at least one bit with the known-flags mask.
    if not (flag & REVIDX_KNOWN_FLAGS):
        raise ProgrammingError(
            _("cannot register processor on unknown flag '%#x'.") % (flag))

    # The flag needs a position in the stable processing order.
    if flag not in REVIDX_FLAGS_ORDER:
        raise ProgrammingError(
            _("flag '%#x' undefined in REVIDX_FLAGS_ORDER.") % (flag))

    # Refuse to replace an entry that is already present.
    if flag in _flagprocessors:
        raise error.Abort(
            _("cannot register multiple processors on flag '%#x'.") % (flag))

    _flagprocessors[flag] = processor
67 106
68 def getoffset(q): 107 def getoffset(q):
69 return int(q >> 16) 108 return int(q >> 16)
70 109
71 def gettype(q): 110 def gettype(q):
1229 # look up what we need to read 1268 # look up what we need to read
1230 text = None 1269 text = None
1231 if rev is None: 1270 if rev is None:
1232 rev = self.rev(node) 1271 rev = self.rev(node)
1233 1272
1234 # check rev flags
1235 if self.flags(rev) & ~REVIDX_KNOWN_FLAGS:
1236 raise RevlogError(_('incompatible revision flag %x') %
1237 (self.flags(rev) & ~REVIDX_KNOWN_FLAGS))
1238
1239 chain, stopped = self._deltachain(rev, stoprev=cachedrev) 1273 chain, stopped = self._deltachain(rev, stoprev=cachedrev)
1240 if stopped: 1274 if stopped:
1241 text = self._cache[2] 1275 text = self._cache[2]
1242 1276
1243 # drop cache to save memory 1277 # drop cache to save memory
1247 if text is None: 1281 if text is None:
1248 text = str(bins[0]) 1282 text = str(bins[0])
1249 bins = bins[1:] 1283 bins = bins[1:]
1250 1284
1251 text = mdiff.patches(text, bins) 1285 text = mdiff.patches(text, bins)
1252 self.checkhash(text, node, rev=rev) 1286
1287 text, validatehash = self._processflags(text, self.flags(rev), 'read',
1288 raw=raw)
1289 if validatehash:
1290 self.checkhash(text, node, rev=rev)
1291
1253 self._cache = (node, rev, text) 1292 self._cache = (node, rev, text)
1254 return text 1293 return text
1255 1294
1256 def hash(self, text, p1, p2): 1295 def hash(self, text, p1, p2):
1257 """Compute a node hash. 1296 """Compute a node hash.
1258 1297
1259 Available as a function so that subclasses can replace the hash 1298 Available as a function so that subclasses can replace the hash
1260 as needed. 1299 as needed.
1261 """ 1300 """
1262 return hash(text, p1, p2) 1301 return hash(text, p1, p2)
1302
def _processflags(self, text, flags, operation, raw=False):
    """Run the registered flag processors over revision data.

    ``text`` - the revision data to transform
    ``flags`` - bitmask of the revision flags set on that data
    ``operation`` - 'read' or 'write'
    ``raw`` - when true, each processor's raw transform is called instead
              of its read/write transform; it only contributes to
              ``validatehash`` and leaves ``text`` untouched

    Flags are visited in REVIDX_FLAGS_ORDER, reversed when ``operation``
    is 'write', so that non-commutative transforms are undone in the
    opposite order to the one in which they were applied. Only flags
    present in ``flags`` are considered. A flag registered with a None
    processor is accepted and changes nothing.

    Returns ``(text, validatehash)``: the processed text, and whether
    that text should be checked for hash integrity.

    Raises ProgrammingError for an ``operation`` other than 'read' or
    'write', and RevlogError when ``flags`` has bits outside
    REVIDX_KNOWN_FLAGS or a set flag has no registry entry at all.
    """
    if operation not in ('read', 'write'):
        raise ProgrammingError(_("invalid '%s' operation ") % (operation))

    # Reject any bit that is not a known flag.
    unknown = flags & ~REVIDX_KNOWN_FLAGS
    if unknown:
        raise RevlogError(_("incompatible revision flag '%#x'") % unknown)

    # Writes walk the stable order backwards (non-commutative transforms).
    if operation == 'write':
        flagorder = reversed(REVIDX_FLAGS_ORDER)
    else:
        flagorder = REVIDX_FLAGS_ORDER

    validatehash = True
    for flag in flagorder:
        if not (flag & flags):
            continue

        if flag not in _flagprocessors:
            raise RevlogError(_("missing processor for flag '%#x'") % (flag))

        # A None entry keeps the default verdict: hash may be validated.
        flagok = True
        handlers = _flagprocessors[flag]
        if handlers is not None:
            onread, onwrite, onraw = handlers
            if raw:
                # raw only reports hash usability; text is not modified
                flagok = onraw(self, text)
            elif operation == 'read':
                text, flagok = onread(self, text)
            else:
                text, flagok = onwrite(self, text)
        validatehash = validatehash and flagok

    return text, validatehash
1263 1361
1264 def checkhash(self, text, node, p1=None, p2=None, rev=None): 1362 def checkhash(self, text, node, p1=None, p2=None, rev=None):
1265 """Check node hash integrity. 1363 """Check node hash integrity.
1266 1364
1267 Available as a function so that subclasses can extend hash mismatch 1365 Available as a function so that subclasses can extend hash mismatch
1343 """ 1441 """
1344 if link == nullrev: 1442 if link == nullrev:
1345 raise RevlogError(_("attempted to add linkrev -1 to %s") 1443 raise RevlogError(_("attempted to add linkrev -1 to %s")
1346 % self.indexfile) 1444 % self.indexfile)
1347 1445
1446 if flags:
1447 node = node or self.hash(text, p1, p2)
1448
1449 newtext, validatehash = self._processflags(text, flags, 'write')
1450
1451 # If the flag processor modifies the revision data, ignore any provided
1452 # cachedelta.
1453 if newtext != text:
1454 cachedelta = None
1455 text = newtext
1456
1348 if len(text) > _maxentrysize: 1457 if len(text) > _maxentrysize:
1349 raise RevlogError( 1458 raise RevlogError(
1350 _("%s: size of %d bytes exceeds maximum revlog storage of 2GiB") 1459 _("%s: size of %d bytes exceeds maximum revlog storage of 2GiB")
1351 % (self.indexfile, len(text))) 1460 % (self.indexfile, len(text)))
1352 1461
1353 node = node or self.hash(text, p1, p2) 1462 node = node or self.hash(text, p1, p2)
1354 if node in self.nodemap: 1463 if node in self.nodemap:
1355 return node 1464 return node
1465
1466 if validatehash:
1467 self.checkhash(text, node, p1=p1, p2=p2)
1356 1468
1357 dfh = None 1469 dfh = None
1358 if not self._inline: 1470 if not self._inline:
1359 dfh = self.opener(self.datafile, "a+") 1471 dfh = self.opener(self.datafile, "a+")
1360 ifh = self.opener(self.indexfile, "a+", checkambig=self._checkambig) 1472 ifh = self.opener(self.indexfile, "a+", checkambig=self._checkambig)
1446 fh = dfh 1558 fh = dfh
1447 basetext = self.revision(self.node(baserev), _df=fh, raw=raw) 1559 basetext = self.revision(self.node(baserev), _df=fh, raw=raw)
1448 btext[0] = mdiff.patch(basetext, delta) 1560 btext[0] = mdiff.patch(basetext, delta)
1449 1561
1450 try: 1562 try:
1451 self.checkhash(btext[0], node, p1=p1, p2=p2) 1563 res = self._processflags(btext[0], flags, 'read', raw=raw)
1564 btext[0], validatehash = res
1565 if validatehash:
1566 self.checkhash(btext[0], node, p1=p1, p2=p2)
1452 if flags & REVIDX_ISCENSORED: 1567 if flags & REVIDX_ISCENSORED:
1453 raise RevlogError(_('node %s is not censored') % node) 1568 raise RevlogError(_('node %s is not censored') % node)
1454 except CensoredNodeError: 1569 except CensoredNodeError:
1455 # must pass the censored index flag to add censored revisions 1570 # must pass the censored index flag to add censored revisions
1456 if not flags & REVIDX_ISCENSORED: 1571 if not flags & REVIDX_ISCENSORED: