Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/revlog.py @ 51095:a82704902db8
revlog: move the compression/decompression logic onto the inner object
This is a necessary step before we can move more of the logic for restoring
a revision's content there.
For now, we apply a simple patch to the perf extension logic. When the
implementation of the inner object changes, the API will likely need to
evolve. However, this is true of many things in the perf extension, so we
will deal with it later.
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Wed, 25 Oct 2023 02:13:18 +0200 |
parents | de6a8cc24de3 |
children | 9c8df10ea6e0 |
comparison
equal
deleted
inserted
replaced
51094:de6a8cc24de3 | 51095:a82704902db8 |
---|---|
351 index_file, | 351 index_file, |
352 data_file, | 352 data_file, |
353 sidedata_file, | 353 sidedata_file, |
354 inline, | 354 inline, |
355 data_config, | 355 data_config, |
356 feature_config, | |
356 chunk_cache, | 357 chunk_cache, |
358 default_compression_header, | |
357 ): | 359 ): |
358 self.opener = opener | 360 self.opener = opener |
359 self.index = index | 361 self.index = index |
360 | 362 |
361 self.__index_file = index_file | 363 self.__index_file = index_file |
362 self.data_file = data_file | 364 self.data_file = data_file |
363 self.sidedata_file = sidedata_file | 365 self.sidedata_file = sidedata_file |
364 self.inline = inline | 366 self.inline = inline |
365 self.data_config = data_config | 367 self.data_config = data_config |
368 self.feature_config = feature_config | |
369 | |
370 self._default_compression_header = default_compression_header | |
366 | 371 |
367 # index | 372 # index |
368 | 373 |
369 # 3-tuple of file handles being used for active writing. | 374 # 3-tuple of file handles being used for active writing. |
370 self._writinghandles = None | 375 self._writinghandles = None |
379 self.opener, | 384 self.opener, |
380 self.sidedata_file, | 385 self.sidedata_file, |
381 self.data_config.chunk_cache_size, | 386 self.data_config.chunk_cache_size, |
382 ) | 387 ) |
383 | 388 |
389 # revlog header -> revlog compressor | |
390 self._decompressors = {} | |
391 | |
384 @property | 392 @property |
385 def index_file(self): | 393 def index_file(self): |
386 return self.__index_file | 394 return self.__index_file |
387 | 395 |
388 @index_file.setter | 396 @index_file.setter |
402 return self.index[rev][1] | 410 return self.index[rev][1] |
403 | 411 |
404 def end(self, rev): | 412 def end(self, rev): |
405 """the end of the data chunk for this revision""" | 413 """the end of the data chunk for this revision""" |
406 return self.start(rev) + self.length(rev) | 414 return self.start(rev) + self.length(rev) |
415 | |
416 @util.propertycache | |
417 def _compressor(self): | |
418 engine = util.compengines[self.feature_config.compression_engine] | |
419 return engine.revlogcompressor( | |
420 self.feature_config.compression_engine_options | |
421 ) | |
422 | |
423 @util.propertycache | |
424 def _decompressor(self): | |
425 """the default decompressor""" | |
426 if self._default_compression_header is None: | |
427 return None | |
428 t = self._default_compression_header | |
429 c = self._get_decompressor(t) | |
430 return c.decompress | |
431 | |
432 def _get_decompressor(self, t): | |
433 try: | |
434 compressor = self._decompressors[t] | |
435 except KeyError: | |
436 try: | |
437 engine = util.compengines.forrevlogheader(t) | |
438 compressor = engine.revlogcompressor( | |
439 self.feature_config.compression_engine_options | |
440 ) | |
441 self._decompressors[t] = compressor | |
442 except KeyError: | |
443 raise error.RevlogError( | |
444 _(b'unknown compression type %s') % binascii.hexlify(t) | |
445 ) | |
446 return compressor | |
447 | |
448 def compress(self, data): | |
449 """Generate a possibly-compressed representation of data.""" | |
450 if not data: | |
451 return b'', data | |
452 | |
453 compressed = self._compressor.compress(data) | |
454 | |
455 if compressed: | |
456 # The revlog compressor added the header in the returned data. | |
457 return b'', compressed | |
458 | |
459 if data[0:1] == b'\0': | |
460 return b'', data | |
461 return b'u', data | |
462 | |
463 def decompress(self, data): | |
464 """Decompress a revlog chunk. | |
465 | |
466 The chunk is expected to begin with a header identifying the | |
467 format type so it can be routed to an appropriate decompressor. | |
468 """ | |
469 if not data: | |
470 return data | |
471 | |
472 # Revlogs are read much more frequently than they are written and many | |
473 # chunks only take microseconds to decompress, so performance is | |
474 # important here. | |
475 # | |
476 # We can make a few assumptions about revlogs: | |
477 # | |
478 # 1) the majority of chunks will be compressed (as opposed to inline | |
479 # raw data). | |
480 # 2) decompressing *any* data will likely be at least 10x slower than | |
481 # returning raw inline data. | |
482 # 3) we want to prioritize common and officially supported compression | |
483 # engines | |
484 # | |
485 # It follows that we want to optimize for "decompress compressed data | |
486 # when encoded with common and officially supported compression engines" | |
487 # case over "raw data" and "data encoded by less common or non-official | |
488 # compression engines." That is why we have the inline lookup first | |
489 # followed by the compengines lookup. | |
490 # | |
491 # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib | |
492 # compressed chunks. And this matters for changelog and manifest reads. | |
493 t = data[0:1] | |
494 | |
495 if t == b'x': | |
496 try: | |
497 return _zlibdecompress(data) | |
498 except zlib.error as e: | |
499 raise error.RevlogError( | |
500 _(b'revlog decompress error: %s') | |
501 % stringutil.forcebytestr(e) | |
502 ) | |
503 # '\0' is more common than 'u' so it goes first. | |
504 elif t == b'\0': | |
505 return data | |
506 elif t == b'u': | |
507 return util.buffer(data, 1) | |
508 | |
509 compressor = self._get_decompressor(t) | |
510 | |
511 return compressor.decompress(data) | |
407 | 512 |
408 @contextlib.contextmanager | 513 @contextlib.contextmanager |
409 def reading(self): | 514 def reading(self): |
410 """Context manager that keeps data and sidedata files open for reading""" | 515 """Context manager that keeps data and sidedata files open for reading""" |
411 if len(self.index) == 0: | 516 if len(self.index) == 0: |
1282 _(b"index %s is corrupted") % self.display_id | 1387 _(b"index %s is corrupted") % self.display_id |
1283 ) | 1388 ) |
1284 self.index = index | 1389 self.index = index |
1285 # revnum -> (chain-length, sum-delta-length) | 1390 # revnum -> (chain-length, sum-delta-length) |
1286 self._chaininfocache = util.lrucachedict(500) | 1391 self._chaininfocache = util.lrucachedict(500) |
1287 # revlog header -> revlog compressor | |
1288 self._decompressors = {} | |
1289 | 1392 |
1290 return chunkcache | 1393 return chunkcache |
1291 | 1394 |
1292 def _load_inner(self, chunk_cache): | 1395 def _load_inner(self, chunk_cache): |
1396 if self._docket is None: | |
1397 default_compression_header = None | |
1398 else: | |
1399 default_compression_header = self._docket.default_compression_header | |
1400 | |
1293 self._inner = _InnerRevlog( | 1401 self._inner = _InnerRevlog( |
1294 opener=self.opener, | 1402 opener=self.opener, |
1295 index=self.index, | 1403 index=self.index, |
1296 index_file=self._indexfile, | 1404 index_file=self._indexfile, |
1297 data_file=self._datafile, | 1405 data_file=self._datafile, |
1298 sidedata_file=self._sidedatafile, | 1406 sidedata_file=self._sidedatafile, |
1299 inline=self._inline, | 1407 inline=self._inline, |
1300 data_config=self.data_config, | 1408 data_config=self.data_config, |
1409 feature_config=self.feature_config, | |
1301 chunk_cache=chunk_cache, | 1410 chunk_cache=chunk_cache, |
1411 default_compression_header=default_compression_header, | |
1302 ) | 1412 ) |
1303 | 1413 |
1304 def get_revlog(self): | 1414 def get_revlog(self): |
1305 """simple function to mirror API of other not-really-revlog API""" | 1415 """simple function to mirror API of other not-really-revlog API""" |
1306 return self | 1416 return self |
1316 # Reference the file without the "data/" prefix, so it is familiar | 1426 # Reference the file without the "data/" prefix, so it is familiar |
1317 # to the user. | 1427 # to the user. |
1318 return self.target[1] | 1428 return self.target[1] |
1319 else: | 1429 else: |
1320 return self.radix | 1430 return self.radix |
1321 | |
1322 def _get_decompressor(self, t): | |
1323 try: | |
1324 compressor = self._decompressors[t] | |
1325 except KeyError: | |
1326 try: | |
1327 engine = util.compengines.forrevlogheader(t) | |
1328 compressor = engine.revlogcompressor( | |
1329 self.feature_config.compression_engine_options | |
1330 ) | |
1331 self._decompressors[t] = compressor | |
1332 except KeyError: | |
1333 raise error.RevlogError( | |
1334 _(b'unknown compression type %s') % binascii.hexlify(t) | |
1335 ) | |
1336 return compressor | |
1337 | |
1338 @util.propertycache | |
1339 def _compressor(self): | |
1340 engine = util.compengines[self.feature_config.compression_engine] | |
1341 return engine.revlogcompressor( | |
1342 self.feature_config.compression_engine_options | |
1343 ) | |
1344 | |
1345 @util.propertycache | |
1346 def _decompressor(self): | |
1347 """the default decompressor""" | |
1348 if self._docket is None: | |
1349 return None | |
1350 t = self._docket.default_compression_header | |
1351 c = self._get_decompressor(t) | |
1352 return c.decompress | |
1353 | 1431 |
1354 def _datafp(self, mode=b'r'): | 1432 def _datafp(self, mode=b'r'): |
1355 """file object for the revlog's data file""" | 1433 """file object for the revlog's data file""" |
1356 return self.opener(self._datafile, mode=mode) | 1434 return self.opener(self._datafile, mode=mode) |
1357 | 1435 |
2270 compression_mode = self.index[rev][10] | 2348 compression_mode = self.index[rev][10] |
2271 data = self._inner.get_segment_for_revs(rev, rev)[1] | 2349 data = self._inner.get_segment_for_revs(rev, rev)[1] |
2272 if compression_mode == COMP_MODE_PLAIN: | 2350 if compression_mode == COMP_MODE_PLAIN: |
2273 return data | 2351 return data |
2274 elif compression_mode == COMP_MODE_DEFAULT: | 2352 elif compression_mode == COMP_MODE_DEFAULT: |
2275 return self._decompressor(data) | 2353 return self._inner._decompressor(data) |
2276 elif compression_mode == COMP_MODE_INLINE: | 2354 elif compression_mode == COMP_MODE_INLINE: |
2277 return self.decompress(data) | 2355 return self._inner.decompress(data) |
2278 else: | 2356 else: |
2279 msg = b'unknown compression mode %d' | 2357 msg = b'unknown compression mode %d' |
2280 msg %= compression_mode | 2358 msg %= compression_mode |
2281 raise error.RevlogError(msg) | 2359 raise error.RevlogError(msg) |
2282 | 2360 |
2326 except OverflowError: | 2404 except OverflowError: |
2327 # issue4215 - we can't cache a run of chunks greater than | 2405 # issue4215 - we can't cache a run of chunks greater than |
2328 # 2G on Windows | 2406 # 2G on Windows |
2329 return [self._chunk(rev) for rev in revschunk] | 2407 return [self._chunk(rev) for rev in revschunk] |
2330 | 2408 |
2331 decomp = self.decompress | 2409 decomp = self._inner.decompress |
2332 # self._decompressor might be None, but will not be used in that case | 2410 # self._decompressor might be None, but will not be used in that case |
2333 def_decomp = self._decompressor | 2411 def_decomp = self._inner._decompressor |
2334 for rev in revschunk: | 2412 for rev in revschunk: |
2335 chunkstart = start(rev) | 2413 chunkstart = start(rev) |
2336 if inline: | 2414 if inline: |
2337 chunkstart += (rev + 1) * iosize | 2415 chunkstart += (rev + 1) * iosize |
2338 chunklength = length(rev) | 2416 chunklength = length(rev) |
2542 | 2620 |
2543 comp = self.index[rev][11] | 2621 comp = self.index[rev][11] |
2544 if comp == COMP_MODE_PLAIN: | 2622 if comp == COMP_MODE_PLAIN: |
2545 segment = comp_segment | 2623 segment = comp_segment |
2546 elif comp == COMP_MODE_DEFAULT: | 2624 elif comp == COMP_MODE_DEFAULT: |
2547 segment = self._decompressor(comp_segment) | 2625 segment = self._inner._decompressor(comp_segment) |
2548 elif comp == COMP_MODE_INLINE: | 2626 elif comp == COMP_MODE_INLINE: |
2549 segment = self.decompress(comp_segment) | 2627 segment = self._inner.decompress(comp_segment) |
2550 else: | 2628 else: |
2551 msg = b'unknown compression mode %d' | 2629 msg = b'unknown compression mode %d' |
2552 msg %= comp | 2630 msg %= comp |
2553 raise error.RevlogError(msg) | 2631 raise error.RevlogError(msg) |
2554 | 2632 |
2840 deltacomputer=deltacomputer, | 2918 deltacomputer=deltacomputer, |
2841 sidedata=sidedata, | 2919 sidedata=sidedata, |
2842 ) | 2920 ) |
2843 | 2921 |
2844 def compress(self, data): | 2922 def compress(self, data): |
2845 """Generate a possibly-compressed representation of data.""" | 2923 return self._inner.compress(data) |
2846 if not data: | |
2847 return b'', data | |
2848 | |
2849 compressed = self._compressor.compress(data) | |
2850 | |
2851 if compressed: | |
2852 # The revlog compressor added the header in the returned data. | |
2853 return b'', compressed | |
2854 | |
2855 if data[0:1] == b'\0': | |
2856 return b'', data | |
2857 return b'u', data | |
2858 | 2924 |
2859 def decompress(self, data): | 2925 def decompress(self, data): |
2860 """Decompress a revlog chunk. | 2926 return self._inner.decompress(data) |
2861 | |
2862 The chunk is expected to begin with a header identifying the | |
2863 format type so it can be routed to an appropriate decompressor. | |
2864 """ | |
2865 if not data: | |
2866 return data | |
2867 | |
2868 # Revlogs are read much more frequently than they are written and many | |
2869 # chunks only take microseconds to decompress, so performance is | |
2870 # important here. | |
2871 # | |
2872 # We can make a few assumptions about revlogs: | |
2873 # | |
2874 # 1) the majority of chunks will be compressed (as opposed to inline | |
2875 # raw data). | |
2876 # 2) decompressing *any* data will likely be at least 10x slower than | |
2877 # returning raw inline data. | |
2878 # 3) we want to prioritize common and officially supported compression | |
2879 # engines | |
2880 # | |
2881 # It follows that we want to optimize for "decompress compressed data | |
2882 # when encoded with common and officially supported compression engines" | |
2883 # case over "raw data" and "data encoded by less common or non-official | |
2884 # compression engines." That is why we have the inline lookup first | |
2885 # followed by the compengines lookup. | |
2886 # | |
2887 # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib | |
2888 # compressed chunks. And this matters for changelog and manifest reads. | |
2889 t = data[0:1] | |
2890 | |
2891 if t == b'x': | |
2892 try: | |
2893 return _zlibdecompress(data) | |
2894 except zlib.error as e: | |
2895 raise error.RevlogError( | |
2896 _(b'revlog decompress error: %s') | |
2897 % stringutil.forcebytestr(e) | |
2898 ) | |
2899 # '\0' is more common than 'u' so it goes first. | |
2900 elif t == b'\0': | |
2901 return data | |
2902 elif t == b'u': | |
2903 return util.buffer(data, 1) | |
2904 | |
2905 compressor = self._get_decompressor(t) | |
2906 | |
2907 return compressor.decompress(data) | |
2908 | 2927 |
2909 def _addrevision( | 2928 def _addrevision( |
2910 self, | 2929 self, |
2911 node, | 2930 node, |
2912 rawtext, | 2931 rawtext, |
3027 sidedata_compression_mode = COMP_MODE_INLINE | 3046 sidedata_compression_mode = COMP_MODE_INLINE |
3028 if sidedata and self.feature_config.has_side_data: | 3047 if sidedata and self.feature_config.has_side_data: |
3029 sidedata_compression_mode = COMP_MODE_PLAIN | 3048 sidedata_compression_mode = COMP_MODE_PLAIN |
3030 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) | 3049 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) |
3031 sidedata_offset = self._docket.sidedata_end | 3050 sidedata_offset = self._docket.sidedata_end |
3032 h, comp_sidedata = self.compress(serialized_sidedata) | 3051 h, comp_sidedata = self._inner.compress(serialized_sidedata) |
3033 if ( | 3052 if ( |
3034 h != b'u' | 3053 h != b'u' |
3035 and comp_sidedata[0:1] != b'\0' | 3054 and comp_sidedata[0:1] != b'\0' |
3036 and len(comp_sidedata) < len(serialized_sidedata) | 3055 and len(comp_sidedata) < len(serialized_sidedata) |
3037 ): | 3056 ): |
3874 ) | 3893 ) |
3875 | 3894 |
3876 sidedata_compression_mode = COMP_MODE_INLINE | 3895 sidedata_compression_mode = COMP_MODE_INLINE |
3877 if serialized_sidedata and self.feature_config.has_side_data: | 3896 if serialized_sidedata and self.feature_config.has_side_data: |
3878 sidedata_compression_mode = COMP_MODE_PLAIN | 3897 sidedata_compression_mode = COMP_MODE_PLAIN |
3879 h, comp_sidedata = self.compress(serialized_sidedata) | 3898 h, comp_sidedata = self._inner.compress(serialized_sidedata) |
3880 if ( | 3899 if ( |
3881 h != b'u' | 3900 h != b'u' |
3882 and comp_sidedata[0] != b'\0' | 3901 and comp_sidedata[0] != b'\0' |
3883 and len(comp_sidedata) < len(serialized_sidedata) | 3902 and len(comp_sidedata) < len(serialized_sidedata) |
3884 ): | 3903 ): |