comparison mercurial/revlog.py @ 51095:a82704902db8

revlog: move the compression/decompression logic on the inner object

This is a necessary step before being able to move more of the logic for restoring a revision's content there. For now, we apply a simple patch to the perf extension logic; when the implementation of the inner object changes, we will likely need some evolution of the API. However, this is true of many things in the perf extension, so we will deal with it later.
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Wed, 25 Oct 2023 02:13:18 +0200
parents de6a8cc24de3
children 9c8df10ea6e0
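
Read as a whole, the change is a delegation refactor: the compression state (the feature configuration, the default compression header and the per-header decompressor cache) and the compress/decompress helpers move onto the _InnerRevlog object, while the revlog class keeps thin forwarders. The outline below is a simplified, hypothetical sketch of that shape, abridged from the hunks that follow rather than copied from the Mercurial API:

    class _InnerRevlogSketch(object):
        """Owns the compression state, like _InnerRevlog after this change."""

        def __init__(self, feature_config=None, default_compression_header=None):
            self.feature_config = feature_config
            self._default_compression_header = default_compression_header
            self._decompressors = {}  # revlog header -> revlog compressor

        def compress(self, data):
            # engine-specific logic lives here now (see the compress() hunk below)
            return (b'u', data) if data else (b'', data)

        def decompress(self, data):
            # header-based dispatch lives here now (see the decompress() hunk below)
            return data[1:] if data[:1] == b'u' else data

    class RevlogSketch(object):
        """The revlog object itself only forwards to its inner object."""

        def __init__(self, inner):
            self._inner = inner

        def compress(self, data):
            return self._inner.compress(data)

        def decompress(self, data):
            return self._inner.decompress(data)

    rl = RevlogSketch(_InnerRevlogSketch())
    h, d = rl.compress(b'data')
    assert rl.decompress(h + d) == b'data'
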
351 index_file, 351 index_file,
352 data_file, 352 data_file,
353 sidedata_file, 353 sidedata_file,
354 inline, 354 inline,
355 data_config, 355 data_config,
356 feature_config,
356 chunk_cache, 357 chunk_cache,
358 default_compression_header,
357 ): 359 ):
358 self.opener = opener 360 self.opener = opener
359 self.index = index 361 self.index = index
360 362
361 self.__index_file = index_file 363 self.__index_file = index_file
362 self.data_file = data_file 364 self.data_file = data_file
363 self.sidedata_file = sidedata_file 365 self.sidedata_file = sidedata_file
364 self.inline = inline 366 self.inline = inline
365 self.data_config = data_config 367 self.data_config = data_config
368 self.feature_config = feature_config
369
370 self._default_compression_header = default_compression_header
366 371
367 # index 372 # index
368 373
369 # 3-tuple of file handles being used for active writing. 374 # 3-tuple of file handles being used for active writing.
370 self._writinghandles = None 375 self._writinghandles = None
379 self.opener, 384 self.opener,
380 self.sidedata_file, 385 self.sidedata_file,
381 self.data_config.chunk_cache_size, 386 self.data_config.chunk_cache_size,
382 ) 387 )
383 388
389 # revlog header -> revlog compressor
390 self._decompressors = {}
391
384 @property 392 @property
385 def index_file(self): 393 def index_file(self):
386 return self.__index_file 394 return self.__index_file
387 395
388 @index_file.setter 396 @index_file.setter
402 return self.index[rev][1] 410 return self.index[rev][1]
403 411
404 def end(self, rev): 412 def end(self, rev):
405 """the end of the data chunk for this revision""" 413 """the end of the data chunk for this revision"""
406 return self.start(rev) + self.length(rev) 414 return self.start(rev) + self.length(rev)
415
416 @util.propertycache
417 def _compressor(self):
418 engine = util.compengines[self.feature_config.compression_engine]
419 return engine.revlogcompressor(
420 self.feature_config.compression_engine_options
421 )
422
423 @util.propertycache
424 def _decompressor(self):
425 """the default decompressor"""
426 if self._default_compression_header is None:
427 return None
428 t = self._default_compression_header
429 c = self._get_decompressor(t)
430 return c.decompress
431
432 def _get_decompressor(self, t):
433 try:
434 compressor = self._decompressors[t]
435 except KeyError:
436 try:
437 engine = util.compengines.forrevlogheader(t)
438 compressor = engine.revlogcompressor(
439 self.feature_config.compression_engine_options
440 )
441 self._decompressors[t] = compressor
442 except KeyError:
443 raise error.RevlogError(
444 _(b'unknown compression type %s') % binascii.hexlify(t)
445 )
446 return compressor
447
448 def compress(self, data):
449 """Generate a possibly-compressed representation of data."""
450 if not data:
451 return b'', data
452
453 compressed = self._compressor.compress(data)
454
455 if compressed:
456 # The revlog compressor added the header in the returned data.
457 return b'', compressed
458
459 if data[0:1] == b'\0':
460 return b'', data
461 return b'u', data
462
463 def decompress(self, data):
464 """Decompress a revlog chunk.
465
466 The chunk is expected to begin with a header identifying the
467 format type so it can be routed to an appropriate decompressor.
468 """
469 if not data:
470 return data
471
472 # Revlogs are read much more frequently than they are written and many
473 # chunks only take microseconds to decompress, so performance is
474 # important here.
475 #
476 # We can make a few assumptions about revlogs:
477 #
478 # 1) the majority of chunks will be compressed (as opposed to inline
479 # raw data).
480 # 2) decompressing *any* data will likely by at least 10x slower than
481 # returning raw inline data.
482 # 3) we want to prioritize common and officially supported compression
483 # engines
484 #
485 # It follows that we want to optimize for "decompress compressed data
486 # when encoded with common and officially supported compression engines"
487 # case over "raw data" and "data encoded by less common or non-official
488 # compression engines." That is why we have the inline lookup first
489 # followed by the compengines lookup.
490 #
491 # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
492 # compressed chunks. And this matters for changelog and manifest reads.
493 t = data[0:1]
494
495 if t == b'x':
496 try:
497 return _zlibdecompress(data)
498 except zlib.error as e:
499 raise error.RevlogError(
500 _(b'revlog decompress error: %s')
501 % stringutil.forcebytestr(e)
502 )
503 # '\0' is more common than 'u' so it goes first.
504 elif t == b'\0':
505 return data
506 elif t == b'u':
507 return util.buffer(data, 1)
508
509 compressor = self._get_decompressor(t)
510
511 return compressor.decompress(data)
407 512
408 @contextlib.contextmanager 513 @contextlib.contextmanager
409 def reading(self): 514 def reading(self):
410 """Context manager that keeps data and sidedata files open for reading""" 515 """Context manager that keeps data and sidedata files open for reading"""
411 if len(self.index) == 0: 516 if len(self.index) == 0:
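
The decompress method added to the inner object above routes on the first byte of the stored chunk: b'x' is the leading byte of any zlib stream, b'\0' marks raw data whose own first byte happens to be NUL, b'u' is an explicit "stored uncompressed" marker, and any other value is treated as a compression-engine header. A minimal standalone illustration of that dispatch, covering only the built-in cases (the real code additionally resolves unknown headers through util.compengines.forrevlogheader() and caches the result in self._decompressors):

    import zlib

    def decompress_sketch(data):
        # Toy version of _InnerRevlog.decompress, inline cases only.
        if not data:
            return data
        t = data[0:1]
        if t == b'x':        # 0x78, the first byte of every zlib stream
            return zlib.decompress(data)
        elif t == b'\0':     # raw chunk whose first byte is NUL, stored as-is
            return data
        elif t == b'u':      # raw chunk stored behind an explicit marker byte
            return data[1:]
        raise ValueError('header %r is not covered by this sketch' % t)

    assert decompress_sketch(zlib.compress(b'spam' * 64)) == b'spam' * 64
    assert decompress_sketch(b'uplain text') == b'plain text'
    assert decompress_sketch(b'\0binary\0chunk') == b'\0binary\0chunk'
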
1282 _(b"index %s is corrupted") % self.display_id 1387 _(b"index %s is corrupted") % self.display_id
1283 ) 1388 )
1284 self.index = index 1389 self.index = index
1285 # revnum -> (chain-length, sum-delta-length) 1390 # revnum -> (chain-length, sum-delta-length)
1286 self._chaininfocache = util.lrucachedict(500) 1391 self._chaininfocache = util.lrucachedict(500)
1287 # revlog header -> revlog compressor
1288 self._decompressors = {}
1289 1392
1290 return chunkcache 1393 return chunkcache
1291 1394
1292 def _load_inner(self, chunk_cache): 1395 def _load_inner(self, chunk_cache):
1396 if self._docket is None:
1397 default_compression_header = None
1398 else:
1399 default_compression_header = self._docket.default_compression_header
1400
1293 self._inner = _InnerRevlog( 1401 self._inner = _InnerRevlog(
1294 opener=self.opener, 1402 opener=self.opener,
1295 index=self.index, 1403 index=self.index,
1296 index_file=self._indexfile, 1404 index_file=self._indexfile,
1297 data_file=self._datafile, 1405 data_file=self._datafile,
1298 sidedata_file=self._sidedatafile, 1406 sidedata_file=self._sidedatafile,
1299 inline=self._inline, 1407 inline=self._inline,
1300 data_config=self.data_config, 1408 data_config=self.data_config,
1409 feature_config=self.feature_config,
1301 chunk_cache=chunk_cache, 1410 chunk_cache=chunk_cache,
1411 default_compression_header=default_compression_header,
1302 ) 1412 )
1303 1413
1304 def get_revlog(self): 1414 def get_revlog(self):
1305 """simple function to mirror API of other not-really-revlog API""" 1415 """simple function to mirror API of other not-really-revlog API"""
1306 return self 1416 return self
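
_load_inner is the only place that still knows about the docket, so it resolves default_compression_header there and hands the plain value to the inner object; revlog formats without a docket pass None. The inner object then builds the matching default decompressor lazily through the _decompressor property shown earlier. A hedged sketch of that lazy resolution, using functools.cached_property as a stand-in for util.propertycache and zlib as a stand-in for the engine lookup:

    import functools
    import zlib

    class InnerSketch(object):
        def __init__(self, default_compression_header=None):
            # None means "no docket, hence no default engine"; such revlogs are
            # expected to use the plain and inline compression modes only.
            self._default_compression_header = default_compression_header

        @functools.cached_property
        def _decompressor(self):
            # computed once on first access, then cached on the instance
            if self._default_compression_header is None:
                return None
            if self._default_compression_header == b'x':
                return zlib.decompress
            raise ValueError('header not covered by this sketch')

    assert InnerSketch()._decompressor is None
    assert InnerSketch(b'x')._decompressor is zlib.decompress
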
1316 # Reference the file without the "data/" prefix, so it is familiar 1426 # Reference the file without the "data/" prefix, so it is familiar
1317 # to the user. 1427 # to the user.
1318 return self.target[1] 1428 return self.target[1]
1319 else: 1429 else:
1320 return self.radix 1430 return self.radix
1321
1322 def _get_decompressor(self, t):
1323 try:
1324 compressor = self._decompressors[t]
1325 except KeyError:
1326 try:
1327 engine = util.compengines.forrevlogheader(t)
1328 compressor = engine.revlogcompressor(
1329 self.feature_config.compression_engine_options
1330 )
1331 self._decompressors[t] = compressor
1332 except KeyError:
1333 raise error.RevlogError(
1334 _(b'unknown compression type %s') % binascii.hexlify(t)
1335 )
1336 return compressor
1337
1338 @util.propertycache
1339 def _compressor(self):
1340 engine = util.compengines[self.feature_config.compression_engine]
1341 return engine.revlogcompressor(
1342 self.feature_config.compression_engine_options
1343 )
1344
1345 @util.propertycache
1346 def _decompressor(self):
1347 """the default decompressor"""
1348 if self._docket is None:
1349 return None
1350 t = self._docket.default_compression_header
1351 c = self._get_decompressor(t)
1352 return c.decompress
1353 1431
1354 def _datafp(self, mode=b'r'): 1432 def _datafp(self, mode=b'r'):
1355 """file object for the revlog's data file""" 1433 """file object for the revlog's data file"""
1356 return self.opener(self._datafile, mode=mode) 1434 return self.opener(self._datafile, mode=mode)
1357 1435
2270 compression_mode = self.index[rev][10] 2348 compression_mode = self.index[rev][10]
2271 data = self._inner.get_segment_for_revs(rev, rev)[1] 2349 data = self._inner.get_segment_for_revs(rev, rev)[1]
2272 if compression_mode == COMP_MODE_PLAIN: 2350 if compression_mode == COMP_MODE_PLAIN:
2273 return data 2351 return data
2274 elif compression_mode == COMP_MODE_DEFAULT: 2352 elif compression_mode == COMP_MODE_DEFAULT:
2275 return self._decompressor(data) 2353 return self._inner._decompressor(data)
2276 elif compression_mode == COMP_MODE_INLINE: 2354 elif compression_mode == COMP_MODE_INLINE:
2277 return self.decompress(data) 2355 return self._inner.decompress(data)
2278 else: 2356 else:
2279 msg = b'unknown compression mode %d' 2357 msg = b'unknown compression mode %d'
2280 msg %= compression_mode 2358 msg %= compression_mode
2281 raise error.RevlogError(msg) 2359 raise error.RevlogError(msg)
2282 2360
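
The mode read from the index entry (field 10 for the data chunk here, field 11 for the sidedata segment in a later hunk) selects one of three paths: COMP_MODE_PLAIN returns the stored bytes untouched, COMP_MODE_DEFAULT hands them to the docket-wide default decompressor without inspecting a per-chunk header, and COMP_MODE_INLINE falls back to the header dispatch shown earlier. A toy dispatcher over those three cases, with hypothetical constant values (the real ones live in mercurial.revlogutils.constants):

    COMP_MODE_PLAIN, COMP_MODE_DEFAULT, COMP_MODE_INLINE = 0, 1, 2  # illustrative

    def restore_chunk_sketch(inner, mode, data):
        # `inner` stands for an _InnerRevlog-like object from the earlier sketches.
        if mode == COMP_MODE_PLAIN:
            return data                       # stored uncompressed, no marker
        elif mode == COMP_MODE_DEFAULT:
            return inner._decompressor(data)  # docket-wide default engine
        elif mode == COMP_MODE_INLINE:
            return inner.decompress(data)     # per-chunk header byte decides
        raise ValueError('unknown compression mode %d' % mode)
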
2326 except OverflowError: 2404 except OverflowError:
2327 # issue4215 - we can't cache a run of chunks greater than 2405 # issue4215 - we can't cache a run of chunks greater than
2328 # 2G on Windows 2406 # 2G on Windows
2329 return [self._chunk(rev) for rev in revschunk] 2407 return [self._chunk(rev) for rev in revschunk]
2330 2408
2331 decomp = self.decompress 2409 decomp = self._inner.decompress
2332 # self._decompressor might be None, but will not be used in that case 2410 # self._decompressor might be None, but will not be used in that case
2333 def_decomp = self._decompressor 2411 def_decomp = self._inner._decompressor
2334 for rev in revschunk: 2412 for rev in revschunk:
2335 chunkstart = start(rev) 2413 chunkstart = start(rev)
2336 if inline: 2414 if inline:
2337 chunkstart += (rev + 1) * iosize 2415 chunkstart += (rev + 1) * iosize
2338 chunklength = length(rev) 2416 chunklength = length(rev)
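
Binding decomp and def_decomp to locals before entering the loop is a common CPython micro-optimization: self._inner.decompress inside the loop would cost two attribute lookups per chunk, whereas a local name is a single fast access. A small, hedged illustration of the pattern (the class and sizes are invented for the demo; timings depend on the interpreter):

    import timeit

    class Inner(object):
        def decompress(self, data):
            return data

    inner = Inner()
    chunks = [b'x'] * 10000

    def with_attribute_lookups():
        return [inner.decompress(c) for c in chunks]

    def with_local_alias():
        decomp = inner.decompress   # bind the bound method once, as the hunk does
        return [decomp(c) for c in chunks]

    print(timeit.timeit(with_attribute_lookups, number=200))
    print(timeit.timeit(with_local_alias, number=200))
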
2542 2620
2543 comp = self.index[rev][11] 2621 comp = self.index[rev][11]
2544 if comp == COMP_MODE_PLAIN: 2622 if comp == COMP_MODE_PLAIN:
2545 segment = comp_segment 2623 segment = comp_segment
2546 elif comp == COMP_MODE_DEFAULT: 2624 elif comp == COMP_MODE_DEFAULT:
2547 segment = self._decompressor(comp_segment) 2625 segment = self._inner._decompressor(comp_segment)
2548 elif comp == COMP_MODE_INLINE: 2626 elif comp == COMP_MODE_INLINE:
2549 segment = self.decompress(comp_segment) 2627 segment = self._inner.decompress(comp_segment)
2550 else: 2628 else:
2551 msg = b'unknown compression mode %d' 2629 msg = b'unknown compression mode %d'
2552 msg %= comp 2630 msg %= comp
2553 raise error.RevlogError(msg) 2631 raise error.RevlogError(msg)
2554 2632
2840 deltacomputer=deltacomputer, 2918 deltacomputer=deltacomputer,
2841 sidedata=sidedata, 2919 sidedata=sidedata,
2842 ) 2920 )
2843 2921
2844 def compress(self, data): 2922 def compress(self, data):
2845 """Generate a possibly-compressed representation of data.""" 2923 return self._inner.compress(data)
2846 if not data:
2847 return b'', data
2848
2849 compressed = self._compressor.compress(data)
2850
2851 if compressed:
2852 # The revlog compressor added the header in the returned data.
2853 return b'', compressed
2854
2855 if data[0:1] == b'\0':
2856 return b'', data
2857 return b'u', data
2858 2924
2859 def decompress(self, data): 2925 def decompress(self, data):
2860 """Decompress a revlog chunk. 2926 return self._inner.decompress(data)
2861
2862 The chunk is expected to begin with a header identifying the
2863 format type so it can be routed to an appropriate decompressor.
2864 """
2865 if not data:
2866 return data
2867
2868 # Revlogs are read much more frequently than they are written and many
2869 # chunks only take microseconds to decompress, so performance is
2870 # important here.
2871 #
2872 # We can make a few assumptions about revlogs:
2873 #
2874 # 1) the majority of chunks will be compressed (as opposed to inline
2875 # raw data).
2876 # 2) decompressing *any* data will likely by at least 10x slower than
2877 # returning raw inline data.
2878 # 3) we want to prioritize common and officially supported compression
2879 # engines
2880 #
2881 # It follows that we want to optimize for "decompress compressed data
2882 # when encoded with common and officially supported compression engines"
2883 # case over "raw data" and "data encoded by less common or non-official
2884 # compression engines." That is why we have the inline lookup first
2885 # followed by the compengines lookup.
2886 #
2887 # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
2888 # compressed chunks. And this matters for changelog and manifest reads.
2889 t = data[0:1]
2890
2891 if t == b'x':
2892 try:
2893 return _zlibdecompress(data)
2894 except zlib.error as e:
2895 raise error.RevlogError(
2896 _(b'revlog decompress error: %s')
2897 % stringutil.forcebytestr(e)
2898 )
2899 # '\0' is more common than 'u' so it goes first.
2900 elif t == b'\0':
2901 return data
2902 elif t == b'u':
2903 return util.buffer(data, 1)
2904
2905 compressor = self._get_decompressor(t)
2906
2907 return compressor.decompress(data)
2908 2927
2909 def _addrevision( 2928 def _addrevision(
2910 self, 2929 self,
2911 node, 2930 node,
2912 rawtext, 2931 rawtext,
3027 sidedata_compression_mode = COMP_MODE_INLINE 3046 sidedata_compression_mode = COMP_MODE_INLINE
3028 if sidedata and self.feature_config.has_side_data: 3047 if sidedata and self.feature_config.has_side_data:
3029 sidedata_compression_mode = COMP_MODE_PLAIN 3048 sidedata_compression_mode = COMP_MODE_PLAIN
3030 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) 3049 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
3031 sidedata_offset = self._docket.sidedata_end 3050 sidedata_offset = self._docket.sidedata_end
3032 h, comp_sidedata = self.compress(serialized_sidedata) 3051 h, comp_sidedata = self._inner.compress(serialized_sidedata)
3033 if ( 3052 if (
3034 h != b'u' 3053 h != b'u'
3035 and comp_sidedata[0:1] != b'\0' 3054 and comp_sidedata[0:1] != b'\0'
3036 and len(comp_sidedata) < len(serialized_sidedata) 3055 and len(comp_sidedata) < len(serialized_sidedata)
3037 ): 3056 ):
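
This sidedata call site (and the similar one in the next hunk) relies on the pair that compress() returns: an empty header means the payload already embeds its engine header, b'u' means the data is stored uncompressed behind a marker, and a chunk whose first byte is NUL needs no marker at all. The compressed form is only kept when it is genuinely smaller and neither carries the b'u' marker nor starts with NUL. A self-contained, hedged sketch of that decision, with zlib standing in for the configured engine:

    import zlib

    def compress_sketch(data):
        # Same (header, data) convention as _InnerRevlog.compress, zlib only.
        if not data:
            return b'', data
        compressed = zlib.compress(data)
        if len(compressed) < len(data):
            return b'', compressed          # engine header is inside the payload
        if data[0:1] == b'\0':
            return b'', data
        return b'u', data

    def sidedata_worth_compressing(serialized_sidedata):
        # Mirrors the guard above that decides whether the compressed sidedata
        # is worth storing (a sketch, not the real storage code).
        h, comp = compress_sketch(serialized_sidedata)
        return (
            h != b'u'
            and comp[0:1] != b'\0'
            and len(comp) < len(serialized_sidedata)
        )

    assert sidedata_worth_compressing(b'a' * 4096)
    assert not sidedata_worth_compressing(b'tiny')
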
3874 ) 3893 )
3875 3894
3876 sidedata_compression_mode = COMP_MODE_INLINE 3895 sidedata_compression_mode = COMP_MODE_INLINE
3877 if serialized_sidedata and self.feature_config.has_side_data: 3896 if serialized_sidedata and self.feature_config.has_side_data:
3878 sidedata_compression_mode = COMP_MODE_PLAIN 3897 sidedata_compression_mode = COMP_MODE_PLAIN
3879 h, comp_sidedata = self.compress(serialized_sidedata) 3898 h, comp_sidedata = self._inner.compress(serialized_sidedata)
3880 if ( 3899 if (
3881 h != b'u' 3900 h != b'u'
3882 and comp_sidedata[0] != b'\0' 3901 and comp_sidedata[0] != b'\0'
3883 and len(comp_sidedata) < len(serialized_sidedata) 3902 and len(comp_sidedata) < len(serialized_sidedata)
3884 ): 3903 ):