comparison contrib/python-zstandard/tests/test_decompressor.py @ 30895:c32454d69b85

zstd: vendor python-zstandard 0.7.0

Commit 3054ae3a66112970a091d3939fee32c2d0c1a23e from https://github.com/indygreg/python-zstandard is imported without modifications (other than removing unwanted files).

The vendored zstd library within has been upgraded from 1.1.2 to 1.1.3. This version introduced new APIs for threads, thread pools, multi-threaded compression, and a new dictionary builder (COVER). These features are not yet used by python-zstandard (or Mercurial for that matter). However, that will likely change in the next python-zstandard release (and I think there are opportunities for Mercurial to take advantage of the multi-threaded APIs).

Relevant to Mercurial, the CFFI bindings are now fully implemented. This means zstd should "just work" with PyPy (although I haven't tried). The python-zstandard test suite also runs all tests against both the C extension and CFFI bindings to ensure feature parity.

There is also a "decompress_content_dict_chain()" API. This was derived from discussions with Yann Collet on list about alternate ways of encoding delta chains.

The change most relevant to Mercurial is a performance enhancement in the simple decompression API to reuse a data structure across operations. This makes decompression of multiple inputs significantly faster. (This scenario occurs when reading revlog delta chains, for example.)

Using python-zstandard's bench.py to measure the performance difference...
On changelog chunks in the mozilla-unified repo:

decompress discrete
decompress() reuse zctx
1.262243 wall; 1.260000 CPU; 1.260000 user; 0.000000 sys 170.43 MB/s (best of 3)
0.949106 wall; 0.950000 CPU; 0.950000 user; 0.000000 sys 226.66 MB/s (best of 4)

decompress discrete dict
decompress() reuse zctx
0.692170 wall; 0.690000 CPU; 0.690000 user; 0.000000 sys 310.80 MB/s (best of 5)
0.437088 wall; 0.440000 CPU; 0.440000 user; 0.000000 sys 492.17 MB/s (best of 7)

On manifest chunks in the mozilla-unified repo:

decompress discrete
decompress() reuse zctx
1.367284 wall; 1.370000 CPU; 1.370000 user; 0.000000 sys 274.01 MB/s (best of 3)
1.086831 wall; 1.080000 CPU; 1.080000 user; 0.000000 sys 344.72 MB/s (best of 3)

decompress discrete dict
decompress() reuse zctx
0.993272 wall; 0.990000 CPU; 0.990000 user; 0.000000 sys 377.19 MB/s (best of 3)
0.678651 wall; 0.680000 CPU; 0.680000 user; 0.000000 sys 552.06 MB/s (best of 5)

That should make reads on zstd revlogs a bit faster ;)

# no-check-commit
author Gregory Szorc <gregory.szorc@gmail.com>
date Tue, 07 Feb 2017 23:24:47 -0800
parents b86a448a2965
children e0dc40530c5a
comparison
equal deleted inserted replaced
30894:5b60464efbde 30895:c32454d69b85
8 except ImportError: 8 except ImportError:
9 import unittest 9 import unittest
10 10
11 import zstd 11 import zstd
12 12
13 from .common import OpCountingBytesIO 13 from .common import (
14 make_cffi,
15 OpCountingBytesIO,
16 )
14 17
15 18
16 if sys.version_info[0] >= 3: 19 if sys.version_info[0] >= 3:
17 next = lambda it: it.__next__() 20 next = lambda it: it.__next__()
18 else: 21 else:
19 next = lambda it: it.next() 22 next = lambda it: it.next()
20 23
21 24
25 @make_cffi
22 class TestDecompressor_decompress(unittest.TestCase): 26 class TestDecompressor_decompress(unittest.TestCase):
23 def test_empty_input(self): 27 def test_empty_input(self):
24 dctx = zstd.ZstdDecompressor() 28 dctx = zstd.ZstdDecompressor()
25 29
26 with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'): 30 with self.assertRaisesRegexp(zstd.ZstdError, 'input data invalid'):
117 for i in range(len(sources)): 121 for i in range(len(sources)):
118 decompressed = dctx.decompress(compressed[i]) 122 decompressed = dctx.decompress(compressed[i])
119 self.assertEqual(decompressed, sources[i]) 123 self.assertEqual(decompressed, sources[i])
120 124
121 125
126 @make_cffi
122 class TestDecompressor_copy_stream(unittest.TestCase): 127 class TestDecompressor_copy_stream(unittest.TestCase):
123 def test_no_read(self): 128 def test_no_read(self):
124 source = object() 129 source = object()
125 dest = io.BytesIO() 130 dest = io.BytesIO()
126 131
178 self.assertEqual(w, len(b'foobarfoobar')) 183 self.assertEqual(w, len(b'foobarfoobar'))
179 self.assertEqual(source._read_count, len(source.getvalue()) + 1) 184 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
180 self.assertEqual(dest._write_count, len(dest.getvalue())) 185 self.assertEqual(dest._write_count, len(dest.getvalue()))
181 186
182 187
188 @make_cffi
183 class TestDecompressor_decompressobj(unittest.TestCase): 189 class TestDecompressor_decompressobj(unittest.TestCase):
184 def test_simple(self): 190 def test_simple(self):
185 data = zstd.ZstdCompressor(level=1).compress(b'foobar') 191 data = zstd.ZstdCompressor(level=1).compress(b'foobar')
186 192
187 dctx = zstd.ZstdDecompressor() 193 dctx = zstd.ZstdDecompressor()
205 with dctx.write_to(buffer) as decompressor: 211 with dctx.write_to(buffer) as decompressor:
206 decompressor.write(data) 212 decompressor.write(data)
207 return buffer.getvalue() 213 return buffer.getvalue()
208 214
209 215
216 @make_cffi
210 class TestDecompressor_write_to(unittest.TestCase): 217 class TestDecompressor_write_to(unittest.TestCase):
211 def test_empty_roundtrip(self): 218 def test_empty_roundtrip(self):
212 cctx = zstd.ZstdCompressor() 219 cctx = zstd.ZstdCompressor()
213 empty = cctx.compress(b'') 220 empty = cctx.compress(b'')
214 self.assertEqual(decompress_via_writer(empty), b'') 221 self.assertEqual(decompress_via_writer(empty), b'')
254 261
255 orig = b'foobar' * 16384 262 orig = b'foobar' * 16384
256 buffer = io.BytesIO() 263 buffer = io.BytesIO()
257 cctx = zstd.ZstdCompressor(dict_data=d) 264 cctx = zstd.ZstdCompressor(dict_data=d)
258 with cctx.write_to(buffer) as compressor: 265 with cctx.write_to(buffer) as compressor:
259 compressor.write(orig) 266 self.assertEqual(compressor.write(orig), 1544)
260 267
261 compressed = buffer.getvalue() 268 compressed = buffer.getvalue()
262 buffer = io.BytesIO() 269 buffer = io.BytesIO()
263 270
264 dctx = zstd.ZstdDecompressor(dict_data=d) 271 dctx = zstd.ZstdDecompressor(dict_data=d)
265 with dctx.write_to(buffer) as decompressor: 272 with dctx.write_to(buffer) as decompressor:
266 decompressor.write(compressed) 273 self.assertEqual(decompressor.write(compressed), len(orig))
267 274
268 self.assertEqual(buffer.getvalue(), orig) 275 self.assertEqual(buffer.getvalue(), orig)
269 276
270 def test_memory_size(self): 277 def test_memory_size(self):
271 dctx = zstd.ZstdDecompressor() 278 dctx = zstd.ZstdDecompressor()
289 296
290 self.assertEqual(dest.getvalue(), b'foobarfoobar') 297 self.assertEqual(dest.getvalue(), b'foobarfoobar')
291 self.assertEqual(dest._write_count, len(dest.getvalue())) 298 self.assertEqual(dest._write_count, len(dest.getvalue()))
292 299
293 300
301 @make_cffi
294 class TestDecompressor_read_from(unittest.TestCase): 302 class TestDecompressor_read_from(unittest.TestCase):
295 def test_type_validation(self): 303 def test_type_validation(self):
296 dctx = zstd.ZstdDecompressor() 304 dctx = zstd.ZstdDecompressor()
297 305
298 # Object with read() works. 306 # Object with read() works.
300 308
301 # Buffer protocol works. 309 # Buffer protocol works.
302 dctx.read_from(b'foobar') 310 dctx.read_from(b'foobar')
303 311
304 with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'): 312 with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'):
305 dctx.read_from(True) 313 b''.join(dctx.read_from(True))
306 314
307 def test_empty_input(self): 315 def test_empty_input(self):
308 dctx = zstd.ZstdDecompressor() 316 dctx = zstd.ZstdDecompressor()
309 317
310 source = io.BytesIO() 318 source = io.BytesIO()
349 357
350 def test_skip_bytes_too_large(self): 358 def test_skip_bytes_too_large(self):
351 dctx = zstd.ZstdDecompressor() 359 dctx = zstd.ZstdDecompressor()
352 360
353 with self.assertRaisesRegexp(ValueError, 'skip_bytes must be smaller than read_size'): 361 with self.assertRaisesRegexp(ValueError, 'skip_bytes must be smaller than read_size'):
354 dctx.read_from(b'', skip_bytes=1, read_size=1) 362 b''.join(dctx.read_from(b'', skip_bytes=1, read_size=1))
355 363
356 with self.assertRaisesRegexp(ValueError, 'skip_bytes larger than first input chunk'): 364 with self.assertRaisesRegexp(ValueError, 'skip_bytes larger than first input chunk'):
357 b''.join(dctx.read_from(b'foobar', skip_bytes=10)) 365 b''.join(dctx.read_from(b'foobar', skip_bytes=10))
358 366
359 def test_skip_bytes(self): 367 def test_skip_bytes(self):
474 dctx = zstd.ZstdDecompressor() 482 dctx = zstd.ZstdDecompressor()
475 for chunk in dctx.read_from(source, read_size=1, write_size=1): 483 for chunk in dctx.read_from(source, read_size=1, write_size=1):
476 self.assertEqual(len(chunk), 1) 484 self.assertEqual(len(chunk), 1)
477 485
478 self.assertEqual(source._read_count, len(source.getvalue())) 486 self.assertEqual(source._read_count, len(source.getvalue()))
487
488
489 @make_cffi
490 class TestDecompressor_content_dict_chain(unittest.TestCase):
491 def test_bad_inputs_simple(self):
492 dctx = zstd.ZstdDecompressor()
493
494 with self.assertRaises(TypeError):
495 dctx.decompress_content_dict_chain(b'foo')
496
497 with self.assertRaises(TypeError):
498 dctx.decompress_content_dict_chain((b'foo', b'bar'))
499
500 with self.assertRaisesRegexp(ValueError, 'empty input chain'):
501 dctx.decompress_content_dict_chain([])
502
503 with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'):
504 dctx.decompress_content_dict_chain([u'foo'])
505
506 with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'):
507 dctx.decompress_content_dict_chain([True])
508
509 with self.assertRaisesRegexp(ValueError, 'chunk 0 is too small to contain a zstd frame'):
510 dctx.decompress_content_dict_chain([zstd.FRAME_HEADER])
511
512 with self.assertRaisesRegexp(ValueError, 'chunk 0 is not a valid zstd frame'):
513 dctx.decompress_content_dict_chain([b'foo' * 8])
514
515 no_size = zstd.ZstdCompressor().compress(b'foo' * 64)
516
517 with self.assertRaisesRegexp(ValueError, 'chunk 0 missing content size in frame'):
518 dctx.decompress_content_dict_chain([no_size])
519
520 # Corrupt first frame.
521 frame = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64)
522 frame = frame[0:12] + frame[15:]
523 with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 0'):
524 dctx.decompress_content_dict_chain([frame])
525
526 def test_bad_subsequent_input(self):
527 initial = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64)
528
529 dctx = zstd.ZstdDecompressor()
530
531 with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
532 dctx.decompress_content_dict_chain([initial, u'foo'])
533
534 with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
535 dctx.decompress_content_dict_chain([initial, None])
536
537 with self.assertRaisesRegexp(ValueError, 'chunk 1 is too small to contain a zstd frame'):
538 dctx.decompress_content_dict_chain([initial, zstd.FRAME_HEADER])
539
540 with self.assertRaisesRegexp(ValueError, 'chunk 1 is not a valid zstd frame'):
541 dctx.decompress_content_dict_chain([initial, b'foo' * 8])
542
543 no_size = zstd.ZstdCompressor().compress(b'foo' * 64)
544
545 with self.assertRaisesRegexp(ValueError, 'chunk 1 missing content size in frame'):
546 dctx.decompress_content_dict_chain([initial, no_size])
547
548 # Corrupt second frame.
549 cctx = zstd.ZstdCompressor(write_content_size=True, dict_data=zstd.ZstdCompressionDict(b'foo' * 64))
550 frame = cctx.compress(b'bar' * 64)
551 frame = frame[0:12] + frame[15:]
552
553 with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 1'):
554 dctx.decompress_content_dict_chain([initial, frame])
555
556 def test_simple(self):
557 original = [
558 b'foo' * 64,
559 b'foobar' * 64,
560 b'baz' * 64,
561 b'foobaz' * 64,
562 b'foobarbaz' * 64,
563 ]
564
565 chunks = []
566 chunks.append(zstd.ZstdCompressor(write_content_size=True).compress(original[0]))
567 for i, chunk in enumerate(original[1:]):
568 d = zstd.ZstdCompressionDict(original[i])
569 cctx = zstd.ZstdCompressor(dict_data=d, write_content_size=True)
570 chunks.append(cctx.compress(chunk))
571
572 for i in range(1, len(original)):
573 chain = chunks[0:i]
574 expected = original[i - 1]
575 dctx = zstd.ZstdDecompressor()
576 decompressed = dctx.decompress_content_dict_chain(chain)
577 self.assertEqual(decompressed, expected)