diff -r 5b60464efbde -r c32454d69b85 contrib/python-zstandard/c-ext/decompressor.c
--- a/contrib/python-zstandard/c-ext/decompressor.c	Thu Feb 09 21:44:32 2017 -0500
+++ b/contrib/python-zstandard/c-ext/decompressor.c	Tue Feb 07 23:24:47 2017 -0800
@@ -59,23 +59,19 @@
 
 	ZstdCompressionDict* dict = NULL;
 
-	self->refdctx = NULL;
+	self->dctx = NULL;
 	self->dict = NULL;
 	self->ddict = NULL;
 
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!", kwlist,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!:ZstdDecompressor", kwlist,
 		&ZstdCompressionDictType, &dict)) {
 		return -1;
 	}
 
-	/* Instead of creating a ZSTD_DCtx for every decompression operation,
-	   we create an instance at object creation time and recycle it via
-	   ZSTD_copyDCTx() on each use. This means each use is a malloc+memcpy
-	   instead of a malloc+init. */
 	/* TODO lazily initialize the reference ZSTD_DCtx on first use since
 	   not instances of ZstdDecompressor will use a ZSTD_DCtx. */
-	self->refdctx = ZSTD_createDCtx();
-	if (!self->refdctx) {
+	self->dctx = ZSTD_createDCtx();
+	if (!self->dctx) {
 		PyErr_NoMemory();
 		goto except;
 	}
@@ -88,17 +84,17 @@
 	return 0;
 
 except:
-	if (self->refdctx) {
-		ZSTD_freeDCtx(self->refdctx);
-		self->refdctx = NULL;
+	if (self->dctx) {
+		ZSTD_freeDCtx(self->dctx);
+		self->dctx = NULL;
 	}
 
 	return -1;
 }
 
 static void Decompressor_dealloc(ZstdDecompressor* self) {
-	if (self->refdctx) {
-		ZSTD_freeDCtx(self->refdctx);
+	if (self->dctx) {
+		ZSTD_freeDCtx(self->dctx);
 	}
 
 	Py_XDECREF(self->dict);
@@ -150,8 +146,8 @@
 	PyObject* totalReadPy;
 	PyObject* totalWritePy;
 
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk", kwlist, &source,
-		&dest, &inSize, &outSize)) {
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk:copy_stream", kwlist,
+		&source, &dest, &inSize, &outSize)) {
 		return NULL;
 	}
 
@@ -243,7 +239,7 @@
 	Py_DecRef(totalReadPy);
 	Py_DecRef(totalWritePy);
 
-	finally:
+finally:
 	if (output.dst) {
 		PyMem_Free(output.dst);
 	}
@@ -291,28 +287,19 @@
 	unsigned long long decompressedSize;
 	size_t destCapacity;
 	PyObject* result = NULL;
-	ZSTD_DCtx* dctx = NULL;
 	void* dictData = NULL;
 	size_t dictSize = 0;
 	size_t zresult;
 
 #if PY_MAJOR_VERSION >= 3
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n", kwlist,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n:decompress",
 #else
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n", kwlist,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n:decompress",
 #endif
-		&source, &sourceSize, &maxOutputSize)) {
+		kwlist, &source, &sourceSize, &maxOutputSize)) {
 		return NULL;
 	}
 
-	dctx = PyMem_Malloc(ZSTD_sizeof_DCtx(self->refdctx));
-	if (!dctx) {
-		PyErr_NoMemory();
-		return NULL;
-	}
-
-	ZSTD_copyDCtx(dctx, self->refdctx);
-
 	if (self->dict) {
 		dictData = self->dict->dictData;
 		dictSize = self->dict->dictSize;
@@ -320,12 +307,12 @@
 
 	if (dictData && !self->ddict) {
 		Py_BEGIN_ALLOW_THREADS
-		self->ddict = ZSTD_createDDict(dictData, dictSize);
+		self->ddict = ZSTD_createDDict_byReference(dictData, dictSize);
 		Py_END_ALLOW_THREADS
 
 		if (!self->ddict) {
 			PyErr_SetString(ZstdError, "could not create decompression dict");
-			goto except;
+			return NULL;
 		}
 	}
 
@@ -335,7 +322,7 @@
 		if (0 == maxOutputSize) {
 			PyErr_SetString(ZstdError, "input data invalid or missing content size "
 				"in frame header");
-			goto except;
+			return NULL;
 		}
 		else {
 			result = PyBytes_FromStringAndSize(NULL, maxOutputSize);
@@ -348,45 +335,39 @@
 	}
 
 	if (!result) {
-		goto except;
+		return NULL;
 	}
 
 	Py_BEGIN_ALLOW_THREADS
 	if (self->ddict) {
-		zresult = ZSTD_decompress_usingDDict(dctx, PyBytes_AsString(result), destCapacity,
+		zresult = ZSTD_decompress_usingDDict(self->dctx,
+			PyBytes_AsString(result), destCapacity,
 			source, sourceSize, self->ddict);
 	}
 	else {
-		zresult = ZSTD_decompressDCtx(dctx, PyBytes_AsString(result), destCapacity, source, sourceSize);
+		zresult = ZSTD_decompressDCtx(self->dctx,
+			PyBytes_AsString(result), destCapacity, source, sourceSize);
 	}
 	Py_END_ALLOW_THREADS
 
 	if (ZSTD_isError(zresult)) {
 		PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult));
-		goto except;
+		Py_DecRef(result);
+		return NULL;
 	}
 	else if (decompressedSize && zresult != decompressedSize) {
 		PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu",
 			zresult, decompressedSize);
-		goto except;
+		Py_DecRef(result);
+		return NULL;
 	}
 	else if (zresult < destCapacity) {
 		if (_PyBytes_Resize(&result, zresult)) {
-			goto except;
+			Py_DecRef(result);
+			return NULL;
 		}
 	}
 
-	goto finally;
-
-except:
-	Py_DecRef(result);
-	result = NULL;
-
-finally:
-	if (dctx) {
-		PyMem_FREE(dctx);
-	}
-
 	return result;
 }
 
@@ -455,8 +436,8 @@
 	ZstdDecompressorIterator* result;
 	size_t skipBytes = 0;
 
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk", kwlist, &reader,
-		&inSize, &outSize, &skipBytes)) {
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_from", kwlist,
+		&reader, &inSize, &outSize, &skipBytes)) {
 		return NULL;
 	}
 
@@ -534,19 +515,14 @@
 	goto finally;
 
 except:
-	if (result->reader) {
-		Py_DECREF(result->reader);
-		result->reader = NULL;
-	}
+	Py_CLEAR(result->reader);
 
 	if (result->buffer) {
 		PyBuffer_Release(result->buffer);
-		Py_DECREF(result->buffer);
-		result->buffer = NULL;
+		Py_CLEAR(result->buffer);
 	}
 
-	Py_DECREF(result);
-	result = NULL;
+	Py_CLEAR(result);
 
 finally:
 
@@ -577,7 +553,8 @@
 	size_t outSize = ZSTD_DStreamOutSize();
 	ZstdDecompressionWriter* result;
 
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k", kwlist, &writer, &outSize)) {
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:write_to", kwlist,
+		&writer, &outSize)) {
 		return NULL;
 	}
 
@@ -605,6 +582,200 @@
 	return result;
 }
 
+PyDoc_STRVAR(Decompressor_decompress_content_dict_chain__doc__,
+"Decompress a series of chunks using the content dictionary chaining technique\n"
+);
+
+static PyObject* Decompressor_decompress_content_dict_chain(PyObject* self, PyObject* args, PyObject* kwargs) {
+	static char* kwlist[] = {
+		"frames",
+		NULL
+	};
+
+	PyObject* chunks;
+	Py_ssize_t chunksLen;
+	Py_ssize_t chunkIndex;
+	char parity = 0;
+	PyObject* chunk;
+	char* chunkData;
+	Py_ssize_t chunkSize;
+	ZSTD_DCtx* dctx = NULL;
+	size_t zresult;
+	ZSTD_frameParams frameParams;
+	void* buffer1 = NULL;
+	size_t buffer1Size = 0;
+	size_t buffer1ContentSize = 0;
+	void* buffer2 = NULL;
+	size_t buffer2Size = 0;
+	size_t buffer2ContentSize = 0;
+	void* destBuffer = NULL;
+	PyObject* result = NULL;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:decompress_content_dict_chain",
+		kwlist, &PyList_Type, &chunks)) {
+		return NULL;
+	}
+
+	chunksLen = PyList_Size(chunks);
+	if (!chunksLen) {
+		PyErr_SetString(PyExc_ValueError, "empty input chain");
+		return NULL;
+	}
+
+	/* The first chunk should not be using a dictionary. We handle it specially. */
+	chunk = PyList_GetItem(chunks, 0);
+	if (!PyBytes_Check(chunk)) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 must be bytes");
+		return NULL;
+	}
+
+	/* We require that all chunks be zstd frames and that they have content size set. */
+	PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
+	zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
+	if (ZSTD_isError(zresult)) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 is not a valid zstd frame");
+		return NULL;
+	}
+	else if (zresult) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 is too small to contain a zstd frame");
+		return NULL;
+	}
+
+	if (0 == frameParams.frameContentSize) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 missing content size in frame");
+		return NULL;
+	}
+
+	dctx = ZSTD_createDCtx();
+	if (!dctx) {
+		PyErr_NoMemory();
+		goto finally;
+	}
+
+	buffer1Size = frameParams.frameContentSize;
+	buffer1 = PyMem_Malloc(buffer1Size);
+	if (!buffer1) {
+		goto finally;
+	}
+
+	Py_BEGIN_ALLOW_THREADS
+	zresult = ZSTD_decompressDCtx(dctx, buffer1, buffer1Size, chunkData, chunkSize);
+	Py_END_ALLOW_THREADS
+	if (ZSTD_isError(zresult)) {
+		PyErr_Format(ZstdError, "could not decompress chunk 0: %s", ZSTD_getErrorName(zresult));
+		goto finally;
+	}
+
+	buffer1ContentSize = zresult;
+
+	/* Special case of a simple chain. */
+	if (1 == chunksLen) {
+		result = PyBytes_FromStringAndSize(buffer1, buffer1Size);
+		goto finally;
+	}
+
+	/* This should ideally look at next chunk. But this is slightly simpler. */
+	buffer2Size = frameParams.frameContentSize;
+	buffer2 = PyMem_Malloc(buffer2Size);
+	if (!buffer2) {
+		goto finally;
+	}
+
+	/* For each subsequent chunk, use the previous fulltext as a content dictionary.
+	   Our strategy is to have 2 buffers. One holds the previous fulltext (to be
+	   used as a content dictionary) and the other holds the new fulltext. The
+	   buffers grow when needed but never decrease in size. This limits the
+	   memory allocator overhead.
+	*/
+	for (chunkIndex = 1; chunkIndex < chunksLen; chunkIndex++) {
+		chunk = PyList_GetItem(chunks, chunkIndex);
+		if (!PyBytes_Check(chunk)) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd must be bytes", chunkIndex);
+			goto finally;
+		}
+
+		PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
+		zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
+		if (ZSTD_isError(zresult)) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd is not a valid zstd frame", chunkIndex);
+			goto finally;
+		}
+		else if (zresult) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd is too small to contain a zstd frame", chunkIndex);
+			goto finally;
+		}
+
+		if (0 == frameParams.frameContentSize) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd missing content size in frame", chunkIndex);
+			goto finally;
+		}
+
+		parity = chunkIndex % 2;
+
+		/* This could definitely be abstracted to reduce code duplication. */
+		if (parity) {
+			/* Resize destination buffer to hold larger content. */
+			if (buffer2Size < frameParams.frameContentSize) {
+				buffer2Size = frameParams.frameContentSize;
+				destBuffer = PyMem_Realloc(buffer2, buffer2Size);
+				if (!destBuffer) {
+					goto finally;
+				}
+				buffer2 = destBuffer;
+			}
+
+			Py_BEGIN_ALLOW_THREADS
+			zresult = ZSTD_decompress_usingDict(dctx, buffer2, buffer2Size,
+				chunkData, chunkSize, buffer1, buffer1ContentSize);
+			Py_END_ALLOW_THREADS
+			if (ZSTD_isError(zresult)) {
+				PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
					chunkIndex, ZSTD_getErrorName(zresult));
+				goto finally;
+			}
+			buffer2ContentSize = zresult;
+		}
+		else {
+			if (buffer1Size < frameParams.frameContentSize) {
+				buffer1Size = frameParams.frameContentSize;
+				destBuffer = PyMem_Realloc(buffer1, buffer1Size);
+				if (!destBuffer) {
+					goto finally;
+				}
+				buffer1 = destBuffer;
+			}
+
+			Py_BEGIN_ALLOW_THREADS
+			zresult = ZSTD_decompress_usingDict(dctx, buffer1, buffer1Size,
+				chunkData, chunkSize, buffer2, buffer2ContentSize);
+			Py_END_ALLOW_THREADS
+			if (ZSTD_isError(zresult)) {
+				PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
					chunkIndex, ZSTD_getErrorName(zresult));
+				goto finally;
+			}
+			buffer1ContentSize = zresult;
+		}
+	}
+
+	result = PyBytes_FromStringAndSize(parity ? buffer2 : buffer1,
+		parity ? buffer2ContentSize : buffer1ContentSize);
+
+finally:
+	if (buffer2) {
+		PyMem_Free(buffer2);
+	}
+	if (buffer1) {
+		PyMem_Free(buffer1);
+	}
+
+	if (dctx) {
+		ZSTD_freeDCtx(dctx);
+	}
+
+	return result;
+}
+
 static PyMethodDef Decompressor_methods[] = {
 	{ "copy_stream", (PyCFunction)Decompressor_copy_stream, METH_VARARGS | METH_KEYWORDS,
 	Decompressor_copy_stream__doc__ },
@@ -616,6 +787,8 @@
 	Decompressor_read_from__doc__ },
 	{ "write_to", (PyCFunction)Decompressor_write_to, METH_VARARGS | METH_KEYWORDS,
 	Decompressor_write_to__doc__ },
+	{ "decompress_content_dict_chain", (PyCFunction)Decompressor_decompress_content_dict_chain,
+	METH_VARARGS | METH_KEYWORDS, Decompressor_decompress_content_dict_chain__doc__ },
 	{ NULL, NULL }
 };
 
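Not part of the patch itself: a minimal Python sketch of how the new decompress_content_dict_chain() method added above might be driven. Only decompress_content_dict_chain() is defined by the C code in this change; the zstd module name, ZstdCompressor(write_content_size=..., dict_data=...), and ZstdCompressionDict are assumed from the same python-zstandard release, and the two sample revisions are invented for illustration.

import zstd  # python-zstandard, whose C extension is modified above (assumed import name)

v1 = b"first revision of a document stored as a delta chain"
v2 = b"second revision of a document stored as a delta chain, now edited"

# Frame 0 is compressed standalone; each later frame is compressed using the
# previous revision's fulltext as a (content) dictionary. write_content_size
# is needed because decompress_content_dict_chain() rejects frames whose
# headers do not record the decompressed size.
frames = [
    zstd.ZstdCompressor(write_content_size=True).compress(v1),
    zstd.ZstdCompressor(write_content_size=True,
                        dict_data=zstd.ZstdCompressionDict(v1)).compress(v2),
]

dctx = zstd.ZstdDecompressor()
# Walks the chain using the two-buffer strategy implemented in the C code and
# returns the fulltext of the final frame.
assert dctx.decompress_content_dict_chain(frames) == v2

The embedded-content-size requirement is what lets the C implementation size buffer1/buffer2 up front from frameParams.frameContentSize instead of streaming each frame.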