--- a/contrib/python-zstandard/c-ext/compressiondict.c Sun Apr 08 01:08:43 2018 +0200
+++ b/contrib/python-zstandard/c-ext/compressiondict.c Mon Apr 09 10:13:29 2018 -0700
@@ -14,125 +14,11 @@
static char* kwlist[] = {
"dict_size",
"samples",
- "selectivity",
- "level",
- "notifications",
- "dict_id",
- NULL
- };
- size_t capacity;
- PyObject* samples;
- Py_ssize_t samplesLen;
- unsigned selectivity = 0;
- int level = 0;
- unsigned notifications = 0;
- unsigned dictID = 0;
- ZDICT_params_t zparams;
- Py_ssize_t sampleIndex;
- Py_ssize_t sampleSize;
- PyObject* sampleItem;
- size_t zresult;
- void* sampleBuffer = NULL;
- void* sampleOffset;
- size_t samplesSize = 0;
- size_t* sampleSizes = NULL;
- void* dict = NULL;
- ZstdCompressionDict* result = NULL;
-
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary",
- kwlist,
- &capacity,
- &PyList_Type, &samples,
- &selectivity, &level, &notifications, &dictID)) {
- return NULL;
- }
-
- memset(&zparams, 0, sizeof(zparams));
-
- zparams.selectivityLevel = selectivity;
- zparams.compressionLevel = level;
- zparams.notificationLevel = notifications;
- zparams.dictID = dictID;
-
- /* Figure out the size of the raw samples */
- samplesLen = PyList_Size(samples);
- for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
- sampleItem = PyList_GetItem(samples, sampleIndex);
- if (!PyBytes_Check(sampleItem)) {
- PyErr_SetString(PyExc_ValueError, "samples must be bytes");
- return NULL;
- }
- samplesSize += PyBytes_GET_SIZE(sampleItem);
- }
-
- /* Now that we know the total size of the raw simples, we can allocate
- a buffer for the raw data */
- sampleBuffer = PyMem_Malloc(samplesSize);
- if (!sampleBuffer) {
- PyErr_NoMemory();
- goto finally;
- }
- sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
- if (!sampleSizes) {
- PyErr_NoMemory();
- goto finally;
- }
-
- sampleOffset = sampleBuffer;
- /* Now iterate again and assemble the samples in the buffer */
- for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
- sampleItem = PyList_GetItem(samples, sampleIndex);
- sampleSize = PyBytes_GET_SIZE(sampleItem);
- sampleSizes[sampleIndex] = sampleSize;
- memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
- sampleOffset = (char*)sampleOffset + sampleSize;
- }
-
- dict = PyMem_Malloc(capacity);
- if (!dict) {
- PyErr_NoMemory();
- goto finally;
- }
-
- /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */
- Py_BEGIN_ALLOW_THREADS
- zresult = ZDICT_trainFromBuffer_advanced(dict, capacity,
- sampleBuffer, sampleSizes, (unsigned int)samplesLen,
- zparams);
- Py_END_ALLOW_THREADS
- if (ZDICT_isError(zresult)) {
- PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
- PyMem_Free(dict);
- goto finally;
- }
-
- result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
- if (!result) {
- goto finally;
- }
-
- result->dictData = dict;
- result->dictSize = zresult;
- result->d = 0;
- result->k = 0;
-
-finally:
- PyMem_Free(sampleBuffer);
- PyMem_Free(sampleSizes);
-
- return result;
-}
-
-ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
- static char* kwlist[] = {
- "dict_size",
- "samples",
"k",
"d",
"notifications",
"dict_id",
"level",
- "optimize",
"steps",
"threads",
NULL
@@ -145,10 +31,9 @@
unsigned notifications = 0;
unsigned dictID = 0;
int level = 0;
- PyObject* optimize = NULL;
unsigned steps = 0;
int threads = 0;
- COVER_params_t params;
+ ZDICT_cover_params_t params;
Py_ssize_t samplesLen;
Py_ssize_t i;
size_t samplesSize = 0;
@@ -160,9 +45,9 @@
size_t zresult;
ZstdCompressionDict* result = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary",
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary",
kwlist, &capacity, &PyList_Type, &samples,
- &k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) {
+ &k, &d, &notifications, &dictID, &level, &steps, &threads)) {
return NULL;
}
@@ -175,9 +60,9 @@
params.d = d;
params.steps = steps;
params.nbThreads = threads;
- params.notificationLevel = notifications;
- params.dictID = dictID;
- params.compressionLevel = level;
+ params.zParams.notificationLevel = notifications;
+ params.zParams.dictID = dictID;
+ params.zParams.compressionLevel = level;
/* Figure out total size of input samples. */
samplesLen = PyList_Size(samples);
@@ -219,12 +104,21 @@
}
Py_BEGIN_ALLOW_THREADS
- if (optimize && PyObject_IsTrue(optimize)) {
- zresult = COVER_optimizeTrainFromBuffer(dict, capacity,
+ /* No parameters uses the default function, which will use default params
+ and call ZDICT_optimizeTrainFromBuffer_cover under the hood. */
+ if (!params.k && !params.d && !params.zParams.compressionLevel
+ && !params.zParams.notificationLevel && !params.zParams.dictID) {
+ zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer,
+ sampleSizes, (unsigned)samplesLen);
+ }
+ /* Use optimize mode if user controlled steps or threads explicitly. */
+ else if (params.steps || params.nbThreads) {
+ zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity,
sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
}
+ /* Non-optimize mode with explicit control. */
else {
- zresult = COVER_trainFromBuffer(dict, capacity,
+ zresult = ZDICT_trainFromBuffer_cover(dict, capacity,
sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
}
Py_END_ALLOW_THREADS
@@ -243,8 +137,11 @@
result->dictData = dict;
result->dictSize = zresult;
+ result->dictType = ZSTD_dct_fullDict;
result->d = params.d;
result->k = params.k;
+ result->cdict = NULL;
+ result->ddict = NULL;
finally:
PyMem_Free(sampleBuffer);
@@ -253,43 +150,99 @@
return result;
}
+int ensure_ddict(ZstdCompressionDict* dict) {
+ if (dict->ddict) {
+ return 0;
+ }
+
+ Py_BEGIN_ALLOW_THREADS
+ dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize,
+ ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem);
+ Py_END_ALLOW_THREADS
+ if (!dict->ddict) {
+ PyErr_SetString(ZstdError, "could not create decompression dict");
+ return 1;
+ }
+
+ return 0;
+}
+
PyDoc_STRVAR(ZstdCompressionDict__doc__,
"ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
"\n"
"This type holds the results of a computed Zstandard compression dictionary.\n"
-"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n"
-"obtained from another source into the constructor.\n"
+"Instances are obtained by calling ``train_dictionary()`` or by passing\n"
+"bytes obtained from another source into the constructor.\n"
);
-static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) {
- const char* source;
- Py_ssize_t sourceSize;
+static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) {
+ static char* kwlist[] = {
+ "data",
+ "dict_type",
+ NULL
+ };
+
+ int result = -1;
+ Py_buffer source;
+ unsigned dictType = ZSTD_dct_auto;
self->dictData = NULL;
self->dictSize = 0;
+ self->cdict = NULL;
+ self->ddict = NULL;
#if PY_MAJOR_VERSION >= 3
- if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict",
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict",
#else
- if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict",
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict",
#endif
- &source, &sourceSize)) {
+ kwlist, &source, &dictType)) {
return -1;
}
- self->dictData = PyMem_Malloc(sourceSize);
+ if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "data buffer should be contiguous and have at most one dimension");
+ goto finally;
+ }
+
+ if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent
+ && dictType != ZSTD_dct_fullDict) {
+ PyErr_Format(PyExc_ValueError,
+ "invalid dictionary load mode: %d; must use DICT_TYPE_* constants",
+ dictType);
+ goto finally;
+ }
+
+ self->dictType = dictType;
+
+ self->dictData = PyMem_Malloc(source.len);
if (!self->dictData) {
PyErr_NoMemory();
- return -1;
+ goto finally;
}
- memcpy(self->dictData, source, sourceSize);
- self->dictSize = sourceSize;
+ memcpy(self->dictData, source.buf, source.len);
+ self->dictSize = source.len;
+
+ result = 0;
- return 0;
+finally:
+ PyBuffer_Release(&source);
+ return result;
+}
+
+static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
+ if (self->cdict) {
+ ZSTD_freeCDict(self->cdict);
+ self->cdict = NULL;
}
-static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
+ if (self->ddict) {
+ ZSTD_freeDDict(self->ddict);
+ self->ddict = NULL;
+ }
+
if (self->dictData) {
PyMem_Free(self->dictData);
self->dictData = NULL;
@@ -298,6 +251,74 @@
PyObject_Del(self);
}
+PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__,
+"Precompute a dictionary so it can be used by multiple compressors.\n"
+);
+
+static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) {
+ static char* kwlist[] = {
+ "level",
+ "compression_params",
+ NULL
+ };
+
+ int level = 0;
+ ZstdCompressionParametersObject* compressionParams = NULL;
+ ZSTD_compressionParameters cParams;
+ size_t zresult;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist,
+ &level, &ZstdCompressionParametersType, &compressionParams)) {
+ return NULL;
+ }
+
+ if (level && compressionParams) {
+ PyErr_SetString(PyExc_ValueError,
+ "must only specify one of level or compression_params");
+ return NULL;
+ }
+
+ if (!level && !compressionParams) {
+ PyErr_SetString(PyExc_ValueError,
+ "must specify one of level or compression_params");
+ return NULL;
+ }
+
+ if (self->cdict) {
+ zresult = ZSTD_freeCDict(self->cdict);
+ self->cdict = NULL;
+ if (ZSTD_isError(zresult)) {
+ PyErr_Format(ZstdError, "unable to free CDict: %s",
+ ZSTD_getErrorName(zresult));
+ return NULL;
+ }
+ }
+
+ if (level) {
+ cParams = ZSTD_getCParams(level, 0, self->dictSize);
+ }
+ else {
+ cParams.chainLog = compressionParams->chainLog;
+ cParams.hashLog = compressionParams->hashLog;
+ cParams.searchLength = compressionParams->minMatch;
+ cParams.searchLog = compressionParams->searchLog;
+ cParams.strategy = compressionParams->compressionStrategy;
+ cParams.targetLength = compressionParams->targetLength;
+ cParams.windowLog = compressionParams->windowLog;
+ }
+
+ assert(!self->cdict);
+ self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize,
+ ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem);
+
+ if (!self->cdict) {
+ PyErr_SetString(ZstdError, "unable to precompute dictionary");
+ return NULL;
+ }
+
+ Py_RETURN_NONE;
+}
+
static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);
@@ -313,6 +334,8 @@
PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
+ { "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress,
+ METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ },
{ NULL, NULL }
};