--- a/contrib/python-zstandard/c-ext/compressiondict.c	Sun Apr 08 01:08:43 2018 +0200
+++ b/contrib/python-zstandard/c-ext/compressiondict.c	Mon Apr 09 10:13:29 2018 -0700
@@ -14,125 +14,11 @@
 	static char* kwlist[] = {
 		"dict_size",
 		"samples",
-		"selectivity",
-		"level",
-		"notifications",
-		"dict_id",
-		NULL
-	};
-	size_t capacity;
-	PyObject* samples;
-	Py_ssize_t samplesLen;
-	unsigned  selectivity = 0;
-	int level = 0;
-	unsigned notifications = 0;
-	unsigned dictID = 0;
-	ZDICT_params_t zparams;
-	Py_ssize_t sampleIndex;
-	Py_ssize_t sampleSize;
-	PyObject* sampleItem;
-	size_t zresult;
-	void* sampleBuffer = NULL;
-	void* sampleOffset;
-	size_t samplesSize = 0;
-	size_t* sampleSizes = NULL;
-	void* dict = NULL;
-	ZstdCompressionDict* result = NULL;
-
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary",
-		kwlist,
-		&capacity,
-		&PyList_Type, &samples,
-		&selectivity, &level, &notifications, &dictID)) {
-		return NULL;
-	}
-
-	memset(&zparams, 0, sizeof(zparams));
-
-	zparams.selectivityLevel = selectivity;
-	zparams.compressionLevel = level;
-	zparams.notificationLevel = notifications;
-	zparams.dictID = dictID;
-
-	/* Figure out the size of the raw samples */
-	samplesLen = PyList_Size(samples);
-	for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
-		sampleItem = PyList_GetItem(samples, sampleIndex);
-		if (!PyBytes_Check(sampleItem)) {
-			PyErr_SetString(PyExc_ValueError, "samples must be bytes");
-			return NULL;
-		}
-		samplesSize += PyBytes_GET_SIZE(sampleItem);
-	}
-
-	/* Now that we know the total size of the raw simples, we can allocate
-	a buffer for the raw data */
-	sampleBuffer = PyMem_Malloc(samplesSize);
-	if (!sampleBuffer) {
-		PyErr_NoMemory();
-		goto finally;
-	}
-	sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
-	if (!sampleSizes) {
-		PyErr_NoMemory();
-		goto finally;
-	}
-
-	sampleOffset = sampleBuffer;
-	/* Now iterate again and assemble the samples in the buffer */
-	for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
-		sampleItem = PyList_GetItem(samples, sampleIndex);
-		sampleSize = PyBytes_GET_SIZE(sampleItem);
-		sampleSizes[sampleIndex] = sampleSize;
-		memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
-		sampleOffset = (char*)sampleOffset + sampleSize;
-	}
-
-	dict = PyMem_Malloc(capacity);
-	if (!dict) {
-		PyErr_NoMemory();
-		goto finally;
-	}
-
-	/* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */
-	Py_BEGIN_ALLOW_THREADS
-	zresult = ZDICT_trainFromBuffer_advanced(dict, capacity,
-		sampleBuffer, sampleSizes, (unsigned int)samplesLen,
-		zparams);
-	Py_END_ALLOW_THREADS
-	if (ZDICT_isError(zresult)) {
-		PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
-		PyMem_Free(dict);
-		goto finally;
-	}
-
-	result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
-	if (!result) {
-		goto finally;
-	}
-
-	result->dictData = dict;
-	result->dictSize = zresult;
-	result->d = 0;
-	result->k = 0;
-
-finally:
-	PyMem_Free(sampleBuffer);
-	PyMem_Free(sampleSizes);
-
-	return result;
-}
-
-ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
-	static char* kwlist[] = {
-		"dict_size",
-		"samples",
 		"k",
 		"d",
 		"notifications",
 		"dict_id",
 		"level",
-		"optimize",
 		"steps",
 		"threads",
 		NULL
@@ -145,10 +31,9 @@
 	unsigned notifications = 0;
 	unsigned dictID = 0;
 	int level = 0;
-	PyObject* optimize = NULL;
 	unsigned steps = 0;
 	int threads = 0;
-	COVER_params_t params;
+	ZDICT_cover_params_t params;
 	Py_ssize_t samplesLen;
 	Py_ssize_t i;
 	size_t samplesSize = 0;
@@ -160,9 +45,9 @@
 	size_t zresult;
 	ZstdCompressionDict* result = NULL;
 
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary",
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary",
 		kwlist, &capacity, &PyList_Type, &samples,
-		&k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) {
+		&k, &d, &notifications, &dictID, &level, &steps, &threads)) {
 		return NULL;
 	}
 
@@ -175,9 +60,9 @@
 	params.d = d;
 	params.steps = steps;
 	params.nbThreads = threads;
-	params.notificationLevel = notifications;
-	params.dictID = dictID;
-	params.compressionLevel = level;
+	params.zParams.notificationLevel = notifications;
+	params.zParams.dictID = dictID;
+	params.zParams.compressionLevel = level;
 
 	/* Figure out total size of input samples. */
 	samplesLen = PyList_Size(samples);
@@ -219,12 +104,21 @@
 	}
 
 	Py_BEGIN_ALLOW_THREADS
-	if (optimize && PyObject_IsTrue(optimize)) {
-		zresult = COVER_optimizeTrainFromBuffer(dict, capacity,
+	/* If k, d, level, notifications and dict_id are all unset, use the default function,
+	   which applies default params and calls ZDICT_optimizeTrainFromBuffer_cover under the hood. */
+	if (!params.k && !params.d && !params.zParams.compressionLevel
+		&& !params.zParams.notificationLevel && !params.zParams.dictID) {
+		zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer,
+			sampleSizes, (unsigned)samplesLen);
+	}
+	/* Use optimize mode if user controlled steps or threads explicitly. */
+	else if (params.steps || params.nbThreads) {
+		zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity,
 			sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
 	}
+	/* Non-optimize mode with explicit control. */
 	else {
-		zresult = COVER_trainFromBuffer(dict, capacity,
+		zresult = ZDICT_trainFromBuffer_cover(dict, capacity,
 			sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
 	}
 	Py_END_ALLOW_THREADS
@@ -243,8 +137,11 @@
 
 	result->dictData = dict;
 	result->dictSize = zresult;
+	result->dictType = ZSTD_dct_fullDict;
 	result->d = params.d;
 	result->k = params.k;
+	result->cdict = NULL;
+	result->ddict = NULL;
 
 finally:
 	PyMem_Free(sampleBuffer);
@@ -253,43 +150,99 @@
 	return result;
 }
 
+int ensure_ddict(ZstdCompressionDict* dict) { /* Lazily create dict->ddict. Returns 0 if it exists or was created, 1 with ZstdError set on failure. */
+	if (dict->ddict) { /* already created by an earlier call; reuse it */
+		return 0;
+	}
+
+	Py_BEGIN_ALLOW_THREADS /* the GIL is released while zstd builds the DDict */
+	dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize,
+		ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem); /* NOTE(review): byRef presumably means dictData is referenced rather than copied, so it must outlive the DDict -- confirm against zstd docs */
+	Py_END_ALLOW_THREADS
+	if (!dict->ddict) { /* a NULL result is surfaced as a Python ZstdError */
+		PyErr_SetString(ZstdError, "could not create decompression dict");
+		return 1;
+	}
+
+	return 0;
+}
+
 PyDoc_STRVAR(ZstdCompressionDict__doc__,
 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
 "\n"
 "This type holds the results of a computed Zstandard compression dictionary.\n"
-"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n"
-"obtained from another source into the constructor.\n"
+"Instances are obtained by calling ``train_dictionary()`` or by passing\n"
+"bytes obtained from another source into the constructor.\n"
 );
 
-static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) {
-	const char* source;
-	Py_ssize_t sourceSize;
+static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { /* Initializer for ZstdCompressionDict(data, dict_type=...): copies data into self->dictData. Returns 0 on success, -1 with a Python exception set on failure. */
+	static char* kwlist[] = {
+		"data",
+		"dict_type",
+		NULL
+	};
+
+	int result = -1; /* pessimistic default; becomes 0 only after the copy succeeds */
+	Py_buffer source; /* filled by the "*" format unit below; released at finally */
+	unsigned dictType = ZSTD_dct_auto; /* value used when dict_type is omitted */
 
 	self->dictData = NULL;
 	self->dictSize = 0;
+	self->cdict = NULL; /* no precomputed compression dict yet */
+	self->ddict = NULL; /* no decompression dict yet */
 
 #if PY_MAJOR_VERSION >= 3
-	if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict",
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict",
 #else
-	if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict",
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict",
 #endif
-		&source, &sourceSize)) {
+		kwlist, &source, &dictType)) { /* parse failed: bail out directly, without going through finally */
 		return -1;
 	}
 
-	self->dictData = PyMem_Malloc(sourceSize);
+	if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { /* only flat, C-contiguous buffers are accepted */
+		PyErr_SetString(PyExc_ValueError,
+			"data buffer should be contiguous and have at most one dimension");
+		goto finally;
+	}
+
+	if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent
+		&& dictType != ZSTD_dct_fullDict) { /* dict_type must be one of the three known load modes */
+		PyErr_Format(PyExc_ValueError,
+			"invalid dictionary load mode: %u; must use DICT_TYPE_* constants",
+			dictType); /* %u matches the unsigned type of dictType, so large values are not shown as negative */
+		goto finally;
+	}
+
+	self->dictType = dictType;
+
+	self->dictData = PyMem_Malloc(source.len); /* private copy; the caller's buffer is released at finally */
 	if (!self->dictData) {
 		PyErr_NoMemory();
-		return -1;
+		goto finally;
 	}
 
-	memcpy(self->dictData, source, sourceSize);
-	self->dictSize = sourceSize;
+	memcpy(self->dictData, source.buf, source.len);
+	self->dictSize = source.len;
+
+	result = 0;
 
-	return 0;
+finally: /* single exit once source is held, so the buffer is released on every path */
+	PyBuffer_Release(&source);
+	return result;
+}
+
+static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
+	if (self->cdict) {
+		ZSTD_freeCDict(self->cdict);
+		self->cdict = NULL;
 	}
 
-static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
+	if (self->ddict) {
+		ZSTD_freeDDict(self->ddict);
+		self->ddict = NULL;
+	}
+
 	if (self->dictData) {
 		PyMem_Free(self->dictData);
 		self->dictData = NULL;
@@ -298,6 +251,74 @@
 	PyObject_Del(self);
 }
 
+PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__,
+"Precompute a dictionary so it can be used by multiple compressors.\n"
+);
+
+static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { /* Build self->cdict from either level or compression_params, replacing any existing one. Returns None, or NULL with an exception set. */
+	static char* kwlist[] = {
+		"level",
+		"compression_params",
+		NULL
+	};
+
+	int level = 0; /* 0 is treated as "not specified" by the checks below */
+	ZstdCompressionParametersObject* compressionParams = NULL;
+	ZSTD_compressionParameters cParams;
+	size_t zresult;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist,
+		&level, &ZstdCompressionParametersType, &compressionParams)) { /* "O!" type-checks compression_params against ZstdCompressionParametersType */
+		return NULL;
+	}
+
+	if (level && compressionParams) { /* the two sources of parameters are mutually exclusive */
+		PyErr_SetString(PyExc_ValueError,
+			"must only specify one of level or compression_params");
+		return NULL;
+	}
+
+	if (!level && !compressionParams) { /* neither given (or level == 0): nothing to derive parameters from */
+		PyErr_SetString(PyExc_ValueError,
+			"must specify one of level or compression_params");
+		return NULL;
+	}
+
+	if (self->cdict) { /* drop a previously precomputed dictionary before building a new one */
+		zresult = ZSTD_freeCDict(self->cdict);
+		self->cdict = NULL; /* cleared even if the free reported an error */
+		if (ZSTD_isError(zresult)) {
+			PyErr_Format(ZstdError, "unable to free CDict: %s",
+				ZSTD_getErrorName(zresult));
+			return NULL;
+		}
+	}
+
+	if (level) {
+		cParams = ZSTD_getCParams(level, 0, self->dictSize); /* NOTE(review): the 0 is presumably "unknown source size" -- confirm against zstd docs */
+	}
+	else { /* copy each field from the Python-level parameters object */
+		cParams.chainLog = compressionParams->chainLog;
+		cParams.hashLog = compressionParams->hashLog;
+		cParams.searchLength = compressionParams->minMatch; /* field names differ: minMatch feeds searchLength */
+		cParams.searchLog = compressionParams->searchLog;
+		cParams.strategy = compressionParams->compressionStrategy;
+		cParams.targetLength = compressionParams->targetLength;
+		cParams.windowLog = compressionParams->windowLog;
+	}
+
+	assert(!self->cdict);
+	self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize,
+		ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem); /* NOTE(review): byRef presumably means dictData is referenced, not copied -- confirm against zstd docs */
+
+	if (!self->cdict) { /* a NULL result is surfaced as a Python ZstdError */
+		PyErr_SetString(ZstdError, "unable to precompute dictionary");
+		return NULL;
+	}
+
+	Py_RETURN_NONE;
+}
+
 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
 	unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);
 
@@ -313,6 +334,8 @@
 	PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
 	{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
 	PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
+	{ "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress,
+	METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ },
 	{ NULL, NULL }
 };
changeset 37495	b1fb341d8a61
parent 31796	e0dc40530c5a
child 42070	675775c33ab6