contrib/python-zstandard/c-ext/compressiondict.c
changeset 37495 b1fb341d8a61
parent 31796 e0dc40530c5a
child 42070 675775c33ab6
equal deleted inserted replaced
37494:1ce7a55b09d1 37495:b1fb341d8a61
     9 #include "python-zstandard.h"
     9 #include "python-zstandard.h"
    10 
    10 
    11 extern PyObject* ZstdError;
    11 extern PyObject* ZstdError;
    12 
    12 
    13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
    13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
    14 	static char* kwlist[] = {
       
    15 		"dict_size",
       
    16 		"samples",
       
    17 		"selectivity",
       
    18 		"level",
       
    19 		"notifications",
       
    20 		"dict_id",
       
    21 		NULL
       
    22 	};
       
    23 	size_t capacity;
       
    24 	PyObject* samples;
       
    25 	Py_ssize_t samplesLen;
       
    26 	unsigned  selectivity = 0;
       
    27 	int level = 0;
       
    28 	unsigned notifications = 0;
       
    29 	unsigned dictID = 0;
       
    30 	ZDICT_params_t zparams;
       
    31 	Py_ssize_t sampleIndex;
       
    32 	Py_ssize_t sampleSize;
       
    33 	PyObject* sampleItem;
       
    34 	size_t zresult;
       
    35 	void* sampleBuffer = NULL;
       
    36 	void* sampleOffset;
       
    37 	size_t samplesSize = 0;
       
    38 	size_t* sampleSizes = NULL;
       
    39 	void* dict = NULL;
       
    40 	ZstdCompressionDict* result = NULL;
       
    41 
       
    42 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary",
       
    43 		kwlist,
       
    44 		&capacity,
       
    45 		&PyList_Type, &samples,
       
    46 		&selectivity, &level, &notifications, &dictID)) {
       
    47 		return NULL;
       
    48 	}
       
    49 
       
    50 	memset(&zparams, 0, sizeof(zparams));
       
    51 
       
    52 	zparams.selectivityLevel = selectivity;
       
    53 	zparams.compressionLevel = level;
       
    54 	zparams.notificationLevel = notifications;
       
    55 	zparams.dictID = dictID;
       
    56 
       
    57 	/* Figure out the size of the raw samples */
       
    58 	samplesLen = PyList_Size(samples);
       
    59 	for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
       
    60 		sampleItem = PyList_GetItem(samples, sampleIndex);
       
    61 		if (!PyBytes_Check(sampleItem)) {
       
    62 			PyErr_SetString(PyExc_ValueError, "samples must be bytes");
       
    63 			return NULL;
       
    64 		}
       
    65 		samplesSize += PyBytes_GET_SIZE(sampleItem);
       
    66 	}
       
    67 
       
    68 	/* Now that we know the total size of the raw simples, we can allocate
       
    69 	a buffer for the raw data */
       
    70 	sampleBuffer = PyMem_Malloc(samplesSize);
       
    71 	if (!sampleBuffer) {
       
    72 		PyErr_NoMemory();
       
    73 		goto finally;
       
    74 	}
       
    75 	sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
       
    76 	if (!sampleSizes) {
       
    77 		PyErr_NoMemory();
       
    78 		goto finally;
       
    79 	}
       
    80 
       
    81 	sampleOffset = sampleBuffer;
       
    82 	/* Now iterate again and assemble the samples in the buffer */
       
    83 	for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
       
    84 		sampleItem = PyList_GetItem(samples, sampleIndex);
       
    85 		sampleSize = PyBytes_GET_SIZE(sampleItem);
       
    86 		sampleSizes[sampleIndex] = sampleSize;
       
    87 		memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
       
    88 		sampleOffset = (char*)sampleOffset + sampleSize;
       
    89 	}
       
    90 
       
    91 	dict = PyMem_Malloc(capacity);
       
    92 	if (!dict) {
       
    93 		PyErr_NoMemory();
       
    94 		goto finally;
       
    95 	}
       
    96 
       
    97 	/* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */
       
    98 	Py_BEGIN_ALLOW_THREADS
       
    99 	zresult = ZDICT_trainFromBuffer_advanced(dict, capacity,
       
   100 		sampleBuffer, sampleSizes, (unsigned int)samplesLen,
       
   101 		zparams);
       
   102 	Py_END_ALLOW_THREADS
       
   103 	if (ZDICT_isError(zresult)) {
       
   104 		PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
       
   105 		PyMem_Free(dict);
       
   106 		goto finally;
       
   107 	}
       
   108 
       
   109 	result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
       
   110 	if (!result) {
       
   111 		goto finally;
       
   112 	}
       
   113 
       
   114 	result->dictData = dict;
       
   115 	result->dictSize = zresult;
       
   116 	result->d = 0;
       
   117 	result->k = 0;
       
   118 
       
   119 finally:
       
   120 	PyMem_Free(sampleBuffer);
       
   121 	PyMem_Free(sampleSizes);
       
   122 
       
   123 	return result;
       
   124 }
       
   125 
       
   126 ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
       
   127 	static char* kwlist[] = {
    14 	static char* kwlist[] = {
   128 		"dict_size",
    15 		"dict_size",
   129 		"samples",
    16 		"samples",
   130 		"k",
    17 		"k",
   131 		"d",
    18 		"d",
   132 		"notifications",
    19 		"notifications",
   133 		"dict_id",
    20 		"dict_id",
   134 		"level",
    21 		"level",
   135 		"optimize",
       
   136 		"steps",
    22 		"steps",
   137 		"threads",
    23 		"threads",
   138 		NULL
    24 		NULL
   139 	};
    25 	};
   140 
    26 
   143 	unsigned k = 0;
    29 	unsigned k = 0;
   144 	unsigned d = 0;
    30 	unsigned d = 0;
   145 	unsigned notifications = 0;
    31 	unsigned notifications = 0;
   146 	unsigned dictID = 0;
    32 	unsigned dictID = 0;
   147 	int level = 0;
    33 	int level = 0;
   148 	PyObject* optimize = NULL;
       
   149 	unsigned steps = 0;
    34 	unsigned steps = 0;
   150 	int threads = 0;
    35 	int threads = 0;
   151 	COVER_params_t params;
    36 	ZDICT_cover_params_t params;
   152 	Py_ssize_t samplesLen;
    37 	Py_ssize_t samplesLen;
   153 	Py_ssize_t i;
    38 	Py_ssize_t i;
   154 	size_t samplesSize = 0;
    39 	size_t samplesSize = 0;
   155 	void* sampleBuffer = NULL;
    40 	void* sampleBuffer = NULL;
   156 	size_t* sampleSizes = NULL;
    41 	size_t* sampleSizes = NULL;
   158 	Py_ssize_t sampleSize;
    43 	Py_ssize_t sampleSize;
   159 	void* dict = NULL;
    44 	void* dict = NULL;
   160 	size_t zresult;
    45 	size_t zresult;
   161 	ZstdCompressionDict* result = NULL;
    46 	ZstdCompressionDict* result = NULL;
   162 
    47 
   163 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary",
    48 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary",
   164 		kwlist, &capacity, &PyList_Type, &samples,
    49 		kwlist, &capacity, &PyList_Type, &samples,
   165 		&k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) {
    50 		&k, &d, &notifications, &dictID, &level, &steps, &threads)) {
   166 		return NULL;
    51 		return NULL;
   167 	}
    52 	}
   168 
    53 
   169 	if (threads < 0) {
    54 	if (threads < 0) {
   170 		threads = cpu_count();
    55 		threads = cpu_count();
   173 	memset(&params, 0, sizeof(params));
    58 	memset(&params, 0, sizeof(params));
   174 	params.k = k;
    59 	params.k = k;
   175 	params.d = d;
    60 	params.d = d;
   176 	params.steps = steps;
    61 	params.steps = steps;
   177 	params.nbThreads = threads;
    62 	params.nbThreads = threads;
   178 	params.notificationLevel = notifications;
    63 	params.zParams.notificationLevel = notifications;
   179 	params.dictID = dictID;
    64 	params.zParams.dictID = dictID;
   180 	params.compressionLevel = level;
    65 	params.zParams.compressionLevel = level;
   181 
    66 
   182 	/* Figure out total size of input samples. */
    67 	/* Figure out total size of input samples. */
   183 	samplesLen = PyList_Size(samples);
    68 	samplesLen = PyList_Size(samples);
   184 	for (i = 0; i < samplesLen; i++) {
    69 	for (i = 0; i < samplesLen; i++) {
   185 		PyObject* sampleItem = PyList_GET_ITEM(samples, i);
    70 		PyObject* sampleItem = PyList_GET_ITEM(samples, i);
   217 		PyErr_NoMemory();
   102 		PyErr_NoMemory();
   218 		goto finally;
   103 		goto finally;
   219 	}
   104 	}
   220 
   105 
   221 	Py_BEGIN_ALLOW_THREADS
   106 	Py_BEGIN_ALLOW_THREADS
   222 	if (optimize && PyObject_IsTrue(optimize)) {
   107 	/* No parameters uses the default function, which will use default params
   223 		zresult = COVER_optimizeTrainFromBuffer(dict, capacity,
   108 	   and call ZDICT_optimizeTrainFromBuffer_cover under the hood. */
       
   109 	if (!params.k && !params.d && !params.zParams.compressionLevel
       
   110 		&& !params.zParams.notificationLevel && !params.zParams.dictID) {
       
   111 		zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer,
       
   112 			sampleSizes, (unsigned)samplesLen);
       
   113 	}
       
   114 	/* Use optimize mode if user controlled steps or threads explicitly. */
       
   115 	else if (params.steps || params.nbThreads) {
       
   116 		zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity,
   224 			sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
   117 			sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
   225 	}
   118 	}
       
   119 	/* Non-optimize mode with explicit control. */
   226 	else {
   120 	else {
   227 		zresult = COVER_trainFromBuffer(dict, capacity,
   121 		zresult = ZDICT_trainFromBuffer_cover(dict, capacity,
   228 			sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
   122 			sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
   229 	}
   123 	}
   230 	Py_END_ALLOW_THREADS
   124 	Py_END_ALLOW_THREADS
   231 
   125 
   232 	if (ZDICT_isError(zresult)) {
   126 	if (ZDICT_isError(zresult)) {
   241 		goto finally;
   135 		goto finally;
   242 	}
   136 	}
   243 
   137 
   244 	result->dictData = dict;
   138 	result->dictData = dict;
   245 	result->dictSize = zresult;
   139 	result->dictSize = zresult;
       
   140 	result->dictType = ZSTD_dct_fullDict;
   246 	result->d = params.d;
   141 	result->d = params.d;
   247 	result->k = params.k;
   142 	result->k = params.k;
       
   143 	result->cdict = NULL;
       
   144 	result->ddict = NULL;
   248 
   145 
   249 finally:
   146 finally:
   250 	PyMem_Free(sampleBuffer);
   147 	PyMem_Free(sampleBuffer);
   251 	PyMem_Free(sampleSizes);
   148 	PyMem_Free(sampleSizes);
   252 
   149 
   253 	return result;
   150 	return result;
       
   151 }
       
   152 
       
   153 int ensure_ddict(ZstdCompressionDict* dict) {
       
   154 	if (dict->ddict) {
       
   155 		return 0;
       
   156 	}
       
   157 
       
   158 	Py_BEGIN_ALLOW_THREADS
       
   159 	dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize,
       
   160 		ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem);
       
   161 	Py_END_ALLOW_THREADS
       
   162 	if (!dict->ddict) {
       
   163 		PyErr_SetString(ZstdError, "could not create decompression dict");
       
   164 		return 1;
       
   165 	}
       
   166 
       
   167 	return 0;
   254 }
   168 }
   255 
   169 
   256 PyDoc_STRVAR(ZstdCompressionDict__doc__,
   170 PyDoc_STRVAR(ZstdCompressionDict__doc__,
   257 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
   171 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
   258 "\n"
   172 "\n"
   259 "This type holds the results of a computed Zstandard compression dictionary.\n"
   173 "This type holds the results of a computed Zstandard compression dictionary.\n"
   260 "Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n"
   174 "Instances are obtained by calling ``train_dictionary()`` or by passing\n"
   261 "obtained from another source into the constructor.\n"
   175 "bytes obtained from another source into the constructor.\n"
   262 );
   176 );
   263 
   177 
   264 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) {
   178 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) {
   265 	const char* source;
   179 	static char* kwlist[] = {
   266 	Py_ssize_t sourceSize;
   180 		"data",
       
   181 		"dict_type",
       
   182 		NULL
       
   183 	};
       
   184 
       
   185 	int result = -1;
       
   186 	Py_buffer source;
       
   187 	unsigned dictType = ZSTD_dct_auto;
   267 
   188 
   268 	self->dictData = NULL;
   189 	self->dictData = NULL;
   269 	self->dictSize = 0;
   190 	self->dictSize = 0;
       
   191 	self->cdict = NULL;
       
   192 	self->ddict = NULL;
   270 
   193 
   271 #if PY_MAJOR_VERSION >= 3
   194 #if PY_MAJOR_VERSION >= 3
   272 	if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict",
   195 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict",
   273 #else
   196 #else
   274 	if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict",
   197 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict",
   275 #endif
   198 #endif
   276 		&source, &sourceSize)) {
   199 		kwlist, &source, &dictType)) {
   277 		return -1;
   200 		return -1;
   278 	}
   201 	}
   279 
   202 
   280 	self->dictData = PyMem_Malloc(sourceSize);
   203 	if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
       
   204 		PyErr_SetString(PyExc_ValueError,
       
   205 			"data buffer should be contiguous and have at most one dimension");
       
   206 		goto finally;
       
   207 	}
       
   208 
       
   209 	if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent
       
   210 		&& dictType != ZSTD_dct_fullDict) {
       
   211 		PyErr_Format(PyExc_ValueError,
       
   212 			"invalid dictionary load mode: %d; must use DICT_TYPE_* constants",
       
   213 			dictType);
       
   214 		goto finally;
       
   215 	}
       
   216 
       
   217 	self->dictType = dictType;
       
   218 
       
   219 	self->dictData = PyMem_Malloc(source.len);
   281 	if (!self->dictData) {
   220 	if (!self->dictData) {
   282 		PyErr_NoMemory();
   221 		PyErr_NoMemory();
   283 		return -1;
   222 		goto finally;
   284 	}
   223 	}
   285 
   224 
   286 	memcpy(self->dictData, source, sourceSize);
   225 	memcpy(self->dictData, source.buf, source.len);
   287 	self->dictSize = sourceSize;
   226 	self->dictSize = source.len;
   288 
   227 
   289 	return 0;
   228 	result = 0;
   290 	}
   229 
       
   230 finally:
       
   231 	PyBuffer_Release(&source);
       
   232 	return result;
       
   233 }
   291 
   234 
   292 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
   235 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
       
   236 	if (self->cdict) {
       
   237 		ZSTD_freeCDict(self->cdict);
       
   238 		self->cdict = NULL;
       
   239 	}
       
   240 
       
   241 	if (self->ddict) {
       
   242 		ZSTD_freeDDict(self->ddict);
       
   243 		self->ddict = NULL;
       
   244 	}
       
   245 
   293 	if (self->dictData) {
   246 	if (self->dictData) {
   294 		PyMem_Free(self->dictData);
   247 		PyMem_Free(self->dictData);
   295 		self->dictData = NULL;
   248 		self->dictData = NULL;
   296 	}
   249 	}
   297 
   250 
   298 	PyObject_Del(self);
   251 	PyObject_Del(self);
       
   252 }
       
   253 
       
   254 PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__,
       
   255 "Precompute a dictionary so it can be used by multiple compressors.\n"
       
   256 );
       
   257 
       
   258 static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) {
       
   259 	static char* kwlist[] = {
       
   260 		"level",
       
   261 		"compression_params",
       
   262 		NULL
       
   263 	};
       
   264 
       
   265 	int level = 0;
       
   266 	ZstdCompressionParametersObject* compressionParams = NULL;
       
   267 	ZSTD_compressionParameters cParams;
       
   268 	size_t zresult;
       
   269 
       
   270 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist,
       
   271 		&level, &ZstdCompressionParametersType, &compressionParams)) {
       
   272 		return NULL;
       
   273 	}
       
   274 
       
   275 	if (level && compressionParams) {
       
   276 		PyErr_SetString(PyExc_ValueError,
       
   277 			"must only specify one of level or compression_params");
       
   278 		return NULL;
       
   279 	}
       
   280 
       
   281 	if (!level && !compressionParams) {
       
   282 		PyErr_SetString(PyExc_ValueError,
       
   283 			"must specify one of level or compression_params");
       
   284 		return NULL;
       
   285 	}
       
   286 
       
   287 	if (self->cdict) {
       
   288 		zresult = ZSTD_freeCDict(self->cdict);
       
   289 		self->cdict = NULL;
       
   290 		if (ZSTD_isError(zresult)) {
       
   291 			PyErr_Format(ZstdError, "unable to free CDict: %s",
       
   292 				ZSTD_getErrorName(zresult));
       
   293 			return NULL;
       
   294 		}
       
   295 	}
       
   296 
       
   297 	if (level) {
       
   298 		cParams = ZSTD_getCParams(level, 0, self->dictSize);
       
   299 	}
       
   300 	else {
       
   301 		cParams.chainLog = compressionParams->chainLog;
       
   302 		cParams.hashLog = compressionParams->hashLog;
       
   303 		cParams.searchLength = compressionParams->minMatch;
       
   304 		cParams.searchLog = compressionParams->searchLog;
       
   305 		cParams.strategy = compressionParams->compressionStrategy;
       
   306 		cParams.targetLength = compressionParams->targetLength;
       
   307 		cParams.windowLog = compressionParams->windowLog;
       
   308 	}
       
   309 
       
   310 	assert(!self->cdict);
       
   311 	self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize,
       
   312 		ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem);
       
   313 
       
   314 	if (!self->cdict) {
       
   315 		PyErr_SetString(ZstdError, "unable to precompute dictionary");
       
   316 		return NULL;
       
   317 	}
       
   318 
       
   319 	Py_RETURN_NONE;
   299 }
   320 }
   300 
   321 
   301 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
   322 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
   302 	unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);
   323 	unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);
   303 
   324 
   311 static PyMethodDef ZstdCompressionDict_methods[] = {
   332 static PyMethodDef ZstdCompressionDict_methods[] = {
   312 	{ "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS,
   333 	{ "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS,
   313 	PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
   334 	PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
   314 	{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
   335 	{ "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
   315 	PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
   336 	PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
       
   337 	{ "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress,
       
   338 	METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ },
   316 	{ NULL, NULL }
   339 	{ NULL, NULL }
   317 };
   340 };
   318 
   341 
   319 static PyMemberDef ZstdCompressionDict_members[] = {
   342 static PyMemberDef ZstdCompressionDict_members[] = {
   320 	{ "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY,
   343 	{ "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY,