contrib/python-zstandard/c-ext/compressor.c
changeset 30822 b54a2984cdd4
parent 30435 b86a448a2965
child 30830 08fa3a76a080
equal deleted inserted replaced
30821:7005c03f7387 30822:b54a2984cdd4
     7 */
     7 */
     8 
     8 
     9 #include "python-zstandard.h"
     9 #include "python-zstandard.h"
    10 
    10 
    11 extern PyObject* ZstdError;
    11 extern PyObject* ZstdError;
       
    12 
       
    13 int populate_cdict(ZstdCompressor* compressor, void* dictData, size_t dictSize, ZSTD_parameters* zparams) {
       
    14 	ZSTD_customMem zmem;
       
    15 	assert(!compressor->cdict);
       
    16 	Py_BEGIN_ALLOW_THREADS
       
    17 	memset(&zmem, 0, sizeof(zmem));
       
    18 	compressor->cdict = ZSTD_createCDict_advanced(compressor->dict->dictData,
       
    19 		compressor->dict->dictSize, *zparams, zmem);
       
    20 	Py_END_ALLOW_THREADS
       
    21 
       
    22 	if (!compressor->cdict) {
       
    23 		PyErr_SetString(ZstdError, "could not create compression dictionary");
       
    24 		return 1;
       
    25 	}
       
    26 
       
    27 	return 0;
       
    28 }
    12 
    29 
    13 /**
    30 /**
    14 * Initialize a zstd CStream from a ZstdCompressor instance.
    31 * Initialize a zstd CStream from a ZstdCompressor instance.
    15 *
    32 *
    16 * Returns a ZSTD_CStream on success or NULL on failure. If NULL, a Python
    33 * Returns a ZSTD_CStream on success or NULL on failure. If NULL, a Python
    54 		return NULL;
    71 		return NULL;
    55 	}
    72 	}
    56 
    73 
    57 	return cstream;
    74 	return cstream;
    58 }
    75 }
    59 
       
    60 
    76 
    61 PyDoc_STRVAR(ZstdCompressor__doc__,
    77 PyDoc_STRVAR(ZstdCompressor__doc__,
    62 "ZstdCompressor(level=None, dict_data=None, compression_params=None)\n"
    78 "ZstdCompressor(level=None, dict_data=None, compression_params=None)\n"
    63 "\n"
    79 "\n"
    64 "Create an object used to perform Zstandard compression.\n"
    80 "Create an object used to perform Zstandard compression.\n"
   105 	CompressionParametersObject* params = NULL;
   121 	CompressionParametersObject* params = NULL;
   106 	PyObject* writeChecksum = NULL;
   122 	PyObject* writeChecksum = NULL;
   107 	PyObject* writeContentSize = NULL;
   123 	PyObject* writeContentSize = NULL;
   108 	PyObject* writeDictID = NULL;
   124 	PyObject* writeDictID = NULL;
   109 
   125 
       
   126 	self->cctx = NULL;
   110 	self->dict = NULL;
   127 	self->dict = NULL;
   111 	self->cparams = NULL;
   128 	self->cparams = NULL;
   112 	self->cdict = NULL;
   129 	self->cdict = NULL;
   113 
   130 
   114 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOO", kwlist,
   131 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOO", kwlist,
   127 		PyErr_Format(PyExc_ValueError, "level must be less than %d",
   144 		PyErr_Format(PyExc_ValueError, "level must be less than %d",
   128 			ZSTD_maxCLevel() + 1);
   145 			ZSTD_maxCLevel() + 1);
   129 		return -1;
   146 		return -1;
   130 	}
   147 	}
   131 
   148 
       
   149 	/* We create a ZSTD_CCtx for reuse among multiple operations to reduce the
       
   150 	   overhead of each compression operation. */
       
   151 	self->cctx = ZSTD_createCCtx();
       
   152 	if (!self->cctx) {
       
   153 		PyErr_NoMemory();
       
   154 		return -1;
       
   155 	}
       
   156 
   132 	self->compressionLevel = level;
   157 	self->compressionLevel = level;
   133 
   158 
   134 	if (dict) {
   159 	if (dict) {
   135 		self->dict = dict;
   160 		self->dict = dict;
   136 		Py_INCREF(dict);
   161 		Py_INCREF(dict);
   161 	Py_XDECREF(self->dict);
   186 	Py_XDECREF(self->dict);
   162 
   187 
   163 	if (self->cdict) {
   188 	if (self->cdict) {
   164 		ZSTD_freeCDict(self->cdict);
   189 		ZSTD_freeCDict(self->cdict);
   165 		self->cdict = NULL;
   190 		self->cdict = NULL;
       
   191 	}
       
   192 
       
   193 	if (self->cctx) {
       
   194 		ZSTD_freeCCtx(self->cctx);
       
   195 		self->cctx = NULL;
   166 	}
   196 	}
   167 
   197 
   168 	PyObject_Del(self);
   198 	PyObject_Del(self);
   169 }
   199 }
   170 
   200 
   337 
   367 
   338 	return res;
   368 	return res;
   339 }
   369 }
   340 
   370 
   341 PyDoc_STRVAR(ZstdCompressor_compress__doc__,
   371 PyDoc_STRVAR(ZstdCompressor_compress__doc__,
   342 "compress(data)\n"
   372 "compress(data, allow_empty=False)\n"
   343 "\n"
   373 "\n"
   344 "Compress data in a single operation.\n"
   374 "Compress data in a single operation.\n"
   345 "\n"
   375 "\n"
   346 "This is the simplest mechanism to perform compression: simply pass in a\n"
   376 "This is the simplest mechanism to perform compression: simply pass in a\n"
   347 "value and get a compressed value back. It is almost the most prone to abuse.\n"
   377 "value and get a compressed value back. It is almost the most prone to abuse.\n"
   348 "The input and output values must fit in memory, so passing in very large\n"
   378 "The input and output values must fit in memory, so passing in very large\n"
   349 "values can result in excessive memory usage. For this reason, one of the\n"
   379 "values can result in excessive memory usage. For this reason, one of the\n"
   350 "streaming based APIs is preferred for larger values.\n"
   380 "streaming based APIs is preferred for larger values.\n"
   351 );
   381 );
   352 
   382 
   353 static PyObject* ZstdCompressor_compress(ZstdCompressor* self, PyObject* args) {
   383 static PyObject* ZstdCompressor_compress(ZstdCompressor* self, PyObject* args, PyObject* kwargs) {
       
   384 	static char* kwlist[] = {
       
   385 		"data",
       
   386 		"allow_empty",
       
   387 		NULL
       
   388 	};
       
   389 
   354 	const char* source;
   390 	const char* source;
   355 	Py_ssize_t sourceSize;
   391 	Py_ssize_t sourceSize;
       
   392 	PyObject* allowEmpty = NULL;
   356 	size_t destSize;
   393 	size_t destSize;
   357 	ZSTD_CCtx* cctx;
       
   358 	PyObject* output;
   394 	PyObject* output;
   359 	char* dest;
   395 	char* dest;
   360 	void* dictData = NULL;
   396 	void* dictData = NULL;
   361 	size_t dictSize = 0;
   397 	size_t dictSize = 0;
   362 	size_t zresult;
   398 	size_t zresult;
   363 	ZSTD_parameters zparams;
   399 	ZSTD_parameters zparams;
   364 	ZSTD_customMem zmem;
       
   365 
   400 
   366 #if PY_MAJOR_VERSION >= 3
   401 #if PY_MAJOR_VERSION >= 3
   367 	if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) {
   402 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|O",
   368 #else
   403 #else
   369 	if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) {
   404 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|O",
   370 #endif
   405 #endif
       
   406 		kwlist, &source, &sourceSize, &allowEmpty)) {
       
   407 		return NULL;
       
   408 	}
       
   409 
       
   410 	/* Limitation in zstd C API doesn't let decompression side distinguish
       
   411 	   between content size of 0 and unknown content size. This can make round
       
   412 	   tripping via Python difficult. Until this is fixed, require a flag
       
   413 	   to fire the footgun.
       
   414 	   https://github.com/indygreg/python-zstandard/issues/11 */
       
   415 	if (0 == sourceSize && self->fparams.contentSizeFlag
       
   416 		&& (!allowEmpty || PyObject_Not(allowEmpty))) {
       
   417 		PyErr_SetString(PyExc_ValueError, "cannot write empty inputs when writing content sizes");
   371 		return NULL;
   418 		return NULL;
   372 	}
   419 	}
   373 
   420 
   374 	destSize = ZSTD_compressBound(sourceSize);
   421 	destSize = ZSTD_compressBound(sourceSize);
   375 	output = PyBytes_FromStringAndSize(NULL, destSize);
   422 	output = PyBytes_FromStringAndSize(NULL, destSize);
   376 	if (!output) {
   423 	if (!output) {
   377 		return NULL;
   424 		return NULL;
   378 	}
   425 	}
   379 
   426 
   380 	dest = PyBytes_AsString(output);
   427 	dest = PyBytes_AsString(output);
   381 
       
   382 	cctx = ZSTD_createCCtx();
       
   383 	if (!cctx) {
       
   384 		Py_DECREF(output);
       
   385 		PyErr_SetString(ZstdError, "could not create CCtx");
       
   386 		return NULL;
       
   387 	}
       
   388 
   428 
   389 	if (self->dict) {
   429 	if (self->dict) {
   390 		dictData = self->dict->dictData;
   430 		dictData = self->dict->dictData;
   391 		dictSize = self->dict->dictSize;
   431 		dictSize = self->dict->dictSize;
   392 	}
   432 	}
   404 	zparams.fParams = self->fparams;
   444 	zparams.fParams = self->fparams;
   405 
   445 
   406 	/* The raw dict data has to be processed before it can be used. Since this
   446 	/* The raw dict data has to be processed before it can be used. Since this
   407 	adds overhead - especially if multiple dictionary compression operations
   447 	adds overhead - especially if multiple dictionary compression operations
   408 	are performed on the same ZstdCompressor instance - we create a
   448 	are performed on the same ZstdCompressor instance - we create a
   409 	ZSTD_CDict once and reuse it for all operations. */
   449 	ZSTD_CDict once and reuse it for all operations.
   410 
   450 
   411 	/* TODO the zparams (which can be derived from the source data size) used
   451 	Note: the compression parameters used for the first invocation (possibly
   412 	on first invocation are effectively reused for subsequent operations. This
   452 	derived from the source size) will be reused on all subsequent invocations.
   413 	may not be appropriate if input sizes vary significantly and could affect
   453 	https://github.com/facebook/zstd/issues/358 contains more info. We could
   414 	chosen compression parameters.
   454 	potentially add an argument somewhere to control this behavior.
   415 	https://github.com/facebook/zstd/issues/358 tracks this issue. */
   455 	*/
   416 	if (dictData && !self->cdict) {
   456 	if (dictData && !self->cdict) {
   417 		Py_BEGIN_ALLOW_THREADS
   457 		if (populate_cdict(self, dictData, dictSize, &zparams)) {
   418 		memset(&zmem, 0, sizeof(zmem));
       
   419 		self->cdict = ZSTD_createCDict_advanced(dictData, dictSize, zparams, zmem);
       
   420 		Py_END_ALLOW_THREADS
       
   421 
       
   422 		if (!self->cdict) {
       
   423 			Py_DECREF(output);
   458 			Py_DECREF(output);
   424 			ZSTD_freeCCtx(cctx);
       
   425 			PyErr_SetString(ZstdError, "could not create compression dictionary");
       
   426 			return NULL;
   459 			return NULL;
   427 		}
   460 		}
   428 	}
   461 	}
   429 
   462 
   430 	Py_BEGIN_ALLOW_THREADS
   463 	Py_BEGIN_ALLOW_THREADS
   431 	/* By avoiding ZSTD_compress(), we don't necessarily write out content
   464 	/* By avoiding ZSTD_compress(), we don't necessarily write out content
   432 	   size. This means the argument to ZstdCompressor to control frame
   465 	   size. This means the argument to ZstdCompressor to control frame
   433 	   parameters is honored. */
   466 	   parameters is honored. */
   434 	if (self->cdict) {
   467 	if (self->cdict) {
   435 		zresult = ZSTD_compress_usingCDict(cctx, dest, destSize,
   468 		zresult = ZSTD_compress_usingCDict(self->cctx, dest, destSize,
   436 			source, sourceSize, self->cdict);
   469 			source, sourceSize, self->cdict);
   437 	}
   470 	}
   438 	else {
   471 	else {
   439 		zresult = ZSTD_compress_advanced(cctx, dest, destSize,
   472 		zresult = ZSTD_compress_advanced(self->cctx, dest, destSize,
   440 			source, sourceSize, dictData, dictSize, zparams);
   473 			source, sourceSize, dictData, dictSize, zparams);
   441 	}
   474 	}
   442 	Py_END_ALLOW_THREADS
   475 	Py_END_ALLOW_THREADS
   443 
       
   444 	ZSTD_freeCCtx(cctx);
       
   445 
   476 
   446 	if (ZSTD_isError(zresult)) {
   477 	if (ZSTD_isError(zresult)) {
   447 		PyErr_Format(ZstdError, "cannot compress: %s", ZSTD_getErrorName(zresult));
   478 		PyErr_Format(ZstdError, "cannot compress: %s", ZSTD_getErrorName(zresult));
   448 		Py_CLEAR(output);
   479 		Py_CLEAR(output);
   449 		return NULL;
   480 		return NULL;
   498 	result->output.pos = 0;
   529 	result->output.pos = 0;
   499 
   530 
   500 	result->compressor = self;
   531 	result->compressor = self;
   501 	Py_INCREF(result->compressor);
   532 	Py_INCREF(result->compressor);
   502 
   533 
   503 	result->flushed = 0;
   534 	result->finished = 0;
   504 
   535 
   505 	return result;
   536 	return result;
   506 }
   537 }
   507 
   538 
   508 PyDoc_STRVAR(ZstdCompressor_read_from__doc__,
   539 PyDoc_STRVAR(ZstdCompressor_read_from__doc__,
   689 
   720 
   690 	return result;
   721 	return result;
   691 }
   722 }
   692 
   723 
   693 static PyMethodDef ZstdCompressor_methods[] = {
   724 static PyMethodDef ZstdCompressor_methods[] = {
   694 	{ "compress", (PyCFunction)ZstdCompressor_compress, METH_VARARGS,
   725 	{ "compress", (PyCFunction)ZstdCompressor_compress,
   695 	ZstdCompressor_compress__doc__ },
   726 	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_compress__doc__ },
   696 	{ "compressobj", (PyCFunction)ZstdCompressor_compressobj,
   727 	{ "compressobj", (PyCFunction)ZstdCompressor_compressobj,
   697 	METH_VARARGS | METH_KEYWORDS, ZstdCompressionObj__doc__ },
   728 	METH_VARARGS | METH_KEYWORDS, ZstdCompressionObj__doc__ },
   698 	{ "copy_stream", (PyCFunction)ZstdCompressor_copy_stream,
   729 	{ "copy_stream", (PyCFunction)ZstdCompressor_copy_stream,
   699 	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_copy_stream__doc__ },
   730 	METH_VARARGS | METH_KEYWORDS, ZstdCompressor_copy_stream__doc__ },
   700 	{ "read_from", (PyCFunction)ZstdCompressor_read_from,
   731 	{ "read_from", (PyCFunction)ZstdCompressor_read_from,