9 #include "python-zstandard.h" |
9 #include "python-zstandard.h" |
10 |
10 |
11 extern PyObject* ZstdError; |
11 extern PyObject* ZstdError; |
12 |
12 |
13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { |
13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { |
14 static char* kwlist[] = { |
|
15 "dict_size", |
|
16 "samples", |
|
17 "selectivity", |
|
18 "level", |
|
19 "notifications", |
|
20 "dict_id", |
|
21 NULL |
|
22 }; |
|
23 size_t capacity; |
|
24 PyObject* samples; |
|
25 Py_ssize_t samplesLen; |
|
26 unsigned selectivity = 0; |
|
27 int level = 0; |
|
28 unsigned notifications = 0; |
|
29 unsigned dictID = 0; |
|
30 ZDICT_params_t zparams; |
|
31 Py_ssize_t sampleIndex; |
|
32 Py_ssize_t sampleSize; |
|
33 PyObject* sampleItem; |
|
34 size_t zresult; |
|
35 void* sampleBuffer = NULL; |
|
36 void* sampleOffset; |
|
37 size_t samplesSize = 0; |
|
38 size_t* sampleSizes = NULL; |
|
39 void* dict = NULL; |
|
40 ZstdCompressionDict* result = NULL; |
|
41 |
|
42 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary", |
|
43 kwlist, |
|
44 &capacity, |
|
45 &PyList_Type, &samples, |
|
46 &selectivity, &level, ¬ifications, &dictID)) { |
|
47 return NULL; |
|
48 } |
|
49 |
|
50 memset(&zparams, 0, sizeof(zparams)); |
|
51 |
|
52 zparams.selectivityLevel = selectivity; |
|
53 zparams.compressionLevel = level; |
|
54 zparams.notificationLevel = notifications; |
|
55 zparams.dictID = dictID; |
|
56 |
|
57 /* Figure out the size of the raw samples */ |
|
58 samplesLen = PyList_Size(samples); |
|
59 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { |
|
60 sampleItem = PyList_GetItem(samples, sampleIndex); |
|
61 if (!PyBytes_Check(sampleItem)) { |
|
62 PyErr_SetString(PyExc_ValueError, "samples must be bytes"); |
|
63 return NULL; |
|
64 } |
|
65 samplesSize += PyBytes_GET_SIZE(sampleItem); |
|
66 } |
|
67 |
|
68 /* Now that we know the total size of the raw simples, we can allocate |
|
69 a buffer for the raw data */ |
|
70 sampleBuffer = PyMem_Malloc(samplesSize); |
|
71 if (!sampleBuffer) { |
|
72 PyErr_NoMemory(); |
|
73 goto finally; |
|
74 } |
|
75 sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t)); |
|
76 if (!sampleSizes) { |
|
77 PyErr_NoMemory(); |
|
78 goto finally; |
|
79 } |
|
80 |
|
81 sampleOffset = sampleBuffer; |
|
82 /* Now iterate again and assemble the samples in the buffer */ |
|
83 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { |
|
84 sampleItem = PyList_GetItem(samples, sampleIndex); |
|
85 sampleSize = PyBytes_GET_SIZE(sampleItem); |
|
86 sampleSizes[sampleIndex] = sampleSize; |
|
87 memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize); |
|
88 sampleOffset = (char*)sampleOffset + sampleSize; |
|
89 } |
|
90 |
|
91 dict = PyMem_Malloc(capacity); |
|
92 if (!dict) { |
|
93 PyErr_NoMemory(); |
|
94 goto finally; |
|
95 } |
|
96 |
|
97 /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */ |
|
98 Py_BEGIN_ALLOW_THREADS |
|
99 zresult = ZDICT_trainFromBuffer_advanced(dict, capacity, |
|
100 sampleBuffer, sampleSizes, (unsigned int)samplesLen, |
|
101 zparams); |
|
102 Py_END_ALLOW_THREADS |
|
103 if (ZDICT_isError(zresult)) { |
|
104 PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult)); |
|
105 PyMem_Free(dict); |
|
106 goto finally; |
|
107 } |
|
108 |
|
109 result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType); |
|
110 if (!result) { |
|
111 goto finally; |
|
112 } |
|
113 |
|
114 result->dictData = dict; |
|
115 result->dictSize = zresult; |
|
116 result->d = 0; |
|
117 result->k = 0; |
|
118 |
|
119 finally: |
|
120 PyMem_Free(sampleBuffer); |
|
121 PyMem_Free(sampleSizes); |
|
122 |
|
123 return result; |
|
124 } |
|
125 |
|
126 ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { |
|
127 static char* kwlist[] = { |
14 static char* kwlist[] = { |
128 "dict_size", |
15 "dict_size", |
129 "samples", |
16 "samples", |
130 "k", |
17 "k", |
131 "d", |
18 "d", |
132 "notifications", |
19 "notifications", |
133 "dict_id", |
20 "dict_id", |
134 "level", |
21 "level", |
135 "optimize", |
|
136 "steps", |
22 "steps", |
137 "threads", |
23 "threads", |
138 NULL |
24 NULL |
139 }; |
25 }; |
140 |
26 |
217 PyErr_NoMemory(); |
102 PyErr_NoMemory(); |
218 goto finally; |
103 goto finally; |
219 } |
104 } |
220 |
105 |
221 Py_BEGIN_ALLOW_THREADS |
106 Py_BEGIN_ALLOW_THREADS |
222 if (optimize && PyObject_IsTrue(optimize)) { |
107 /* No parameters uses the default function, which will use default params |
223 zresult = COVER_optimizeTrainFromBuffer(dict, capacity, |
108 and call ZDICT_optimizeTrainFromBuffer_cover under the hood. */ |
|
109 if (!params.k && !params.d && !params.zParams.compressionLevel |
|
110 && !params.zParams.notificationLevel && !params.zParams.dictID) { |
|
111 zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer, |
|
112 sampleSizes, (unsigned)samplesLen); |
|
113 } |
|
114 /* Use optimize mode if user controlled steps or threads explicitly. */ |
|
115 else if (params.steps || params.nbThreads) { |
|
116 zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity, |
224 sampleBuffer, sampleSizes, (unsigned)samplesLen, ¶ms); |
117 sampleBuffer, sampleSizes, (unsigned)samplesLen, ¶ms); |
225 } |
118 } |
|
119 /* Non-optimize mode with explicit control. */ |
226 else { |
120 else { |
227 zresult = COVER_trainFromBuffer(dict, capacity, |
121 zresult = ZDICT_trainFromBuffer_cover(dict, capacity, |
228 sampleBuffer, sampleSizes, (unsigned)samplesLen, params); |
122 sampleBuffer, sampleSizes, (unsigned)samplesLen, params); |
229 } |
123 } |
230 Py_END_ALLOW_THREADS |
124 Py_END_ALLOW_THREADS |
231 |
125 |
232 if (ZDICT_isError(zresult)) { |
126 if (ZDICT_isError(zresult)) { |
241 goto finally; |
135 goto finally; |
242 } |
136 } |
243 |
137 |
244 result->dictData = dict; |
138 result->dictData = dict; |
245 result->dictSize = zresult; |
139 result->dictSize = zresult; |
|
140 result->dictType = ZSTD_dct_fullDict; |
246 result->d = params.d; |
141 result->d = params.d; |
247 result->k = params.k; |
142 result->k = params.k; |
|
143 result->cdict = NULL; |
|
144 result->ddict = NULL; |
248 |
145 |
249 finally: |
146 finally: |
250 PyMem_Free(sampleBuffer); |
147 PyMem_Free(sampleBuffer); |
251 PyMem_Free(sampleSizes); |
148 PyMem_Free(sampleSizes); |
252 |
149 |
253 return result; |
150 return result; |
|
151 } |
|
152 |
|
153 int ensure_ddict(ZstdCompressionDict* dict) { |
|
154 if (dict->ddict) { |
|
155 return 0; |
|
156 } |
|
157 |
|
158 Py_BEGIN_ALLOW_THREADS |
|
159 dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize, |
|
160 ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem); |
|
161 Py_END_ALLOW_THREADS |
|
162 if (!dict->ddict) { |
|
163 PyErr_SetString(ZstdError, "could not create decompression dict"); |
|
164 return 1; |
|
165 } |
|
166 |
|
167 return 0; |
254 } |
168 } |
255 |
169 |
256 PyDoc_STRVAR(ZstdCompressionDict__doc__, |
170 PyDoc_STRVAR(ZstdCompressionDict__doc__, |
257 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n" |
171 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n" |
258 "\n" |
172 "\n" |
259 "This type holds the results of a computed Zstandard compression dictionary.\n" |
173 "This type holds the results of a computed Zstandard compression dictionary.\n" |
260 "Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n" |
174 "Instances are obtained by calling ``train_dictionary()`` or by passing\n" |
261 "obtained from another source into the constructor.\n" |
175 "bytes obtained from another source into the constructor.\n" |
262 ); |
176 ); |
263 |
177 |
264 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) { |
178 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { |
265 const char* source; |
179 static char* kwlist[] = { |
266 Py_ssize_t sourceSize; |
180 "data", |
|
181 "dict_type", |
|
182 NULL |
|
183 }; |
|
184 |
|
185 int result = -1; |
|
186 Py_buffer source; |
|
187 unsigned dictType = ZSTD_dct_auto; |
267 |
188 |
268 self->dictData = NULL; |
189 self->dictData = NULL; |
269 self->dictSize = 0; |
190 self->dictSize = 0; |
|
191 self->cdict = NULL; |
|
192 self->ddict = NULL; |
270 |
193 |
271 #if PY_MAJOR_VERSION >= 3 |
194 #if PY_MAJOR_VERSION >= 3 |
272 if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict", |
195 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict", |
273 #else |
196 #else |
274 if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict", |
197 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict", |
275 #endif |
198 #endif |
276 &source, &sourceSize)) { |
199 kwlist, &source, &dictType)) { |
277 return -1; |
200 return -1; |
278 } |
201 } |
279 |
202 |
280 self->dictData = PyMem_Malloc(sourceSize); |
203 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { |
|
204 PyErr_SetString(PyExc_ValueError, |
|
205 "data buffer should be contiguous and have at most one dimension"); |
|
206 goto finally; |
|
207 } |
|
208 |
|
209 if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent |
|
210 && dictType != ZSTD_dct_fullDict) { |
|
211 PyErr_Format(PyExc_ValueError, |
|
212 "invalid dictionary load mode: %d; must use DICT_TYPE_* constants", |
|
213 dictType); |
|
214 goto finally; |
|
215 } |
|
216 |
|
217 self->dictType = dictType; |
|
218 |
|
219 self->dictData = PyMem_Malloc(source.len); |
281 if (!self->dictData) { |
220 if (!self->dictData) { |
282 PyErr_NoMemory(); |
221 PyErr_NoMemory(); |
283 return -1; |
222 goto finally; |
284 } |
223 } |
285 |
224 |
286 memcpy(self->dictData, source, sourceSize); |
225 memcpy(self->dictData, source.buf, source.len); |
287 self->dictSize = sourceSize; |
226 self->dictSize = source.len; |
288 |
227 |
289 return 0; |
228 result = 0; |
290 } |
229 |
|
230 finally: |
|
231 PyBuffer_Release(&source); |
|
232 return result; |
|
233 } |
291 |
234 |
292 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { |
235 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { |
|
236 if (self->cdict) { |
|
237 ZSTD_freeCDict(self->cdict); |
|
238 self->cdict = NULL; |
|
239 } |
|
240 |
|
241 if (self->ddict) { |
|
242 ZSTD_freeDDict(self->ddict); |
|
243 self->ddict = NULL; |
|
244 } |
|
245 |
293 if (self->dictData) { |
246 if (self->dictData) { |
294 PyMem_Free(self->dictData); |
247 PyMem_Free(self->dictData); |
295 self->dictData = NULL; |
248 self->dictData = NULL; |
296 } |
249 } |
297 |
250 |
298 PyObject_Del(self); |
251 PyObject_Del(self); |
|
252 } |
|
253 |
|
254 PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__, |
|
255 "Precompute a dictionary so it can be used by multiple compressors.\n" |
|
256 ); |
|
257 |
|
258 static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { |
|
259 static char* kwlist[] = { |
|
260 "level", |
|
261 "compression_params", |
|
262 NULL |
|
263 }; |
|
264 |
|
265 int level = 0; |
|
266 ZstdCompressionParametersObject* compressionParams = NULL; |
|
267 ZSTD_compressionParameters cParams; |
|
268 size_t zresult; |
|
269 |
|
270 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist, |
|
271 &level, &ZstdCompressionParametersType, &compressionParams)) { |
|
272 return NULL; |
|
273 } |
|
274 |
|
275 if (level && compressionParams) { |
|
276 PyErr_SetString(PyExc_ValueError, |
|
277 "must only specify one of level or compression_params"); |
|
278 return NULL; |
|
279 } |
|
280 |
|
281 if (!level && !compressionParams) { |
|
282 PyErr_SetString(PyExc_ValueError, |
|
283 "must specify one of level or compression_params"); |
|
284 return NULL; |
|
285 } |
|
286 |
|
287 if (self->cdict) { |
|
288 zresult = ZSTD_freeCDict(self->cdict); |
|
289 self->cdict = NULL; |
|
290 if (ZSTD_isError(zresult)) { |
|
291 PyErr_Format(ZstdError, "unable to free CDict: %s", |
|
292 ZSTD_getErrorName(zresult)); |
|
293 return NULL; |
|
294 } |
|
295 } |
|
296 |
|
297 if (level) { |
|
298 cParams = ZSTD_getCParams(level, 0, self->dictSize); |
|
299 } |
|
300 else { |
|
301 cParams.chainLog = compressionParams->chainLog; |
|
302 cParams.hashLog = compressionParams->hashLog; |
|
303 cParams.searchLength = compressionParams->minMatch; |
|
304 cParams.searchLog = compressionParams->searchLog; |
|
305 cParams.strategy = compressionParams->compressionStrategy; |
|
306 cParams.targetLength = compressionParams->targetLength; |
|
307 cParams.windowLog = compressionParams->windowLog; |
|
308 } |
|
309 |
|
310 assert(!self->cdict); |
|
311 self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize, |
|
312 ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem); |
|
313 |
|
314 if (!self->cdict) { |
|
315 PyErr_SetString(ZstdError, "unable to precompute dictionary"); |
|
316 return NULL; |
|
317 } |
|
318 |
|
319 Py_RETURN_NONE; |
299 } |
320 } |
300 |
321 |
301 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { |
322 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { |
302 unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); |
323 unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); |
303 |
324 |
311 static PyMethodDef ZstdCompressionDict_methods[] = { |
332 static PyMethodDef ZstdCompressionDict_methods[] = { |
312 { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, |
333 { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, |
313 PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, |
334 PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, |
314 { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, |
335 { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, |
315 PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, |
336 PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, |
|
337 { "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress, |
|
338 METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ }, |
316 { NULL, NULL } |
339 { NULL, NULL } |
317 }; |
340 }; |
318 |
341 |
319 static PyMemberDef ZstdCompressionDict_members[] = { |
342 static PyMemberDef ZstdCompressionDict_members[] = { |
320 { "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY, |
343 { "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY, |