--- a/contrib/python-zstandard/zstd/dictBuilder/zdict.c Sun Apr 08 01:08:43 2018 +0200
+++ b/contrib/python-zstandard/zstd/dictBuilder/zdict.c Mon Apr 09 10:13:29 2018 -0700
@@ -1,18 +1,20 @@
-/**
+/*
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
* All rights reserved.
*
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree. An additional grant
- * of patent rights can be found in the PATENTS file in the same directory.
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
*/
/*-**************************************
* Tuning parameters
****************************************/
+#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
-#define ZDICT_MIN_SAMPLES_SIZE 512
+#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
/*-**************************************
@@ -59,11 +61,8 @@
#define NOISELENGTH 32
-#define MINRATIO 4
-static const int g_compressionLevel_default = 6;
+static const int g_compressionLevel_default = 3;
static const U32 g_selectivity_default = 9;
-static const size_t g_provision_entropySize = 200;
-static const size_t g_min_fast_dictContent = 192;
/*-*************************************
@@ -96,7 +95,7 @@
unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
{
if (dictSize < 8) return 0;
- if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
+ if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
return MEM_readLE32((const char*)dictBuffer + 4);
}
@@ -104,7 +103,7 @@
/*-********************************************************
* Dictionary training functions
**********************************************************/
-static unsigned ZDICT_NbCommonBytes (register size_t val)
+static unsigned ZDICT_NbCommonBytes (size_t val)
{
if (MEM_isLittleEndian()) {
if (MEM_64bits()) {
@@ -208,7 +207,6 @@
U32 cumulLength[LLIMIT] = {0};
U32 savings[LLIMIT] = {0};
const BYTE* b = (const BYTE*)buffer;
- size_t length;
size_t maxLength = LLIMIT;
size_t pos = suffix[start];
U32 end = start;
@@ -223,26 +221,30 @@
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
/* skip and mark segment */
- U16 u16 = MEM_read16(b+pos+4);
- U32 u, e = 6;
- while (MEM_read16(b+pos+e) == u16) e+=2 ;
- if (b[pos+e] == b[pos+e-1]) e++;
- for (u=1; u<e; u++)
+ U16 const pattern16 = MEM_read16(b+pos+4);
+ U32 u, patternEnd = 6;
+ while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
+ if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
+ for (u=1; u<patternEnd; u++)
doneMarks[pos+u] = 1;
return solution;
}
/* look forward */
- do {
- end++;
- length = ZDICT_count(b + pos, b + suffix[end]);
- } while (length >=MINMATCHLENGTH);
+ { size_t length;
+ do {
+ end++;
+ length = ZDICT_count(b + pos, b + suffix[end]);
+ } while (length >= MINMATCHLENGTH);
+ }
/* look backward */
- do {
- length = ZDICT_count(b + pos, b + *(suffix+start-1));
- if (length >=MINMATCHLENGTH) start--;
- } while(length >= MINMATCHLENGTH);
+ { size_t length;
+ do {
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
+ if (length >=MINMATCHLENGTH) start--;
+ } while(length >= MINMATCHLENGTH);
+ }
/* exit if not found a minimum nb of repetitions */
if (end-start < minRatio) {
@@ -269,7 +271,7 @@
U32 selectedCount = 0;
U32 selectedID = currentID;
for (id =refinedStart; id < refinedEnd; id++) {
- if (b[ suffix[id] + searchLength] != currentChar) {
+ if (b[suffix[id] + searchLength] != currentChar) {
if (currentCount > selectedCount) {
selectedCount = currentCount;
selectedID = currentID;
@@ -298,20 +300,23 @@
memset(lengthList, 0, sizeof(lengthList));
/* look forward */
- do {
- end++;
- length = ZDICT_count(b + pos, b + suffix[end]);
- if (length >= LLIMIT) length = LLIMIT-1;
- lengthList[length]++;
- } while (length >=MINMATCHLENGTH);
+ { size_t length;
+ do {
+ end++;
+ length = ZDICT_count(b + pos, b + suffix[end]);
+ if (length >= LLIMIT) length = LLIMIT-1;
+ lengthList[length]++;
+ } while (length >=MINMATCHLENGTH);
+ }
/* look backward */
- length = MINMATCHLENGTH;
- while ((length >= MINMATCHLENGTH) & (start > 0)) {
- length = ZDICT_count(b + pos, b + suffix[start - 1]);
- if (length >= LLIMIT) length = LLIMIT - 1;
- lengthList[length]++;
- if (length >= MINMATCHLENGTH) start--;
+ { size_t length = MINMATCHLENGTH;
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
+ if (length >= LLIMIT) length = LLIMIT - 1;
+ lengthList[length]++;
+ if (length >= MINMATCHLENGTH) start--;
+ }
}
/* largest useful length */
@@ -346,12 +351,12 @@
/* mark positions done */
{ U32 id;
for (id=start; id<end; id++) {
- U32 p, pEnd;
+ U32 p, pEnd, length;
U32 const testedPos = suffix[id];
if (testedPos == pos)
length = solution.length;
else {
- length = ZDICT_count(b+pos, b+testedPos);
+ length = (U32)ZDICT_count(b+pos, b+testedPos);
if (length > solution.length) length = solution.length;
}
pEnd = (U32)(testedPos + length);
@@ -363,21 +368,35 @@
}
-/*! ZDICT_checkMerge
+static int isIncluded(const void* in, const void* container, size_t length)
+{
+ const char* const ip = (const char*) in;
+ const char* const into = (const char*) container;
+ size_t u;
+
+ for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
+ if (ip[u] != into[u]) break;
+ }
+
+ return u==length;
+}
+
+/*! ZDICT_tryMerge() :
check if dictItem can be merged, do it if possible
@return : id of destination elt, 0 if not merged
*/
-static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
+static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
{
const U32 tableSize = table->pos;
const U32 eltEnd = elt.pos + elt.length;
+ const char* const buf = (const char*) buffer;
/* tail overlap */
U32 u; for (u=1; u<tableSize; u++) {
if (u==eltNbToSkip) continue;
if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
/* append */
- U32 addedLength = table[u].pos - elt.pos;
+ U32 const addedLength = table[u].pos - elt.pos;
table[u].length += addedLength;
table[u].pos = elt.pos;
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
@@ -393,9 +412,10 @@
/* front overlap */
for (u=1; u<tableSize; u++) {
if (u==eltNbToSkip) continue;
+
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
/* append */
- int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
+ int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
table[u].savings += elt.length / 8; /* rough approx bonus */
if (addedLength > 0) { /* otherwise, elt fully included into existing */
table[u].length += addedLength;
@@ -407,7 +427,18 @@
table[u] = table[u-1], u--;
table[u] = elt;
return u;
- } }
+ }
+
+ if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
+ if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
+ size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
+ table[u].pos = elt.pos;
+ table[u].savings += (U32)(elt.savings * addedLength / elt.length);
+ table[u].length = MIN(elt.length, table[u].length + 1);
+ return u;
+ }
+ }
+ }
return 0;
}
@@ -415,8 +446,8 @@
static void ZDICT_removeDictItem(dictItem* table, U32 id)
{
- /* convention : first element is nb of elts */
- U32 const max = table->pos;
+ /* convention : table[0].pos stores nb of elts */
+ U32 const max = table[0].pos;
U32 u;
if (!id) return; /* protection, should never happen */
for (u=id; u<max-1; u++)
@@ -425,14 +456,14 @@
}
-static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
+static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
{
/* merge if possible */
- U32 mergeId = ZDICT_checkMerge(table, elt, 0);
+ U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
if (mergeId) {
U32 newMerge = 1;
while (newMerge) {
- newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
+ newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
if (newMerge) ZDICT_removeDictItem(table, mergeId);
mergeId = newMerge;
}
@@ -463,7 +494,7 @@
}
-static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
+static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
const size_t* fileSizes, unsigned nbFiles,
U32 minRatio, U32 notificationLevel)
@@ -480,7 +511,7 @@
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
if (ZDICT_clockSpan(displayClock) > refreshRate) \
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
- if (notificationLevel>=4) fflush(stdout); } }
+ if (notificationLevel>=4) fflush(stderr); } }
/* init */
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
@@ -521,7 +552,7 @@
if (doneMarks[cursor]) { cursor++; continue; }
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
if (solution.length==0) { cursor++; continue; }
- ZDICT_insertDictItem(dictList, dictListSize, solution);
+ ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
cursor += solution.length;
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
} }
@@ -550,29 +581,30 @@
typedef struct
{
- ZSTD_CCtx* ref;
- ZSTD_CCtx* zc;
- void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
+ ZSTD_CCtx* ref; /* contains reference to dictionary */
+ ZSTD_CCtx* zc; /* working context */
+ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
} EStats_ress_t;
#define MAXREPOFFSET 1024
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
- U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
- const void* src, size_t srcSize, U32 notificationLevel)
+ U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
+ const void* src, size_t srcSize,
+ U32 notificationLevel)
{
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
size_t cSize;
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
- { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
- if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
+ { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
}
- cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
+ cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
if (cSize) { /* if == 0; block is not compressible */
- const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
+ const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
/* literals stats */
{ const BYTE* bytePtr;
@@ -610,17 +642,6 @@
} } }
}
-/*
-static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
-{
- unsigned u;
- size_t max=0;
- for (u=0; u<nbFiles; u++)
- if (max < fileSizes[u]) max = fileSizes[u];
- return max;
-}
-*/
-
static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
{
size_t total=0;
@@ -645,6 +666,18 @@
}
}
+/* ZDICT_flatLit() :
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(U32* countLit)
+{
+ int u;
+ for (u=1; u<256; u++) countLit[u] = 2;
+ countLit[0] = 4;
+ countLit[253] = 1;
+ countLit[254] = 1;
+}
#define OFFCODE_MAX 30 /* only applicable to first block */
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
@@ -674,32 +707,33 @@
BYTE* dstPtr = (BYTE*)dstBuffer;
/* init */
+ DEBUGLOG(4, "ZDICT_analyzeEntropy");
esr.ref = ZSTD_createCCtx();
esr.zc = ZSTD_createCCtx();
- esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
+ esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
if (!esr.ref || !esr.zc || !esr.workPlace) {
eSize = ERROR(memory_allocation);
DISPLAYLEVEL(1, "Not enough memory \n");
goto _cleanup;
}
- if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
- for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
- for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
- for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
- for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
+ if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
+ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
+ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
+ for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
+ for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
memset(repOffset, 0, sizeof(repOffset));
repOffset[1] = repOffset[4] = repOffset[8] = 1;
memset(bestRepOffset, 0, sizeof(bestRepOffset));
- if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
+ if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
- if (ZSTD_isError(beginResult)) {
+ if (ZSTD_isError(beginResult)) {
+ DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
eSize = ERROR(GENERIC);
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
goto _cleanup;
} }
- /* collect stats on all files */
+ /* collect stats on all samples */
for (u=0; u<nbFiles; u++) {
ZDICT_countEStats(esr, params,
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
@@ -708,14 +742,21 @@
pos += fileSizes[u];
}
- /* analyze */
- errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
- if (HUF_isError(errorCode)) {
- eSize = ERROR(GENERIC);
- DISPLAYLEVEL(1, "HUF_buildCTable error \n");
- goto _cleanup;
+ /* analyze, build stats, starting with literals */
+ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+ if (HUF_isError(maxNbBits)) {
+ eSize = ERROR(GENERIC);
+ DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+ goto _cleanup;
+ }
+ if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
+ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
+ ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+ maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+ assert(maxNbBits==9);
+ }
+ huffLog = (U32)maxNbBits;
}
- huffLog = (U32)errorCode;
/* looking for most common first offsets */
{ U32 offset;
@@ -812,7 +853,6 @@
MEM_writeLE32(dstPtr+4, repStartValue[1]);
MEM_writeLE32(dstPtr+8, repStartValue[2]);
#endif
- //dstPtr += 12;
eSize += 12;
_cleanup:
@@ -831,18 +871,19 @@
ZDICT_params_t params)
{
size_t hSize;
-#define HBUFFSIZE 256
+#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
BYTE header[HBUFFSIZE];
int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
U32 const notificationLevel = params.notificationLevel;
/* check conditions */
+ DEBUGLOG(4, "ZDICT_finalizeDictionary");
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
/* dictionary header */
- MEM_writeLE32(header, ZSTD_DICT_MAGIC);
+ MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
{ U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
U32 const dictID = params.dictID ? params.dictID : compliantID;
@@ -877,20 +918,11 @@
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t params)
{
- size_t hSize;
int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
U32 const notificationLevel = params.notificationLevel;
+ size_t hSize = 8;
- /* dictionary header */
- MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
- { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
- U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
- U32 const dictID = params.dictID ? params.dictID : compliantID;
- MEM_writeLE32((char*)dictBuffer+4, dictID);
- }
- hSize = 8;
-
- /* entropy tables */
+ /* calculate entropy tables */
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
DISPLAYLEVEL(2, "statistics ... \n");
{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
@@ -902,6 +934,13 @@
hSize += eSize;
}
+ /* add dictionary header (after entropy tables) */
+ MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
+ { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
+ U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
+ U32 const dictID = params.dictID ? params.dictID : compliantID;
+ MEM_writeLE32((char*)dictBuffer+4, dictID);
+ }
if (hSize + dictContentSize < dictBufferCapacity)
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
@@ -909,14 +948,14 @@
}
-/*! ZDICT_trainFromBuffer_unsafe() :
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
* Warning : `samplesBuffer` must be followed by noisy guard band.
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
*/
-size_t ZDICT_trainFromBuffer_unsafe(
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
void* dictBuffer, size_t maxDictSize,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
- ZDICT_params_t params)
+ ZDICT_legacy_params_t params)
{
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
@@ -925,24 +964,24 @@
size_t const targetDictSize = maxDictSize;
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
size_t dictSize = 0;
- U32 const notificationLevel = params.notificationLevel;
+ U32 const notificationLevel = params.zParams.notificationLevel;
/* checks */
if (!dictList) return ERROR(memory_allocation);
- if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
- if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
+ if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
+ if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
/* init */
ZDICT_initDictItem(dictList);
/* build dictionary */
- ZDICT_trainBuffer(dictList, dictListSize,
- samplesBuffer, samplesBuffSize,
- samplesSizes, nbSamples,
- minRep, notificationLevel);
+ ZDICT_trainBuffer_legacy(dictList, dictListSize,
+ samplesBuffer, samplesBuffSize,
+ samplesSizes, nbSamples,
+ minRep, notificationLevel);
/* display best matches */
- if (params.notificationLevel>= 3) {
+ if (params.zParams.notificationLevel>= 3) {
U32 const nb = MIN(25, dictList[0].pos);
U32 const dictContentSize = ZDICT_dictSize(dictList);
U32 u;
@@ -963,14 +1002,15 @@
/* create dictionary */
{ U32 dictContentSize = ZDICT_dictSize(dictList);
- if (dictContentSize < targetDictSize/3) {
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
+ if (dictContentSize < targetDictSize/4) {
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
+ if (samplesBuffSize < 10 * targetDictSize)
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
if (minRep > MINRATIO) {
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
}
- if (samplesBuffSize < 10 * targetDictSize)
- DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
}
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
@@ -978,7 +1018,7 @@
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
- DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
+ DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
}
/* limit dictionary size */
@@ -1004,7 +1044,7 @@
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
samplesBuffer, samplesSizes, nbSamples,
- params);
+ params.zParams);
}
/* clean up */
@@ -1013,11 +1053,12 @@
}
-/* issue : samplesBuffer need to be followed by a noisy guard band.
-* work around : duplicate the buffer, and add the noise */
-size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
- ZDICT_params_t params)
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer need to be followed by a noisy guard band.
+ * work around : duplicate the buffer, and add the noise */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+ ZDICT_legacy_params_t params)
{
size_t result;
void* newBuff;
@@ -1030,10 +1071,9 @@
memcpy(newBuff, samplesBuffer, sBuffSize);
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
- result = ZDICT_trainFromBuffer_unsafe(
- dictBuffer, dictBufferCapacity,
- newBuff, samplesSizes, nbSamples,
- params);
+ result =
+ ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
+ samplesSizes, nbSamples, params);
free(newBuff);
return result;
}
@@ -1042,15 +1082,23 @@
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
{
- ZDICT_params_t params;
+ ZDICT_cover_params_t params;
+ DEBUGLOG(3, "ZDICT_trainFromBuffer");
memset(&params, 0, sizeof(params));
- return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
- samplesBuffer, samplesSizes, nbSamples,
- params);
+ params.d = 8;
+ params.steps = 4;
+ /* Default to level 6 since no compression level information is available */
+ params.zParams.compressionLevel = 6;
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+ params.zParams.notificationLevel = ZSTD_DEBUG;
+#endif
+ return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+ samplesBuffer, samplesSizes, nbSamples,
+ &params);
}
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
{
ZDICT_params_t params;
memset(&params, 0, sizeof(params));