--- a/contrib/python-zstandard/zstd/dictBuilder/fastcover.c Sun Sep 15 00:07:30 2019 -0400
+++ b/contrib/python-zstandard/zstd/dictBuilder/fastcover.c Sun Sep 15 20:04:00 2019 -0700
@@ -132,7 +132,7 @@
*
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
*
- * Once the dmer with hash value d is in the dictionay we set F(d) = 0.
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
*/
static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
U32 *freqs, U32 begin, U32 end,
@@ -161,7 +161,7 @@
/* Get hash value of current dmer */
const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
- /* Add frequency of this index to score if this is the first occurence of index in active segment */
+ /* Add frequency of this index to score if this is the first occurrence of index in active segment */
if (segmentFreqs[idx] == 0) {
activeSegment.score += freqs[idx];
}
@@ -287,10 +287,10 @@
* Prepare a context for dictionary building.
* The context is only dependent on the parameter `d` and can used multiple
* times.
- * Returns 1 on success or zero on error.
+ * Returns 0 on success or error code on error.
* The context must be destroyed with `FASTCOVER_ctx_destroy()`.
*/
-static int
+static size_t
FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples,
@@ -310,19 +310,19 @@
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
(unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
- return 0;
+ return ERROR(srcSize_wrong);
}
/* Check if there are at least 5 training samples */
if (nbTrainSamples < 5) {
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
- return 0;
+ return ERROR(srcSize_wrong);
}
/* Check if there's testing sample */
if (nbTestSamples < 1) {
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
- return 0;
+ return ERROR(srcSize_wrong);
}
/* Zero the context */
@@ -347,7 +347,7 @@
if (ctx->offsets == NULL) {
DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
FASTCOVER_ctx_destroy(ctx);
- return 0;
+ return ERROR(memory_allocation);
}
/* Fill offsets from the samplesSizes */
@@ -364,13 +364,13 @@
if (ctx->freqs == NULL) {
DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
FASTCOVER_ctx_destroy(ctx);
- return 0;
+ return ERROR(memory_allocation);
}
DISPLAYLEVEL(2, "Computing frequencies\n");
FASTCOVER_computeFrequency(ctx->freqs, ctx);
- return 1;
+ return 0;
}
@@ -386,29 +386,35 @@
{
BYTE *const dict = (BYTE *)dictBuffer;
size_t tail = dictBufferCapacity;
- /* Divide the data up into epochs of equal size.
- * We will select at least one segment from each epoch.
- */
- const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
- const unsigned epochSize = (U32)(ctx->nbDmers / epochs);
+ /* Divide the data into epochs. We will select one segment from each epoch. */
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
+ (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
+ const size_t maxZeroScoreRun = 10;
+ size_t zeroScoreRun = 0;
size_t epoch;
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
- epochs, epochSize);
+ (U32)epochs.num, (U32)epochs.size);
/* Loop through the epochs until there are no more segments or the dictionary
* is full.
*/
- for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
- const U32 epochBegin = (U32)(epoch * epochSize);
- const U32 epochEnd = epochBegin + epochSize;
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+ const U32 epochBegin = (U32)(epoch * epochs.size);
+ const U32 epochEnd = epochBegin + epochs.size;
size_t segmentSize;
/* Select a segment */
COVER_segment_t segment = FASTCOVER_selectSegment(
ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
- /* If the segment covers no dmers, then we are out of content */
+ /* If the segment covers no dmers, then we are out of content.
+ * There may be new content in other epochs, for continue for some time.
+ */
if (segment.score == 0) {
- break;
+ if (++zeroScoreRun >= maxZeroScoreRun) {
+ break;
+ }
+ continue;
}
+ zeroScoreRun = 0;
/* Trim the segment if necessary and if it is too small then we are done */
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
@@ -429,7 +435,6 @@
return tail;
}
-
/**
* Parameters for FASTCOVER_tryParameters().
*/
@@ -458,6 +463,7 @@
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
/* Allocate space for hash table, dict, and freqs */
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+ COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
if (!segmentFreqs || !dict || !freqs) {
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
@@ -467,27 +473,24 @@
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
/* Build the dictionary */
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
- parameters, segmentFreqs);
+ parameters, segmentFreqs);
+
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
- dictBufferCapacity = ZDICT_finalizeDictionary(
- dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
- ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
- if (ZDICT_isError(dictBufferCapacity)) {
- DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
+ ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
+ totalCompressedSize);
+
+ if (COVER_dictSelectionIsError(selection)) {
+ DISPLAYLEVEL(1, "Failed to select dictionary\n");
goto _cleanup;
}
}
- /* Check total compressed size */
- totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
- ctx->samples, ctx->offsets,
- ctx->nbTrainSamples, ctx->nbSamples,
- dict, dictBufferCapacity);
_cleanup:
- COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
- dictBufferCapacity);
+ free(dict);
+ COVER_best_finish(data->best, parameters, selection);
free(data);
free(segmentFreqs);
- free(dict);
+ COVER_dictSelectionFree(selection);
free(freqs);
}
@@ -502,6 +505,7 @@
coverParams->nbThreads = fastCoverParams.nbThreads;
coverParams->splitPoint = fastCoverParams.splitPoint;
coverParams->zParams = fastCoverParams.zParams;
+ coverParams->shrinkDict = fastCoverParams.shrinkDict;
}
@@ -518,6 +522,7 @@
fastCoverParams->f = f;
fastCoverParams->accel = accel;
fastCoverParams->zParams = coverParams.zParams;
+ fastCoverParams->shrinkDict = coverParams.shrinkDict;
}
@@ -544,11 +549,11 @@
if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
parameters.accel)) {
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
- return ERROR(GENERIC);
+ return ERROR(parameter_outOfBound);
}
if (nbSamples == 0) {
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
- return ERROR(GENERIC);
+ return ERROR(srcSize_wrong);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
@@ -558,12 +563,16 @@
/* Assign corresponding FASTCOVER_accel_t to accelParams*/
accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
/* Initialize context */
- if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+ {
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
coverParams.d, parameters.splitPoint, parameters.f,
- accelParams)) {
- DISPLAYLEVEL(1, "Failed to initialize context\n");
- return ERROR(GENERIC);
+ accelParams);
+ if (ZSTD_isError(initVal)) {
+ DISPLAYLEVEL(1, "Failed to initialize context\n");
+ return initVal;
+ }
}
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
/* Build the dictionary */
DISPLAYLEVEL(2, "Building dictionary\n");
{
@@ -609,6 +618,7 @@
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+ const unsigned shrinkDict = 0;
/* Local variables */
const int displayLevel = parameters->zParams.notificationLevel;
unsigned iteration = 1;
@@ -616,22 +626,23 @@
unsigned k;
COVER_best_t best;
POOL_ctx *pool = NULL;
+ int warned = 0;
/* Checks */
if (splitPoint <= 0 || splitPoint > 1) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
- return ERROR(GENERIC);
+ return ERROR(parameter_outOfBound);
}
if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
- return ERROR(GENERIC);
+ return ERROR(parameter_outOfBound);
}
if (kMinK < kMaxD || kMaxK < kMinK) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
- return ERROR(GENERIC);
+ return ERROR(parameter_outOfBound);
}
if (nbSamples == 0) {
LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
- return ERROR(GENERIC);
+ return ERROR(srcSize_wrong);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
@@ -658,11 +669,18 @@
/* Initialize the context for this value of d */
FASTCOVER_ctx_t ctx;
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
- if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
- LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
- COVER_best_destroy(&best);
- POOL_free(pool);
- return ERROR(GENERIC);
+ {
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
+ if (ZSTD_isError(initVal)) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+ COVER_best_destroy(&best);
+ POOL_free(pool);
+ return initVal;
+ }
+ }
+ if (!warned) {
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
+ warned = 1;
}
/* Loop through k reusing the same context */
for (k = kMinK; k <= kMaxK; k += kStepSize) {
@@ -675,7 +693,7 @@
COVER_best_destroy(&best);
FASTCOVER_ctx_destroy(&ctx);
POOL_free(pool);
- return ERROR(GENERIC);
+ return ERROR(memory_allocation);
}
data->ctx = &ctx;
data->best = &best;
@@ -685,6 +703,7 @@
data->parameters.d = d;
data->parameters.splitPoint = splitPoint;
data->parameters.steps = kSteps;
+ data->parameters.shrinkDict = shrinkDict;
data->parameters.zParams.notificationLevel = g_displayLevel;
/* Check the parameters */
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,