Mercurial > public > mercurial-scm > hg
diff contrib/python-zstandard/zstd/dictBuilder/fastcover.c @ 42937:69de49c4e39c
zstandard: vendor python-zstandard 0.12
The upstream source distribution from PyPI was extracted. Unwanted
files were removed.
The clang-format ignore list was updated to reflect the new source
of files.
test-repo-compengines.t was updated to reflect a change in behavior
of the zstd library.
The project contains a vendored copy of zstandard 1.4.3. The old
version was 1.3.8. This should result in some minor performance wins.
# no-check-commit because 3rd party code has different style guidelines
Differential Revision: https://phab.mercurial-scm.org/D6858
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Sun, 15 Sep 2019 20:04:00 -0700 |
parents | 675775c33ab6 |
children |
line wrap: on
line diff
--- a/contrib/python-zstandard/zstd/dictBuilder/fastcover.c Sun Sep 15 00:07:30 2019 -0400 +++ b/contrib/python-zstandard/zstd/dictBuilder/fastcover.c Sun Sep 15 20:04:00 2019 -0700 @@ -132,7 +132,7 @@ * * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}) * - * Once the dmer with hash value d is in the dictionay we set F(d) = 0. + * Once the dmer with hash value d is in the dictionary we set F(d) = 0. */ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, U32 *freqs, U32 begin, U32 end, @@ -161,7 +161,7 @@ /* Get hash value of current dmer */ const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d); - /* Add frequency of this index to score if this is the first occurence of index in active segment */ + /* Add frequency of this index to score if this is the first occurrence of index in active segment */ if (segmentFreqs[idx] == 0) { activeSegment.score += freqs[idx]; } @@ -287,10 +287,10 @@ * Prepare a context for dictionary building. * The context is only dependent on the parameter `d` and can used multiple * times. - * Returns 1 on success or zero on error. + * Returns 0 on success or error code on error. * The context must be destroyed with `FASTCOVER_ctx_destroy()`. */ -static int +static size_t FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, @@ -310,19 +310,19 @@ totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) { DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20)); - return 0; + return ERROR(srcSize_wrong); } /* Check if there are at least 5 training samples */ if (nbTrainSamples < 5) { DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples); - return 0; + return ERROR(srcSize_wrong); } /* Check if there's testing sample */ if (nbTestSamples < 1) { DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples); - return 0; + return ERROR(srcSize_wrong); } /* Zero the context */ @@ -347,7 +347,7 @@ if (ctx->offsets == NULL) { DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n"); FASTCOVER_ctx_destroy(ctx); - return 0; + return ERROR(memory_allocation); } /* Fill offsets from the samplesSizes */ @@ -364,13 +364,13 @@ if (ctx->freqs == NULL) { DISPLAYLEVEL(1, "Failed to allocate frequency table \n"); FASTCOVER_ctx_destroy(ctx); - return 0; + return ERROR(memory_allocation); } DISPLAYLEVEL(2, "Computing frequencies\n"); FASTCOVER_computeFrequency(ctx->freqs, ctx); - return 1; + return 0; } @@ -386,29 +386,35 @@ { BYTE *const dict = (BYTE *)dictBuffer; size_t tail = dictBufferCapacity; - /* Divide the data up into epochs of equal size. - * We will select at least one segment from each epoch. - */ - const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k)); - const unsigned epochSize = (U32)(ctx->nbDmers / epochs); + /* Divide the data into epochs. We will select one segment from each epoch. */ + const COVER_epoch_info_t epochs = COVER_computeEpochs( + (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1); + const size_t maxZeroScoreRun = 10; + size_t zeroScoreRun = 0; size_t epoch; DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", - epochs, epochSize); + (U32)epochs.num, (U32)epochs.size); /* Loop through the epochs until there are no more segments or the dictionary * is full. */ - for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { - const U32 epochBegin = (U32)(epoch * epochSize); - const U32 epochEnd = epochBegin + epochSize; + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) { + const U32 epochBegin = (U32)(epoch * epochs.size); + const U32 epochEnd = epochBegin + epochs.size; size_t segmentSize; /* Select a segment */ COVER_segment_t segment = FASTCOVER_selectSegment( ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs); - /* If the segment covers no dmers, then we are out of content */ + /* If the segment covers no dmers, then we are out of content. + * There may be new content in other epochs, for continue for some time. + */ if (segment.score == 0) { - break; + if (++zeroScoreRun >= maxZeroScoreRun) { + break; + } + continue; } + zeroScoreRun = 0; /* Trim the segment if necessary and if it is too small then we are done */ segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); @@ -429,7 +435,6 @@ return tail; } - /** * Parameters for FASTCOVER_tryParameters(). */ @@ -458,6 +463,7 @@ U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16)); /* Allocate space for hash table, dict, and freqs */ BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); if (!segmentFreqs || !dict || !freqs) { DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); @@ -467,27 +473,24 @@ memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32)); /* Build the dictionary */ { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity, - parameters, segmentFreqs); + parameters, segmentFreqs); + const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100); - dictBufferCapacity = ZDICT_finalizeDictionary( - dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, - ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams); - if (ZDICT_isError(dictBufferCapacity)) { - DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); + selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, + totalCompressedSize); + + if (COVER_dictSelectionIsError(selection)) { + DISPLAYLEVEL(1, "Failed to select dictionary\n"); goto _cleanup; } } - /* Check total compressed size */ - totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes, - ctx->samples, ctx->offsets, - ctx->nbTrainSamples, ctx->nbSamples, - dict, dictBufferCapacity); _cleanup: - COVER_best_finish(data->best, totalCompressedSize, parameters, dict, - dictBufferCapacity); + free(dict); + COVER_best_finish(data->best, parameters, selection); free(data); free(segmentFreqs); - free(dict); + COVER_dictSelectionFree(selection); free(freqs); } @@ -502,6 +505,7 @@ coverParams->nbThreads = fastCoverParams.nbThreads; coverParams->splitPoint = fastCoverParams.splitPoint; coverParams->zParams = fastCoverParams.zParams; + coverParams->shrinkDict = fastCoverParams.shrinkDict; } @@ -518,6 +522,7 @@ fastCoverParams->f = f; fastCoverParams->accel = accel; fastCoverParams->zParams = coverParams.zParams; + fastCoverParams->shrinkDict = coverParams.shrinkDict; } @@ -544,11 +549,11 @@ if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f, parameters.accel)) { DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (nbSamples == 0) { DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); - return ERROR(GENERIC); + return ERROR(srcSize_wrong); } if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", @@ -558,12 +563,16 @@ /* Assign corresponding FASTCOVER_accel_t to accelParams*/ accelParams = FASTCOVER_defaultAccelParameters[parameters.accel]; /* Initialize context */ - if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + { + size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, coverParams.d, parameters.splitPoint, parameters.f, - accelParams)) { - DISPLAYLEVEL(1, "Failed to initialize context\n"); - return ERROR(GENERIC); + accelParams); + if (ZSTD_isError(initVal)) { + DISPLAYLEVEL(1, "Failed to initialize context\n"); + return initVal; + } } + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel); /* Build the dictionary */ DISPLAYLEVEL(2, "Building dictionary\n"); { @@ -609,6 +618,7 @@ (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f; const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel; + const unsigned shrinkDict = 0; /* Local variables */ const int displayLevel = parameters->zParams.notificationLevel; unsigned iteration = 1; @@ -616,22 +626,23 @@ unsigned k; COVER_best_t best; POOL_ctx *pool = NULL; + int warned = 0; /* Checks */ if (splitPoint <= 0 || splitPoint > 1) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (kMinK < kMaxD || kMaxK < kMinK) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (nbSamples == 0) { LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n"); - return ERROR(GENERIC); + return ERROR(srcSize_wrong); } if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n", @@ -658,11 +669,18 @@ /* Initialize the context for this value of d */ FASTCOVER_ctx_t ctx; LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); - if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) { - LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); - COVER_best_destroy(&best); - POOL_free(pool); - return ERROR(GENERIC); + { + size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams); + if (ZSTD_isError(initVal)) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); + COVER_best_destroy(&best); + POOL_free(pool); + return initVal; + } + } + if (!warned) { + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel); + warned = 1; } /* Loop through k reusing the same context */ for (k = kMinK; k <= kMaxK; k += kStepSize) { @@ -675,7 +693,7 @@ COVER_best_destroy(&best); FASTCOVER_ctx_destroy(&ctx); POOL_free(pool); - return ERROR(GENERIC); + return ERROR(memory_allocation); } data->ctx = &ctx; data->best = &best; @@ -685,6 +703,7 @@ data->parameters.d = d; data->parameters.splitPoint = splitPoint; data->parameters.steps = kSteps; + data->parameters.shrinkDict = shrinkDict; data->parameters.zParams.notificationLevel = g_displayLevel; /* Check the parameters */ if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,