Skip to content

Commit cacf47c

Browse files
committed
Merge branch 'dev' into dubtlazy
and fixed conflicts
2 parents b9a1490 + 04c00f9 commit cacf47c

File tree

13 files changed

+370
-323
lines changed

13 files changed

+370
-323
lines changed

appveyor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
- COMPILER: "gcc"
1010
HOST: "mingw"
1111
PLATFORM: "x64"
12-
SCRIPT: "make allzstd MOREFLAGS=-static && make -C tests test-symbols fullbench-dll fullbench-lib"
12+
SCRIPT: "make allzstd MOREFLAGS=-static && make -C tests test-symbols fullbench-lib"
1313
ARTIFACT: "true"
1414
BUILD: "true"
1515
- COMPILER: "gcc"

lib/common/huf.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,10 +206,10 @@ The following API allows targeting specific sub-functions for advanced tasks.
206206
For example, it's possible to compress several blocks using the same 'CTable',
207207
or to save and regenerate 'CTable' using external methods.
208208
*/
209-
/* FSE_count() : find it within "fse.h" */
209+
/* FSE_count() : exposed within "fse.h" */
210210
unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
211211
typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
212-
size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
212+
size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */
213213
size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
214214
size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
215215

lib/compress/huf_compress.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu
405405
}
406406

407407
/** HUF_buildCTable() :
408+
* @return : maxNbBits
408409
* Note : count is used before tree is written, so they can safely overlap
409410
*/
410411
size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)

lib/compress/zstd_compress.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2461,7 +2461,6 @@ ZSTD_CDict* ZSTD_initStaticCDict(void* workspace, size_t workspaceSize,
24612461
+ cctxSize;
24622462
ZSTD_CDict* const cdict = (ZSTD_CDict*) workspace;
24632463
void* ptr;
2464-
DEBUGLOG(4, "(size_t)workspace & 7 : %u", (U32)(size_t)workspace & 7);
24652464
if ((size_t)workspace & 7) return NULL; /* 8-aligned */
24662465
DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u",
24672466
(U32)workspaceSize, (U32)neededSize, (U32)(workspaceSize < neededSize));

lib/dictBuilder/cover.c

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -537,8 +537,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
537537
/* Checks */
538538
if (totalSamplesSize < MAX(d, sizeof(U64)) ||
539539
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
540-
DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
541-
(COVER_MAX_SAMPLES_SIZE >> 20));
540+
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
541+
(U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
542542
return 0;
543543
}
544544
/* Zero the context */
@@ -651,12 +651,16 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
651651
}
652652

653653
ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
654-
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
655-
const size_t *samplesSizes, unsigned nbSamples,
656-
ZDICT_cover_params_t parameters) {
657-
BYTE *const dict = (BYTE *)dictBuffer;
654+
void *dictBuffer, size_t dictBufferCapacity,
655+
const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
656+
ZDICT_cover_params_t parameters)
657+
{
658+
BYTE* const dict = (BYTE*)dictBuffer;
658659
COVER_ctx_t ctx;
659660
COVER_map_t activeDmers;
661+
662+
/* Initialize global data */
663+
g_displayLevel = parameters.zParams.notificationLevel;
660664
/* Checks */
661665
if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
662666
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
@@ -671,8 +675,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
671675
ZDICT_DICTSIZE_MIN);
672676
return ERROR(dstSize_tooSmall);
673677
}
674-
/* Initialize global data */
675-
g_displayLevel = parameters.zParams.notificationLevel;
676678
/* Initialize context and activeDmers */
677679
if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
678680
parameters.d)) {
@@ -947,6 +949,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
947949
unsigned k;
948950
COVER_best_t best;
949951
POOL_ctx *pool = NULL;
952+
950953
/* Checks */
951954
if (kMinK < kMaxD || kMaxK < kMinK) {
952955
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");

lib/dictBuilder/zdict.c

Lines changed: 83 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
207207
U32 cumulLength[LLIMIT] = {0};
208208
U32 savings[LLIMIT] = {0};
209209
const BYTE* b = (const BYTE*)buffer;
210-
size_t length;
211210
size_t maxLength = LLIMIT;
212211
size_t pos = suffix[start];
213212
U32 end = start;
@@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
222221
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
223222
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
224223
/* skip and mark segment */
225-
U16 u16 = MEM_read16(b+pos+4);
226-
U32 u, e = 6;
227-
while (MEM_read16(b+pos+e) == u16) e+=2 ;
228-
if (b[pos+e] == b[pos+e-1]) e++;
229-
for (u=1; u<e; u++)
224+
U16 const pattern16 = MEM_read16(b+pos+4);
225+
U32 u, patternEnd = 6;
226+
while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
227+
if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
228+
for (u=1; u<patternEnd; u++)
230229
doneMarks[pos+u] = 1;
231230
return solution;
232231
}
233232

234233
/* look forward */
235-
do {
236-
end++;
237-
length = ZDICT_count(b + pos, b + suffix[end]);
238-
} while (length >=MINMATCHLENGTH);
234+
{ size_t length;
235+
do {
236+
end++;
237+
length = ZDICT_count(b + pos, b + suffix[end]);
238+
} while (length >= MINMATCHLENGTH);
239+
}
239240

240241
/* look backward */
241-
do {
242-
length = ZDICT_count(b + pos, b + *(suffix+start-1));
243-
if (length >=MINMATCHLENGTH) start--;
244-
} while(length >= MINMATCHLENGTH);
242+
{ size_t length;
243+
do {
244+
length = ZDICT_count(b + pos, b + *(suffix+start-1));
245+
if (length >=MINMATCHLENGTH) start--;
246+
} while(length >= MINMATCHLENGTH);
247+
}
245248

246249
/* exit if not found a minimum nb of repetitions */
247250
if (end-start < minRatio) {
@@ -268,7 +271,7 @@ static dictItem ZDICT_analyzePos(
268271
U32 selectedCount = 0;
269272
U32 selectedID = currentID;
270273
for (id =refinedStart; id < refinedEnd; id++) {
271-
if (b[ suffix[id] + searchLength] != currentChar) {
274+
if (b[suffix[id] + searchLength] != currentChar) {
272275
if (currentCount > selectedCount) {
273276
selectedCount = currentCount;
274277
selectedID = currentID;
@@ -297,20 +300,23 @@ static dictItem ZDICT_analyzePos(
297300
memset(lengthList, 0, sizeof(lengthList));
298301

299302
/* look forward */
300-
do {
301-
end++;
302-
length = ZDICT_count(b + pos, b + suffix[end]);
303-
if (length >= LLIMIT) length = LLIMIT-1;
304-
lengthList[length]++;
305-
} while (length >=MINMATCHLENGTH);
303+
{ size_t length;
304+
do {
305+
end++;
306+
length = ZDICT_count(b + pos, b + suffix[end]);
307+
if (length >= LLIMIT) length = LLIMIT-1;
308+
lengthList[length]++;
309+
} while (length >=MINMATCHLENGTH);
310+
}
306311

307312
/* look backward */
308-
length = MINMATCHLENGTH;
309-
while ((length >= MINMATCHLENGTH) & (start > 0)) {
310-
length = ZDICT_count(b + pos, b + suffix[start - 1]);
311-
if (length >= LLIMIT) length = LLIMIT - 1;
312-
lengthList[length]++;
313-
if (length >= MINMATCHLENGTH) start--;
313+
{ size_t length = MINMATCHLENGTH;
314+
while ((length >= MINMATCHLENGTH) & (start > 0)) {
315+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
316+
if (length >= LLIMIT) length = LLIMIT - 1;
317+
lengthList[length]++;
318+
if (length >= MINMATCHLENGTH) start--;
319+
}
314320
}
315321

316322
/* largest useful length */
@@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
345351
/* mark positions done */
346352
{ U32 id;
347353
for (id=start; id<end; id++) {
348-
U32 p, pEnd;
354+
U32 p, pEnd, length;
349355
U32 const testedPos = suffix[id];
350356
if (testedPos == pos)
351357
length = solution.length;
352358
else {
353-
length = ZDICT_count(b+pos, b+testedPos);
359+
length = (U32)ZDICT_count(b+pos, b+testedPos);
354360
if (length > solution.length) length = solution.length;
355361
}
356362
pEnd = (U32)(testedPos + length);
@@ -575,29 +581,30 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
575581

576582
typedef struct
577583
{
578-
ZSTD_CCtx* ref;
579-
ZSTD_CCtx* zc;
584+
ZSTD_CCtx* ref; /* contains reference to dictionary */
585+
ZSTD_CCtx* zc; /* working context */
580586
void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
581587
} EStats_ress_t;
582588

583589
#define MAXREPOFFSET 1024
584590

585591
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
586-
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
587-
const void* src, size_t srcSize, U32 notificationLevel)
592+
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
593+
const void* src, size_t srcSize,
594+
U32 notificationLevel)
588595
{
589596
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
590597
size_t cSize;
591598

592599
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
593-
{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
594-
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
600+
{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
601+
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
595602
}
596603
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
597604
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
598605

599606
if (cSize) { /* if == 0; block is not compressible */
600-
const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
607+
const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
601608

602609
/* literals stats */
603610
{ const BYTE* bytePtr;
@@ -659,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
659666
}
660667
}
661668

669+
/* ZDICT_flatLit() :
670+
* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
671+
* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
* All 256 entries (indices 0-255) of `countLit` are overwritten; previous counts are discarded.
* Resulting counts : symbol 0 => 4, symbols 253 and 254 => 1, every other symbol => 2.
672+
*/
673+
static void ZDICT_flatLit(U32* countLit)
674+
{
675+
int u;
676+
for (u=1; u<256; u++) countLit[u] = 2;   /* flat baseline for symbols 1..255 */
677+
countLit[0] = 4;   /* symbol 0 is made more frequent than the baseline */
678+
countLit[253] = 1;   /* symbols 253 and 254 are made less frequent than the baseline */
679+
countLit[254] = 1;
680+
}
662681

663682
#define OFFCODE_MAX 30 /* only applicable to first block */
664683
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
@@ -688,6 +707,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
688707
BYTE* dstPtr = (BYTE*)dstBuffer;
689708

690709
/* init */
710+
DEBUGLOG(4, "ZDICT_analyzeEntropy");
691711
esr.ref = ZSTD_createCCtx();
692712
esr.zc = ZSTD_createCCtx();
693713
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
@@ -713,7 +733,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
713733
goto _cleanup;
714734
} }
715735

716-
/* collect stats on all files */
736+
/* collect stats on all samples */
717737
for (u=0; u<nbFiles; u++) {
718738
ZDICT_countEStats(esr, params,
719739
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
@@ -722,14 +742,21 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
722742
pos += fileSizes[u];
723743
}
724744

725-
/* analyze */
726-
errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
727-
if (HUF_isError(errorCode)) {
728-
eSize = ERROR(GENERIC);
729-
DISPLAYLEVEL(1, "HUF_buildCTable error \n");
730-
goto _cleanup;
745+
/* analyze, build stats, starting with literals */
746+
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
747+
if (HUF_isError(maxNbBits)) {
748+
eSize = ERROR(GENERIC);
749+
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
750+
goto _cleanup;
751+
}
752+
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
753+
DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
754+
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
755+
maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
756+
assert(maxNbBits==9);
757+
}
758+
huffLog = (U32)maxNbBits;
731759
}
732-
huffLog = (U32)errorCode;
733760

734761
/* looking for most common first offsets */
735762
{ U32 offset;
@@ -850,6 +877,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
850877
U32 const notificationLevel = params.notificationLevel;
851878

852879
/* check conditions */
880+
DEBUGLOG(4, "ZDICT_finalizeDictionary");
853881
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
854882
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
855883
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
@@ -1025,8 +1053,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
10251053
}
10261054

10271055

1028-
/* issue : samplesBuffer need to be followed by a noisy guard band.
1029-
* work around : duplicate the buffer, and add the noise */
1056+
/* ZDICT_trainFromBuffer_legacy() :
1057+
* issue : samplesBuffer needs to be followed by a noisy guard band.
1058+
* workaround : duplicate the buffer, and add the noise */
10301059
size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
10311060
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
10321061
ZDICT_legacy_params_t params)
@@ -1054,18 +1083,22 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
10541083
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
10551084
{
10561085
ZDICT_cover_params_t params;
1086+
DEBUGLOG(3, "ZDICT_trainFromBuffer");
10571087
memset(&params, 0, sizeof(params));
10581088
params.d = 8;
10591089
params.steps = 4;
1060-
/* Default to level 6 since no compression level information is avaialble */
1090+
/* Default to level 6 since no compression level information is available */
10611091
params.zParams.compressionLevel = 6;
1092+
#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
1093+
params.zParams.notificationLevel = ZSTD_DEBUG;
1094+
#endif
10621095
return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
1063-
samplesBuffer, samplesSizes,
1064-
nbSamples, &params);
1096+
samplesBuffer, samplesSizes, nbSamples,
1097+
&params);
10651098
}
10661099

10671100
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1068-
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1101+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
10691102
{
10701103
ZDICT_params_t params;
10711104
memset(&params, 0, sizeof(params));

0 commit comments

Comments
 (0)