@@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
207207 U32 cumulLength [LLIMIT ] = {0 };
208208 U32 savings [LLIMIT ] = {0 };
209209 const BYTE * b = (const BYTE * )buffer ;
210- size_t length ;
211210 size_t maxLength = LLIMIT ;
212211 size_t pos = suffix [start ];
213212 U32 end = start ;
@@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
222221 || (MEM_read16 (b + pos + 1 ) == MEM_read16 (b + pos + 3 ))
223222 || (MEM_read16 (b + pos + 2 ) == MEM_read16 (b + pos + 4 )) ) {
224223 /* skip and mark segment */
225- U16 u16 = MEM_read16 (b + pos + 4 );
226- U32 u , e = 6 ;
227- while (MEM_read16 (b + pos + e ) == u16 ) e += 2 ;
228- if (b [pos + e ] == b [pos + e - 1 ]) e ++ ;
229- for (u = 1 ; u < e ; u ++ )
224+ U16 const pattern16 = MEM_read16 (b + pos + 4 );
225+ U32 u , patternEnd = 6 ;
226+ while (MEM_read16 (b + pos + patternEnd ) == pattern16 ) patternEnd += 2 ;
227+ if (b [pos + patternEnd ] == b [pos + patternEnd - 1 ]) patternEnd ++ ;
228+ for (u = 1 ; u < patternEnd ; u ++ )
230229 doneMarks [pos + u ] = 1 ;
231230 return solution ;
232231 }
233232
234233 /* look forward */
235- do {
236- end ++ ;
237- length = ZDICT_count (b + pos , b + suffix [end ]);
238- } while (length >=MINMATCHLENGTH );
234+ { size_t length ;
235+ do {
236+ end ++ ;
237+ length = ZDICT_count (b + pos , b + suffix [end ]);
238+ } while (length >= MINMATCHLENGTH );
239+ }
239240
240241 /* look backward */
241- do {
242- length = ZDICT_count (b + pos , b + * (suffix + start - 1 ));
243- if (length >=MINMATCHLENGTH ) start -- ;
244- } while (length >= MINMATCHLENGTH );
242+ { size_t length ;
243+ do {
244+ length = ZDICT_count (b + pos , b + * (suffix + start - 1 ));
245+ if (length >=MINMATCHLENGTH ) start -- ;
246+ } while (length >= MINMATCHLENGTH );
247+ }
245248
246249 /* exit if not found a minimum nb of repetitions */
247250 if (end - start < minRatio ) {
@@ -268,7 +271,7 @@ static dictItem ZDICT_analyzePos(
268271 U32 selectedCount = 0 ;
269272 U32 selectedID = currentID ;
270273 for (id = refinedStart ; id < refinedEnd ; id ++ ) {
271- if (b [ suffix [id ] + searchLength ] != currentChar ) {
274+ if (b [suffix [id ] + searchLength ] != currentChar ) {
272275 if (currentCount > selectedCount ) {
273276 selectedCount = currentCount ;
274277 selectedID = currentID ;
@@ -297,20 +300,23 @@ static dictItem ZDICT_analyzePos(
297300 memset (lengthList , 0 , sizeof (lengthList ));
298301
299302 /* look forward */
300- do {
301- end ++ ;
302- length = ZDICT_count (b + pos , b + suffix [end ]);
303- if (length >= LLIMIT ) length = LLIMIT - 1 ;
304- lengthList [length ]++ ;
305- } while (length >=MINMATCHLENGTH );
303+ { size_t length ;
304+ do {
305+ end ++ ;
306+ length = ZDICT_count (b + pos , b + suffix [end ]);
307+ if (length >= LLIMIT ) length = LLIMIT - 1 ;
308+ lengthList [length ]++ ;
309+ } while (length >=MINMATCHLENGTH );
310+ }
306311
307312 /* look backward */
308- length = MINMATCHLENGTH ;
309- while ((length >= MINMATCHLENGTH ) & (start > 0 )) {
310- length = ZDICT_count (b + pos , b + suffix [start - 1 ]);
311- if (length >= LLIMIT ) length = LLIMIT - 1 ;
312- lengthList [length ]++ ;
313- if (length >= MINMATCHLENGTH ) start -- ;
313+ { size_t length = MINMATCHLENGTH ;
314+ while ((length >= MINMATCHLENGTH ) & (start > 0 )) {
315+ length = ZDICT_count (b + pos , b + suffix [start - 1 ]);
316+ if (length >= LLIMIT ) length = LLIMIT - 1 ;
317+ lengthList [length ]++ ;
318+ if (length >= MINMATCHLENGTH ) start -- ;
319+ }
314320 }
315321
316322 /* largest useful length */
@@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
345351 /* mark positions done */
346352 { U32 id ;
347353 for (id = start ; id < end ; id ++ ) {
348- U32 p , pEnd ;
354+ U32 p , pEnd , length ;
349355 U32 const testedPos = suffix [id ];
350356 if (testedPos == pos )
351357 length = solution .length ;
352358 else {
353- length = ZDICT_count (b + pos , b + testedPos );
359+ length = ( U32 ) ZDICT_count (b + pos , b + testedPos );
354360 if (length > solution .length ) length = solution .length ;
355361 }
356362 pEnd = (U32 )(testedPos + length );
@@ -575,29 +581,30 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
575581
576582typedef struct
577583{
578- ZSTD_CCtx * ref ;
579- ZSTD_CCtx * zc ;
584+ ZSTD_CCtx * ref ; /* contains reference to dictionary */
585+ ZSTD_CCtx * zc ; /* working context */
580586 void * workPlace ; /* must be ZSTD_BLOCKSIZE_MAX allocated */
581587} EStats_ress_t ;
582588
583589#define MAXREPOFFSET 1024
584590
585591static void ZDICT_countEStats (EStats_ress_t esr , ZSTD_parameters params ,
586- U32 * countLit , U32 * offsetcodeCount , U32 * matchlengthCount , U32 * litlengthCount , U32 * repOffsets ,
587- const void * src , size_t srcSize , U32 notificationLevel )
592+ U32 * countLit , U32 * offsetcodeCount , U32 * matchlengthCount , U32 * litlengthCount , U32 * repOffsets ,
593+ const void * src , size_t srcSize ,
594+ U32 notificationLevel )
588595{
589596 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX , 1 << params .cParams .windowLog );
590597 size_t cSize ;
591598
592599 if (srcSize > blockSizeMax ) srcSize = blockSizeMax ; /* protection vs large samples */
593- { size_t const errorCode = ZSTD_copyCCtx (esr .zc , esr .ref , 0 );
594- if (ZSTD_isError (errorCode )) { DISPLAYLEVEL (1 , "warning : ZSTD_copyCCtx failed \n" ); return ; }
600+ { size_t const errorCode = ZSTD_copyCCtx (esr .zc , esr .ref , 0 );
601+ if (ZSTD_isError (errorCode )) { DISPLAYLEVEL (1 , "warning : ZSTD_copyCCtx failed \n" ); return ; }
595602 }
596603 cSize = ZSTD_compressBlock (esr .zc , esr .workPlace , ZSTD_BLOCKSIZE_MAX , src , srcSize );
597604 if (ZSTD_isError (cSize )) { DISPLAYLEVEL (3 , "warning : could not compress sample size %u \n" , (U32 )srcSize ); return ; }
598605
599606 if (cSize ) { /* if == 0; block is not compressible */
600- const seqStore_t * seqStorePtr = ZSTD_getSeqStore (esr .zc );
607+ const seqStore_t * const seqStorePtr = ZSTD_getSeqStore (esr .zc );
601608
602609 /* literals stats */
603610 { const BYTE * bytePtr ;
@@ -659,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
659666 }
660667}
661668
669+ /* ZDICT_flatLit() :
670+ * overwrite `countLit[0..255]` with a fixed, mostly flat but still compressible distribution of literals : symbol 0 gets count 4, symbols 253 and 254 get count 1, every other symbol gets count 2. Previous contents are discarded, so `countLit` must hold at least 256 entries.
671+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
672+ */
673+ static void ZDICT_flatLit (U32 * countLit )
674+ {
675+ int u ;
676+ for (u = 1 ; u < 256 ; u ++ ) countLit [u ] = 2 ; /* baseline count for symbols 1..255 */
677+ countLit [0 ] = 4 ; /* symbol 0 receives the single largest count */
678+ countLit [253 ] = 1 ; /* lower two symbols below the baseline so the distribution is not perfectly flat */
679+ countLit [254 ] = 1 ;
680+ }
662681
663682#define OFFCODE_MAX 30 /* only applicable to first block */
664683static size_t ZDICT_analyzeEntropy (void * dstBuffer , size_t maxDstSize ,
@@ -688,6 +707,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
688707 BYTE * dstPtr = (BYTE * )dstBuffer ;
689708
690709 /* init */
710+ DEBUGLOG (4 , "ZDICT_analyzeEntropy" );
691711 esr .ref = ZSTD_createCCtx ();
692712 esr .zc = ZSTD_createCCtx ();
693713 esr .workPlace = malloc (ZSTD_BLOCKSIZE_MAX );
@@ -713,7 +733,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
713733 goto _cleanup ;
714734 } }
715735
716- /* collect stats on all files */
736+ /* collect stats on all samples */
717737 for (u = 0 ; u < nbFiles ; u ++ ) {
718738 ZDICT_countEStats (esr , params ,
719739 countLit , offcodeCount , matchLengthCount , litLengthCount , repOffset ,
@@ -722,14 +742,21 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
722742 pos += fileSizes [u ];
723743 }
724744
725- /* analyze */
726- errorCode = HUF_buildCTable (hufTable , countLit , 255 , huffLog );
727- if (HUF_isError (errorCode )) {
728- eSize = ERROR (GENERIC );
729- DISPLAYLEVEL (1 , "HUF_buildCTable error \n" );
730- goto _cleanup ;
745+ /* analyze, build stats, starting with literals */
746+ { size_t maxNbBits = HUF_buildCTable (hufTable , countLit , 255 , huffLog );
747+ if (HUF_isError (maxNbBits )) {
748+ eSize = ERROR (GENERIC );
749+ DISPLAYLEVEL (1 , " HUF_buildCTable error \n" );
750+ goto _cleanup ;
751+ }
752+ if (maxNbBits == 8 ) { /* not compressible : will fail on HUF_writeCTable() */
753+ DISPLAYLEVEL (2 , "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n" );
754+ ZDICT_flatLit (countLit ); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
755+ maxNbBits = HUF_buildCTable (hufTable , countLit , 255 , huffLog );
756+ assert (maxNbBits == 9 );
757+ }
758+ huffLog = (U32 )maxNbBits ;
731759 }
732- huffLog = (U32 )errorCode ;
733760
734761 /* looking for most common first offsets */
735762 { U32 offset ;
@@ -850,6 +877,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
850877 U32 const notificationLevel = params .notificationLevel ;
851878
852879 /* check conditions */
880+ DEBUGLOG (4 , "ZDICT_finalizeDictionary" );
853881 if (dictBufferCapacity < dictContentSize ) return ERROR (dstSize_tooSmall );
854882 if (dictContentSize < ZDICT_CONTENTSIZE_MIN ) return ERROR (srcSize_wrong );
855883 if (dictBufferCapacity < ZDICT_DICTSIZE_MIN ) return ERROR (dstSize_tooSmall );
@@ -1025,8 +1053,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
10251053}
10261054
10271055
1028- /* issue : samplesBuffer need to be followed by a noisy guard band.
1029- * work around : duplicate the buffer, and add the noise */
1056+ /* ZDICT_trainFromBuffer_legacy() :
1057+ * issue : samplesBuffer need to be followed by a noisy guard band.
1058+ * work around : duplicate the buffer, and add the noise */
10301059size_t ZDICT_trainFromBuffer_legacy (void * dictBuffer , size_t dictBufferCapacity ,
10311060 const void * samplesBuffer , const size_t * samplesSizes , unsigned nbSamples ,
10321061 ZDICT_legacy_params_t params )
@@ -1054,18 +1083,22 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
10541083 const void * samplesBuffer , const size_t * samplesSizes , unsigned nbSamples )
10551084{
10561085 ZDICT_cover_params_t params ;
1086+ DEBUGLOG (3 , "ZDICT_trainFromBuffer" );
10571087 memset (& params , 0 , sizeof (params ));
10581088 params .d = 8 ;
10591089 params .steps = 4 ;
1060- /* Default to level 6 since no compression level information is avaialble */
1090+ /* Default to level 6 since no compression level information is available */
10611091 params .zParams .compressionLevel = 6 ;
1092+ #if defined(ZSTD_DEBUG ) && (ZSTD_DEBUG >=1 )
1093+ params .zParams .notificationLevel = ZSTD_DEBUG ;
1094+ #endif
10621095 return ZDICT_optimizeTrainFromBuffer_cover (dictBuffer , dictBufferCapacity ,
1063- samplesBuffer , samplesSizes ,
1064- nbSamples , & params );
1096+ samplesBuffer , samplesSizes , nbSamples ,
1097+ & params );
10651098}
10661099
10671100size_t ZDICT_addEntropyTablesFromBuffer (void * dictBuffer , size_t dictContentSize , size_t dictBufferCapacity ,
1068- const void * samplesBuffer , const size_t * samplesSizes , unsigned nbSamples )
1101+ const void * samplesBuffer , const size_t * samplesSizes , unsigned nbSamples )
10691102{
10701103 ZDICT_params_t params ;
10711104 memset (& params , 0 , sizeof (params ));
0 commit comments