@@ -31,7 +31,11 @@ static struct
3131 uint64_t m_MaxGrandParentOverlapBytes; // !< needs tuning, but not essential
3232 // !< since moves eliminated
3333 int64_t m_ExpandedCompactionByteSizeLimit; // !< needs tuning
34- uint64_t m_MaxBytesForLevel; // !< ignored if m_OverlappedFiles is true
34+
35+ // next two ignored if m_OverlappedFiles is true
36+ uint64_t m_MaxBytesForLevel; // !< start write throttle above this
37+ uint64_t m_DesiredBytesForLevel; // !< compact into next level until this
38+
3539 uint64_t m_MaxFileSizeForLevel; // !< google really applies this
3640 // !< to file size of NEXT level
3741 bool m_OverlappedFiles; // !< false means sst files are sorted
@@ -44,15 +48,18 @@ static struct
4448// to one level-1 file and are each slightly larger than 60,000,000.
4549// level-1 file size of 1,500,000,000 applies to output file of this level
4650// being written to level-2. The value is five times the 300,000,000 of level-1.
 51+     // level-3 is the "landing zone" / first sorted level. Try to keep it clear,
 52+     //  hence the low m_DesiredBytesForLevel.
4753
54+ // WARNING: m_OverlappedFiles flags need to match config::kNumOverlapFiles ... until unified
4855{
49- {10485760 , 262144000 , 57671680 , 209715200 , 300000000 , true },
50- {10485760 , 262144000 , 57671680 , 419430400 ,1500000000 , true },
51- {10485760 , 262144000 , 57671680 , 4194304000 , 31457280 , true },
52- {10485760 , 262144000 , 57671680 , 2097152000 , 41943040 , false },
53- {10485760 , 262144000 , 57671680 , 41943040000 , 52428800 , false },
54- {10485760 , 262144000 , 57671680 , 419430400000 , 62914560 , false },
55- {10485760 , 262144000 , 57671680 , 4194304000000 , 73400320 , false }
56+ {10485760 , 262144000 , 57671680 , 209715200 , 0 , 300000000 , true },
57+ {10485760 , 262144000 , 57671680 , 419430400 , 0 , 1500000000 , true },
58+ {10485760 , 262144000 , 57671680 , 4194304000 , 0 , 31457280 , true },
59+ {10485760 , 125829120 , 57671680 , 1610612736 , 30000000 , 41943040 , false },
60+ {10485760 , 147286400 , 57671680 , 41943040000 , 33554432000 , 52428800 , false },
61+ {10485760 , 188743680 , 57671680 , 419430400000 , 335544320000 , 62914560 , false },
62+ {10485760 , 220200960 , 57671680 , 4194304000000 , 3355443200000 , 73400320 , false }
5663};
5764
5865
@@ -544,19 +551,17 @@ Version::VerifyLevels(
544551 if (!gLevelTraits [level].m_OverlappedFiles && 1 <files_[level].size ())
545552 {
546553 const std::vector<FileMetaData*>& files = files_[level];
547- int inner, outer;
554+ size_t inner, outer;
548555
549556 for (outer=0 ; outer<files.size ()-1 && !overlap_found; ++outer)
550557 {
551558 FileMetaData* outer_meta = files_[level][outer];
552- const Slice outer_start = outer_meta->smallest .user_key ();
553559 const Slice outer_limit = outer_meta->largest .user_key ();
554560
555561 for (inner=outer+1 ; inner<files.size () && !overlap_found; ++inner)
556562 {
557563 FileMetaData* inner_meta = files_[level][inner];
558564 const Slice inner_start = inner_meta->smallest .user_key ();
559- const Slice inner_limit = inner_meta->largest .user_key ();
560565
561566 // do files overlap? assumes vector sorted by "start"
562567 if (user_cmp->Compare (inner_start, outer_limit) <= 0 )
@@ -1054,29 +1059,67 @@ void VersionSet::Finalize(Version* v) {
10541059 // file size is small (perhaps because of a small write-buffer
10551060 // setting, or very high compression ratios, or lots of
10561061 // overwrites/deletions).
1057- score = v->files_ [level].size () /
1058- static_cast <double >(config::kL0_CompactionTrigger );
10591062
1060- // don't screw around ... get data written to disk!
1061- if (0 ==level
1062- && (size_t )config::kL0_SlowdownWritesTrigger <= v->files_ [level].size ())
1063- score*=1000000.0 ;
1063+ score=0 ;
1064+
1065+ // score of 1 at compaction trigger, incrementing for each thereafter
1066+ if ( config::kL0_CompactionTrigger <= v->files_ [level].size ())
1067+ score += v->files_ [level].size () - config::kL0_CompactionTrigger +1 ;
1068+
 1069+             // raise score above slowdown trigger to ensure this outscores
1070+ // compactions at config::kNumOverlapLevels level
1071+ if ( config::kL0_SlowdownWritesTrigger <= v->files_ [level].size ())
1072+ score += (v->files_ [level].size () - config::kL0_SlowdownWritesTrigger )*3 ;
10641073
10651074 // compute penalty for write throttle if too many Level-0 files accumulating
1066- if (( size_t ) config::kL0_CompactionTrigger < v->files_ [level].size ())
1075+ if (config::kL0_CompactionTrigger < v->files_ [level].size ())
10671076 {
1068- penalty+=v->files_ [level].size () - config::kL0_CompactionTrigger ;
1077+ // assume each overlapped file represents another pass at same key
1078+ // and we are "close" on compaction backlog
1079+ if ( v->files_ [level].size () < config::kL0_SlowdownWritesTrigger )
1080+ {
1081+ penalty+= (v->files_ [level].size () - config::kL0_CompactionTrigger );
1082+ } // if
1083+
 1084+             // no longer estimating work, now trying to throw on the brakes
1085+ // to keep leveldb from stalling
1086+ else
1087+ {
1088+ int loop, count, value;
1089+
1090+ count=(v->files_ [level].size () - config::kL0_SlowdownWritesTrigger ) +1 ;
1091+
1092+ // logarithmic throttle. 8 works against FusionIO, but 7 or 6 should be tested.
1093+ for (loop=0 , value=8 ; loop<count; ++loop)
1094+ value*=8 ;
1095+
1096+ penalty+=value;
1097+ } // else
10691098 } // if
10701099
1100+ // don't screw around ... get data written to disk!
1101+ if (0 ==level
1102+ && (size_t )config::kL0_SlowdownWritesTrigger <= v->files_ [level].size ())
1103+ score*=1000000.0 ;
1104+ else
1105+ score*=10 ; // give weight to overlapped levels over non-overlapped
1106+
10711107 } else {
10721108 // Compute the ratio of current size to size limit.
1109+ double penalty_score;
1110+
10731111 const uint64_t level_bytes = TotalFileSize (v->files_ [level]);
1074- score = static_cast <double >(level_bytes) / gLevelTraits [level].m_MaxBytesForLevel ;
1112+ score = static_cast <double >(level_bytes) / gLevelTraits [level].m_DesiredBytesForLevel ;
10751113
1076- // riak 1.4: new overlapped levels remove the requirement for
1077- // aggressive penalties here, hence the retirement of "*2" and previous "*5".
1078- if (2.6 <score)
1079- penalty+=(static_cast <int >(score))-1 ;// was *2; // was *5;
1114+ if (config::kNumOverlapLevels !=level)
1115+ penalty_score = static_cast <double >(level_bytes) / gLevelTraits [level].m_MaxBytesForLevel ;
1116+
1117+ // first sort layer needs to clear before next dump of overlapped files.
1118+ else
1119+ penalty_score = static_cast <double >(level_bytes) / gLevelTraits [level].m_DesiredBytesForLevel ;
1120+
1121+ if (1.0 <penalty_score)
1122+ penalty+=(static_cast <int >(penalty_score));
10801123 }
10811124
10821125 if (score > best_score) {
@@ -1088,8 +1131,9 @@ void VersionSet::Finalize(Version* v) {
10881131 v->compaction_level_ = best_level;
10891132 v->compaction_score_ = best_score;
10901133
1091- if (500 <penalty)
1092- penalty=500 ;
1134+ if (100000 <penalty)
1135+ penalty=100000 ;
1136+
10931137 v->write_penalty_ = penalty;
10941138
10951139}
@@ -1546,11 +1590,10 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
15461590 return ret_flag;
15471591}
15481592
1549- bool Compaction::ShouldStopBefore (const Slice& internal_key) {
1550- #if 0
1551- // 04/11/2013 code seems to create way too many small files
1552- // in lower levels once highers start to populate
1553- // this causes max_open_files to blow out too early
1593+ bool Compaction::ShouldStopBefore (const Slice& internal_key, size_t key_count) {
1594+
1595+ // This is a look ahead to see how costly this key will make the subsequent compaction
1596+ // of this new file to the next higher level. Start a new file if the cost is high.
15541597 if (!gLevelTraits [level ()+1 ].m_OverlappedFiles )
15551598 {
15561599 // Scan to find earliest grandparent file that contains key.
@@ -1569,18 +1612,21 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {
15691612 // Too much overlap for current output; start new output
15701613 overlapped_bytes_ = 0 ;
15711614 return true ;
1572- } else {
1573- return false;
1574- }
1615+ } // if
1616+
1617+ // Second consideration: sorted files need to keep the bloom filter size controlled
1618+ // to meet file open speed goals
1619+ else
1620+ {
1621+ overlapped_bytes_ = 0 ;
1622+ return (75000 < key_count);
1623+ } // else
15751624 } // if
15761625 else
15771626 {
15781627 // overlapped levels do NOT split their output file
15791628 return false ;
15801629 }
1581- #else
1582- return false ;
1583- #endif
15841630}
15851631
15861632void Compaction::ReleaseInputs () {
0 commit comments