Skip to content

Commit be0d467

Browse files
author
Matthew Von-Maszewski
committed
Merge pull request #81 from basho/mv-bloom-size-limit
Mv bloom size limit
2 parents 78e42d1 + 79547d7 commit be0d467

File tree

7 files changed

+108
-49
lines changed

7 files changed

+108
-49
lines changed

db/db_impl.cc

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ DBImpl::KeepOrDelete(
315315
if (type == kTableFile) {
316316
// temporary hard coding of extra overlapped
317317
// levels
318-
table_cache_->Evict(number, (Level<3));
318+
table_cache_->Evict(number, (Level<config::kNumOverlapLevels));
319319
}
320320
Log(options_.info_log, "Delete type=%d #%lld\n",
321321
int(type),
@@ -1087,8 +1087,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
10871087
imm_micros+=PrioritizeWork(is_level0_compaction);
10881088

10891089
Slice key = input->key();
1090-
if (compact->compaction->ShouldStopBefore(key) &&
1091-
compact->builder != NULL) {
1090+
if (compact->builder != NULL
1091+
&& compact->compaction->ShouldStopBefore(key, compact->builder->NumEntries())) {
1092+
10921093
status = FinishCompactionOutputFile(compact, input);
10931094
if (!status.ok()) {
10941095
break;
@@ -1427,7 +1428,10 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
14271428
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
14281429
int throttle;
14291430

1431+
// protect use of versions_ ... apply lock
1432+
mutex_.Lock();
14301433
throttle=versions_->WriteThrottleUsec(bg_compaction_scheduled_);
1434+
mutex_.Unlock();
14311435
if (0!=throttle)
14321436
{
14331437
/// slowing each call down sequentially
@@ -1436,6 +1440,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
14361440
// throttle is per key write, how many in batch?
14371441
// (batch multiplier killed AAE, removed)
14381442
env_->SleepForMicroseconds(throttle /* * WriteBatchInternal::Count(my_batch)*/);
1443+
gPerfCounters->Add(ePerfDebug0, throttle);
14391444
} // if
14401445

14411446
Writer w(&mutex_);

db/dbformat.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,13 @@ class Compaction;
2222
// parameters set via options.
2323
namespace config {
2424
static const int kNumLevels = 7;
25+
static const int kNumOverlapLevels = 3;
2526

2627
// Level-0 compaction is started when we hit this many files.
27-
static const int kL0_CompactionTrigger = 4;
28+
static const size_t kL0_CompactionTrigger = 4;
2829

2930
// Soft limit on number of level-0 files. We slow down writes at this point.
30-
static const int kL0_SlowdownWritesTrigger = 8;
31+
static const size_t kL0_SlowdownWritesTrigger = 8;
3132

3233
// Maximum number of level-0 files. We stop writes at this point.
3334
static const int kL0_StopWritesTrigger = 12;

db/table_cache.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level
8282

8383
// temporary hardcoding to match number of levels defined as
8484
// overlapped in version_set.cc
85-
if (level<3)
85+
if (level<config::kNumOverlapLevels)
8686
cache_->Addref(*handle);
8787
}
8888
}

db/version_set.cc

Lines changed: 83 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@ static struct
3131
uint64_t m_MaxGrandParentOverlapBytes; //!< needs tuning, but not essential
3232
//!< since moves eliminated
3333
int64_t m_ExpandedCompactionByteSizeLimit; //!< needs tuning
34-
uint64_t m_MaxBytesForLevel; //!< ignored if m_OverlappedFiles is true
34+
35+
// next two ignored if m_OverlappedFiles is true
36+
uint64_t m_MaxBytesForLevel; //!< start write throttle above this
37+
uint64_t m_DesiredBytesForLevel; //!< compact into next level until this
38+
3539
uint64_t m_MaxFileSizeForLevel; //!< google really applies this
3640
//!< to file size of NEXT level
3741
bool m_OverlappedFiles; //!< false means sst files are sorted
@@ -44,15 +48,18 @@ static struct
4448
// to one level-1 file and are each slightly larger than 60,000,000.
4549
// level-1 file size of 1,500,000,000 applies to output file of this level
4650
// being written to level-2. The value is five times the 300,000,000 of level-1.
51+
// level-3 is the "landing zone" / first sorted level. Try to keep it clear.
52+
// hence the low m_DesiredBytesForLevel
4753

54+
// WARNING: m_OverlappedFiles flags need to match config::kNumOverlapLevels ... until unified
4855
{
49-
{10485760, 262144000, 57671680, 209715200, 300000000, true},
50-
{10485760, 262144000, 57671680, 419430400,1500000000, true},
51-
{10485760, 262144000, 57671680, 4194304000, 31457280, true},
52-
{10485760, 262144000, 57671680, 2097152000, 41943040, false},
53-
{10485760, 262144000, 57671680, 41943040000, 52428800, false},
54-
{10485760, 262144000, 57671680, 419430400000, 62914560, false},
55-
{10485760, 262144000, 57671680, 4194304000000, 73400320, false}
56+
{10485760, 262144000, 57671680, 209715200, 0, 300000000, true},
57+
{10485760, 262144000, 57671680, 419430400, 0, 1500000000, true},
58+
{10485760, 262144000, 57671680, 4194304000, 0, 31457280, true},
59+
{10485760, 125829120, 57671680, 1610612736, 30000000, 41943040, false},
60+
{10485760, 147286400, 57671680, 41943040000, 33554432000, 52428800, false},
61+
{10485760, 188743680, 57671680, 419430400000, 335544320000, 62914560, false},
62+
{10485760, 220200960, 57671680, 4194304000000, 3355443200000, 73400320, false}
5663
};
5764

5865

@@ -544,19 +551,17 @@ Version::VerifyLevels(
544551
if (!gLevelTraits[level].m_OverlappedFiles && 1<files_[level].size())
545552
{
546553
const std::vector<FileMetaData*>& files = files_[level];
547-
int inner, outer;
554+
size_t inner, outer;
548555

549556
for (outer=0; outer<files.size()-1 && !overlap_found; ++outer)
550557
{
551558
FileMetaData* outer_meta = files_[level][outer];
552-
const Slice outer_start = outer_meta->smallest.user_key();
553559
const Slice outer_limit = outer_meta->largest.user_key();
554560

555561
for (inner=outer+1; inner<files.size() && !overlap_found; ++inner)
556562
{
557563
FileMetaData* inner_meta = files_[level][inner];
558564
const Slice inner_start = inner_meta->smallest.user_key();
559-
const Slice inner_limit = inner_meta->largest.user_key();
560565

561566
// do files overlap? assumes vector sorted by "start"
562567
if (user_cmp->Compare(inner_start, outer_limit) <= 0)
@@ -1054,29 +1059,67 @@ void VersionSet::Finalize(Version* v) {
10541059
// file size is small (perhaps because of a small write-buffer
10551060
// setting, or very high compression ratios, or lots of
10561061
// overwrites/deletions).
1057-
score = v->files_[level].size() /
1058-
static_cast<double>(config::kL0_CompactionTrigger);
10591062

1060-
// don't screw around ... get data written to disk!
1061-
if (0==level
1062-
&& (size_t)config::kL0_SlowdownWritesTrigger <= v->files_[level].size())
1063-
score*=1000000.0;
1063+
score=0;
1064+
1065+
// score of 1 at compaction trigger, incrementing for each thereafter
1066+
if ( config::kL0_CompactionTrigger <= v->files_[level].size())
1067+
score += v->files_[level].size() - config::kL0_CompactionTrigger +1;
1068+
1069+
// raise score above slowdown trigger to ensure this out scores
1070+
// compactions at config::kNumOverlapLevels level
1071+
if ( config::kL0_SlowdownWritesTrigger <= v->files_[level].size())
1072+
score += (v->files_[level].size() - config::kL0_SlowdownWritesTrigger)*3;
10641073

10651074
// compute penalty for write throttle if too many Level-0 files accumulating
1066-
if ((size_t)config::kL0_CompactionTrigger < v->files_[level].size())
1075+
if (config::kL0_CompactionTrigger < v->files_[level].size())
10671076
{
1068-
penalty+=v->files_[level].size() - config::kL0_CompactionTrigger;
1077+
// assume each overlapped file represents another pass at same key
1078+
// and we are "close" on compaction backlog
1079+
if ( v->files_[level].size() < config::kL0_SlowdownWritesTrigger)
1080+
{
1081+
penalty+= (v->files_[level].size() - config::kL0_CompactionTrigger);
1082+
} // if
1083+
1084+
// no longer estimating work, now trying to throw on the brakes
1085+
// to keep leveldb from stalling
1086+
else
1087+
{
1088+
int loop, count, value;
1089+
1090+
count=(v->files_[level].size() - config::kL0_SlowdownWritesTrigger) +1;
1091+
1092+
// logarithmic throttle. 8 works against FusionIO, but 7 or 6 should be tested.
1093+
for (loop=0, value=8; loop<count; ++loop)
1094+
value*=8;
1095+
1096+
penalty+=value;
1097+
} // else
10691098
} // if
10701099

1100+
// don't screw around ... get data written to disk!
1101+
if (0==level
1102+
&& (size_t)config::kL0_SlowdownWritesTrigger <= v->files_[level].size())
1103+
score*=1000000.0;
1104+
else
1105+
score*=10; // give weight to overlapped levels over non-overlapped
1106+
10711107
} else {
10721108
// Compute the ratio of current size to size limit.
1109+
double penalty_score;
1110+
10731111
const uint64_t level_bytes = TotalFileSize(v->files_[level]);
1074-
score = static_cast<double>(level_bytes) / gLevelTraits[level].m_MaxBytesForLevel;
1112+
score = static_cast<double>(level_bytes) / gLevelTraits[level].m_DesiredBytesForLevel;
10751113

1076-
// riak 1.4: new overlapped levels remove the requirement for
1077-
// aggressive penalties here, hence the retirement of "*2" and previous "*5".
1078-
if (2.6<score)
1079-
penalty+=(static_cast<int>(score))-1;// was *2; // was *5;
1114+
if (config::kNumOverlapLevels!=level)
1115+
penalty_score = static_cast<double>(level_bytes) / gLevelTraits[level].m_MaxBytesForLevel;
1116+
1117+
// first sort layer needs to clear before next dump of overlapped files.
1118+
else
1119+
penalty_score = static_cast<double>(level_bytes) / gLevelTraits[level].m_DesiredBytesForLevel;
1120+
1121+
if (1.0<penalty_score)
1122+
penalty+=(static_cast<int>(penalty_score));
10801123
}
10811124

10821125
if (score > best_score) {
@@ -1088,8 +1131,9 @@ void VersionSet::Finalize(Version* v) {
10881131
v->compaction_level_ = best_level;
10891132
v->compaction_score_ = best_score;
10901133

1091-
if (500<penalty)
1092-
penalty=500;
1134+
if (100000<penalty)
1135+
penalty=100000;
1136+
10931137
v->write_penalty_ = penalty;
10941138

10951139
}
@@ -1546,11 +1590,10 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
15461590
return ret_flag;
15471591
}
15481592

1549-
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
1550-
#if 0
1551-
// 04/11/2013 code seems to create way too many small files
1552-
// in lower levels once highers start to populate
1553-
// this causes max_open_files to blow out too early
1593+
bool Compaction::ShouldStopBefore(const Slice& internal_key, size_t key_count) {
1594+
1595+
// This is a look ahead to see how costly this key will make the subsequent compaction
1596+
// of this new file to the next higher level. Start a new file if the cost is high.
15541597
if (!gLevelTraits[level()+1].m_OverlappedFiles)
15551598
{
15561599
// Scan to find earliest grandparent file that contains key.
@@ -1569,18 +1612,21 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {
15691612
// Too much overlap for current output; start new output
15701613
overlapped_bytes_ = 0;
15711614
return true;
1572-
} else {
1573-
return false;
1574-
}
1615+
} // if
1616+
1617+
// Second consideration: sorted files need to keep the bloom filter size controlled
1618+
// to meet file open speed goals
1619+
else
1620+
{
1621+
overlapped_bytes_ = 0;
1622+
return (75000 < key_count);
1623+
} // else
15751624
} // if
15761625
else
15771626
{
15781627
// overlapped levels do NOT split their output file
15791628
return false;
15801629
}
1581-
#else
1582-
return false;
1583-
#endif
15841630
}
15851631

15861632
void Compaction::ReleaseInputs() {

db/version_set.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ class Compaction {
357357

358358
// Returns true iff we should stop building the current output
359359
// before processing "internal_key".
360-
bool ShouldStopBefore(const Slice& internal_key);
360+
bool ShouldStopBefore(const Slice& internal_key, size_t key_count);
361361

362362
// Release the input version for the compaction, once the compaction
363363
// is successful.
@@ -382,7 +382,7 @@ class Compaction {
382382
std::vector<FileMetaData*> grandparents_;
383383
size_t grandparent_index_; // Index in grandparent_starts_
384384
bool seen_key_; // Some output key has been seen
385-
int64_t overlapped_bytes_; // Bytes of overlap between current output
385+
uint64_t overlapped_bytes_; // Bytes of overlap between current output
386386
// and grandparent files
387387

388388
// State for implementing IsBaseLevelForKey

util/env_posix.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,13 @@ void * arg)
105105

106106
// average write time for level 1+ compactions per key
107107
// times the average number of tasks waiting
108-
new_throttle=(tot_micros / tot_keys)
109-
* (tot_backlog / tot_compact);
108+
// ( the *100 stuff is to exploit fractional data in integers )
109+
new_throttle=((tot_micros*100) / tot_keys)
110+
* ((tot_backlog*100) / tot_compact);
110111

112+
new_throttle /= 10000; // remove *100 stuff
113+
if (0==new_throttle)
114+
new_throttle=1; // throttle must have an effect
111115
} // if
112116

113117
// attempt to most recent level0
@@ -122,7 +126,7 @@ void * arg)
122126
} // else if
123127
else
124128
{
125-
new_throttle=0;
129+
new_throttle=1;
126130
} // else
127131

128132
// change the throttle slowly
@@ -131,6 +135,9 @@ void * arg)
131135
else
132136
gThrottleRate-=(gThrottleRate - new_throttle)/THROTTLE_SCALING;
133137

138+
if (0==gThrottleRate)
139+
gThrottleRate=1; // throttle must always have an effect
140+
134141
gPerfCounters->Set(ePerfThrottleGauge, gThrottleRate);
135142
gPerfCounters->Add(ePerfThrottleCounter, gThrottleRate*THROTTLE_SECONDS);
136143

util/options.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Options::Options()
1515
: comparator(BytewiseComparator()),
1616
create_if_missing(false),
1717
error_if_exists(false),
18-
paranoid_checks(false),
18+
paranoid_checks(true),
1919
env(Env::Default()),
2020
info_log(NULL),
2121
write_buffer_size(4<<20),

0 commit comments

Comments
 (0)