Skip to content

Commit 7d39bbb

Browse files
committed
Avoid allocating on every input line.
This is done by using `BufRead::read_line` instead of `BufRead::lines`. Allocations are now only required (a) for the first occurrence of a particular line, and (b) for lines that are modified by `-e`. This requires some changes to handle the cases where the line is a `&str` vs. a `String` (e.g. after modification due to `-e`). This roughly doubles the speed of `counts` on files where every line is the same, and leaves it unchanged on files where every line is different. Typical cases will fall between those two extremes. This idea came from @Shnatsel via nnethercote/perf-book#68.
1 parent af5ea94 commit 7d39bbb

2 files changed

Lines changed: 59 additions & 27 deletions

File tree

src/main.rs

Lines changed: 58 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -85,23 +85,23 @@ fn do_main() -> io::Result<()> {
8585

8686
let erased_label = if erase { ", erased" } else { "" };
8787
match weights {
88-
Unit => process(readers, "", |line| (line, 1i64)),
88+
Unit => process(readers, "", |_line| (None, 1i64)),
8989
Integral => {
9090
let re = Regex::new(r"(([+-]?)\d+)(\D*)$").unwrap();
9191
process(
9292
readers,
9393
&format!(" (weighted integral{})", erased_label),
9494
|line| {
95-
if let Some(captures) = re.captures(&line) {
95+
if let Some(captures) = re.captures(line) {
9696
let weight = i64::from_str(&captures[1]).unwrap();
97-
let line = if erase {
98-
re.replace(&line, "NNN${3}").to_string()
97+
if erase {
98+
let line = re.replace(line, "NNN${3}").to_string();
99+
(Some(line), weight)
99100
} else {
100-
line
101-
};
102-
(line, weight)
101+
(None, weight)
102+
}
103103
} else {
104-
(line, 1i64)
104+
(None, 1i64)
105105
}
106106
},
107107
)
@@ -112,16 +112,16 @@ fn do_main() -> io::Result<()> {
112112
readers,
113113
&format!(" (weighted fractional{})", erased_label),
114114
|line| {
115-
if let Some(captures) = re.captures(&line) {
115+
if let Some(captures) = re.captures(line) {
116116
let weight = f64::from_str(&captures[1]).unwrap();
117-
let line = if erase {
118-
re.replace(&line, "NNN${4}").to_string()
117+
if erase {
118+
let line = re.replace(line, "NNN${4}").to_string();
119+
(Some(line), weight)
119120
} else {
120-
line
121-
};
122-
(line, weight)
121+
(None, weight)
122+
}
123123
} else {
124-
(line, 1f64)
124+
(None, 1f64)
125125
}
126126
},
127127
)
@@ -137,22 +137,54 @@ fn process<F, N>(
137137
get_line_and_weight: F,
138138
) -> io::Result<()>
139139
where
140-
F: Fn(String) -> (String, N),
140+
F: Fn(&str) -> (Option<String>, N),
141141
N: Total,
142142
{
143-
let mut counts: FxHashMap<String, N> = FxHashMap::default();
144143
let mut total = N::from(0u32);
144+
let mut counts: FxHashMap<String, N> = FxHashMap::default();
145145

146-
for reader in readers {
147-
for line in reader.lines() {
148-
let Ok(line) = line else {
149-
eprintln!("counts: non-UTF-8 input detected, aborting");
150-
std::process::exit(1);
151-
};
152-
let (line, weight) = get_line_and_weight(line);
153-
let entry = counts.entry(line).or_insert_with(|| N::from(0u32));
154-
*entry += weight;
146+
// `reader.lines()` is the obvious way to do this loop, but that requires
147+
// allocating every line into a `String`. Instead we use
148+
// `reader.read_line()` and use a single string for every iteration. On the
149+
// first occurrence of a line we need to do a `to_string` to insert it into
150+
// the table. On subsequent occurrences we don't. Most `counts` input tends
151+
// to have significant numbers of repeated lines, so this approach reduces
152+
// allocation counts greatly.
153+
let mut line_with_nl = String::new();
154+
for mut reader in readers {
155+
loop {
156+
match reader.read_line(&mut line_with_nl) {
157+
Ok(0) => break,
158+
Ok(_) => {}
159+
Err(err) => {
160+
eprintln!("counts: {}", err);
161+
std::process::exit(1);
162+
}
163+
}
164+
165+
let line = &line_with_nl[..line_with_nl.len() - 1];
166+
let (modified_line, weight) = get_line_and_weight(line);
167+
match modified_line {
168+
None => {
169+
// The line has not been modified. Only promote to `String`
170+
// if it hasn't been seen before.
171+
if let Some(entry) = counts.get_mut(line) {
172+
*entry += weight;
173+
} else {
174+
counts.insert(line.to_string(), weight);
175+
}
176+
}
177+
Some(modified_line) => {
178+
// The line has been modified, which means it has already
179+
// been promoted to a `String`.
180+
let entry = counts.entry(modified_line).or_insert_with(|| N::from(0u32));
181+
*entry += weight;
182+
}
183+
}
155184
total += weight;
185+
186+
// We are finished with the contents of this line.
187+
line_with_nl.clear();
156188
}
157189
}
158190

tests/cli.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ baz 23 - +1
228228
#[test]
229229
fn non_utf8() -> Result<(), Box<dyn std::error::Error>> {
230230
let input = unsafe { std::str::from_utf8_unchecked(&[0x97, 0x98, 0x99, 0xff]) };
231-
let expected_output = "counts: non-UTF-8 input detected, aborting\n";
231+
let expected_output = "counts: stream did not contain valid UTF-8\n";
232232

233233
bad_test(input, expected_output)
234234
}

0 commit comments

Comments
 (0)