forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbench_text_vectorizers.py
More file actions
72 lines (51 loc) · 2 KB
/
bench_text_vectorizers.py
File metadata and controls
72 lines (51 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
To run this benchmark, you will need:
* scikit-learn
* pandas
* memory_profiler
* psutil (optional, but recommended)
"""
import timeit
import itertools
import numpy as np
import pandas as pd
from memory_profiler import memory_usage
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer,
HashingVectorizer)
# Number of timed runs per (vectorizer, analyzer, ngram_range) combination;
# passed as ``repeat`` to ``timeit.repeat`` and echoed in the run-time report.
n_repeat = 3
def run_vectorizer(Vectorizer, X, **params):
    """Build a zero-argument callable that fits a fresh vectorizer on ``X``.

    Parameters
    ----------
    Vectorizer : callable
        Called as ``Vectorizer(**params)`` to create the object under test;
        the result must provide a ``fit_transform`` method.
    X : object
        Passed unchanged to ``fit_transform``.
    **params
        Keyword arguments forwarded to ``Vectorizer``.

    Returns
    -------
    callable
        A function taking no arguments. Each call constructs a new
        vectorizer, runs ``fit_transform(X)`` on it, discards the result and
        returns ``None``.
    """

    def _fit_once():
        # The instance is created inside the closure, so construction is
        # part of whatever the caller measures on each call.
        Vectorizer(**params).fit_transform(X)

    return _fit_once
# Benchmark corpus: the first 1000 documents of the 20 newsgroups training
# subset. NOTE(review): fetch_20newsgroups is expected to download the data
# on first use if it is not cached locally -- confirm for your environment.
text = fetch_20newsgroups(subset='train').data[:1000]

# Banner and a short description of what is about to run.
print("="*80 + '\n#' + " Text vectorizers benchmark" + '\n' + '='*80 + '\n')
print("Using a subset of the 20 newsgroups dataset ({} documents)."
      .format(len(text)))
print("This benchmark runs in ~1 min ...")
# Every vectorizer class is benchmarked against every analyzer / n-gram
# setting (3 classes x 4 settings = 12 configurations).
vectorizer_classes = [CountVectorizer, TfidfVectorizer, HashingVectorizer]
analyzer_settings = [
    ('word', (1, 1)),
    ('word', (1, 2)),
    ('char', (4, 4)),
    ('char_wb', (4, 4)),
]

# One record (dict) per configuration; turned into a DataFrame afterwards.
res = []
for Vectorizer, (analyzer, ngram_range) in itertools.product(
        vectorizer_classes, analyzer_settings):
    params = {'analyzer': analyzer, 'ngram_range': ngram_range}
    bench = dict(vectorizer=Vectorizer.__name__, **params)

    # Run time: n_repeat measurements of a single fit_transform call each,
    # summarised as "mean (+-std)".
    dt = timeit.repeat(run_vectorizer(Vectorizer, text, **params),
                       repeat=n_repeat,
                       number=1)
    bench['time'] = "{:.3f} (+-{:.3f})".format(np.mean(dt), np.std(dt))

    # Memory: a separate run is sampled by memory_usage and the largest
    # sample is kept.
    mem_usage = memory_usage(run_vectorizer(Vectorizer, text, **params))
    bench['memory'] = "{:.1f}".format(np.max(mem_usage))

    res.append(bench)
# Index the records by configuration; 'vectorizer' is the last index level,
# so unstacking level -1 yields one column per vectorizer class.
index_columns = ['analyzer', 'ngram_range', 'vectorizer']
df = pd.DataFrame(res).set_index(index_columns)

# Run-time table.
print('\n========== Run time performance (sec) ===========\n')
runtime_note = ('Computing the mean and the standard deviation '
                'of the run time over {} runs...\n')
print(runtime_note.format(n_repeat))
time_table = df['time'].unstack(level=-1)
print(time_table)

# Memory table.
print('\n=============== Memory usage (MB) ===============\n')
memory_table = df['memory'].unstack(level=-1)
print(memory_table)