forked from katryo/bing_search_naive_bayes
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_with_fetched_web_pages.py
More file actions
39 lines (35 loc) · 1.27 KB
/
train_with_fetched_web_pages.py
File metadata and controls
39 lines (35 loc) · 1.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
import pickle
import constants
from web_page import WebPage
from naive_bayes import NaiveBayes
def load_html_files():
    """Read the fetched HTML files and return them as WebPage objects.

    Must be called while the current working directory is the directory
    that holds the HTML files: the files are opened by bare file name,
    ``<constants.QUERY>_<index>.html``, for every index in
    ``range(constants.NUM_OF_FETCHED_PAGES)``.

    Each file's content is stored in ``html_body`` of a new ``WebPage``,
    after which ``remove_html_tags()`` is called on that page.

    Returns:
        A list of the ``WebPage`` objects, in index order.
    """
    loaded = []
    for index in range(constants.NUM_OF_FETCHED_PAGES):
        filename = '%s_%s.html' % (constants.QUERY, index)
        with open(filename, 'r') as html_file:
            web_page = WebPage()
            web_page.html_body = html_file.read()
        # The file is no longer needed once its content is on the page.
        web_page.remove_html_tags()
        loaded.append(web_page)
    return loaded
if __name__ == '__main__':
    # NOTE: turn this into a function if it is ever needed somewhere else.
    pages_dir = constants.FETCHED_PAGES_DIR_NAME
    if not os.path.exists(pages_dir):
        os.mkdir(pages_dir)
    # load_html_files() opens the pages by bare file name, so move into
    # the directory that holds them first.
    os.chdir(pages_dir)
    fetched_pages = load_html_files()

    # After the chdir above, the pickle path points at the parent directory.
    classifier_path = os.path.join('..', constants.NB_PKL_FILENAME)

    # If a NaiveBayes object has already been pickled, keep training that
    # one; otherwise start from a fresh classifier.
    if not os.path.exists(classifier_path):
        classifier = NaiveBayes()
    else:
        with open(classifier_path, 'rb') as pickle_in:
            classifier = pickle.load(pickle_in)

    # Every fetched page is trained under the search query as its category.
    for fetched_page in fetched_pages:
        classifier.train(fetched_page.html_body, constants.QUERY)

    # The training took effort, so persist the result.
    with open(classifier_path, 'wb') as pickle_out:
        pickle.dump(classifier, pickle_out)