-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdoc2vec_train.py
More file actions
51 lines (43 loc) · 1.41 KB
/
doc2vec_train.py
File metadata and controls
51 lines (43 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# -*- coding: utf-8 -*-
"""Prepare (and optionally train) a Doc2Vec model over a line-per-document corpus.

Usage:
    python doc2vec_train.py [input_file [model_file]]

Each line of *input_file* is treated as one document (tagged by its line
number).  Defaults: ./data/moby.txt and ./model/sample.model.
"""
from gensim.models import doc2vec
import sys
import multiprocessing

cores = multiprocessing.cpu_count()

# Doc2Vec hyperparameters (consumed by the reference training code below).
vector_size = 300          # dimensionality of the document vectors
window_size = 15           # max distance between current and context words
word_min_count = 2         # ignore words with total frequency below this
sampling_threshold = 1e-5  # downsampling threshold for frequent words
negative_size = 5          # number of negative samples per positive
train_epoch = 100          # number of training passes over the corpus
dm = 1                     # 0 = DBOW; 1 = DM ("dmpv")
worker_count = cores       # number of parallel worker threads

print(len(sys.argv))
# Command-line overrides.  Fix: previously a single extra argument
# (len(sys.argv) == 2) fell through to BOTH defaults, silently discarding
# the user-supplied corpus path; each argument now defaults independently.
inputfile = sys.argv[1] if len(sys.argv) >= 2 else "./data/moby.txt"
modelfile = sys.argv[2] if len(sys.argv) >= 3 else "./model/sample.model"
word2vec_file = modelfile + ".word2vec_format"

# One TaggedDocument per line of the corpus, tagged by line number.
sentences = doc2vec.TaggedLineDocument(inputfile)

# Reference training code (intentionally disabled, as in the original).
# Updated for gensim >= 4.0:
#   - `size` was renamed `vector_size`
#   - the manual epoch loop with hand-rolled alpha decay is a documented
#     anti-pattern; call train() once with `epochs=` instead
#   - train() requires `total_examples` (use model.corpus_count)
#   - `save_word2vec_format` now lives on `model.wv`
#
# print("building vocabulary")
# doc_vectorizer = doc2vec.Doc2Vec(
#     dm=dm,
#     vector_size=vector_size,
#     window=window_size,
#     min_count=word_min_count,
#     sample=sampling_threshold,
#     negative=negative_size,
#     workers=worker_count,
#     seed=1234,
# )
# doc_vectorizer.build_vocab(sentences)
#
# print("training document vectors")
# doc_vectorizer.train(
#     sentences,
#     total_examples=doc_vectorizer.corpus_count,
#     epochs=train_epoch,
# )
#
# # Persist the model (native format) and the word vectors (word2vec text format).
# doc_vectorizer.save(modelfile)
# doc_vectorizer.wv.save_word2vec_format(word2vec_file, binary=False)