import pickle

import numpy as np
import torch
from torch.autograd import Variable

PAD = '<PAD>'  # TODO


def save_pickle(d, path):
    print('save pickle to', path)
    with open(path, mode='wb') as f:
        pickle.dump(d, f)


def load_pickle(path):
    print('load', path)
    with open(path, mode='rb') as f:
        return pickle.load(f)


def load_vocabulary(vocab_path, label_path):
    # vocab file: one '<token_id>\t<word>' pair per line
    id_to_word = {}
    with open(vocab_path) as f:
        lines = f.readlines()
        for l in lines:
            d = l.rstrip().split('\t')
            if d[0] not in id_to_word:
                id_to_word[d[0]] = d[1]
    # label file: one '<label>\t<space-separated token ids>' pair per line
    label_to_ans = {}
    label_to_ans_text = {}
    with open(label_path) as f:
        lines = f.readlines()
        for l in lines:
            label, answer = l.rstrip().split('\t')
            if label not in label_to_ans:
                label_to_ans[label] = answer
                label_to_ans_text[label] = [id_to_word[t] for t in answer.split(' ')]
    return id_to_word, label_to_ans, label_to_ans_text
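
# A minimal usage sketch (the file names below are assumptions, not paths
# shipped with this code). Per the parsing above, a vocab line '<id>\t<word>'
# fills id_to_word, and a label line '<label>\t<id> <id> ...' fills
# label_to_ans (raw id string) and label_to_ans_text (word list):
#   id_to_word, label_to_ans, label_to_ans_text = load_vocabulary(
#       'vocabulary.txt', 'answers.txt')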


# def load_data_elmwise(fpath, id_to_word, label_to_ans_text):
#     data = []
#     with open(fpath) as f:
#         lines = f.readlines()
#         for l in lines:
#             d = l.rstrip().split('\t')
#             q = [id_to_word[t] for t in d[1].split(' ')]  # question
#             poss = [label_to_ans_text[t] for t in d[2].split(' ')]  # ground-truth
#             negs = [label_to_ans_text[t] for t in d[3].split(' ') if t not in d[2]]  # candidate-pool without ground-truth
#             for pos in poss:
#                 for neg in negs:
#                     data.append((q, pos, neg))
#     return data


def load_data(fpath, id_to_word, label_to_ans_text):
    data = []
    with open(fpath) as f:
        lines = f.readlines()
        for l in lines:
            d = l.rstrip().split('\t')
            q = [id_to_word[t] for t in d[1].split(' ')]  # question
            poss = [label_to_ans_text[t] for t in d[2].split(' ')]  # ground-truth
            # candidate-pool without ground-truth; d[2] is split so this is a
            # label match, not a substring match against the raw string
            negs = [label_to_ans_text[t] for t in d[3].split(' ') if t not in d[2].split(' ')]
            for pos in poss:
                data.append((q, pos, negs))
    return data
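
# load_data yields one (question, positive answer, negative answers) triple per
# ground-truth answer, with all three parts as token lists. A sketch of reading
# a training split (the file name is illustrative):
#   train = load_data('train.txt', id_to_word, label_to_ans_text)
#   q, pos, negs = train[0]  # negs is a list of token lists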


def load_data2(fpath, id_to_word, label_to_ans_text):
    data = []
    with open(fpath) as f:
        lines = f.readlines()
        for l in lines[12:]:  # the first 12 lines are skipped
            d = l.rstrip().split('\t')
            q = [id_to_word[t] for t in d[1].split(' ')]  # question
            # poss = [label_to_ans_text[t] for t in d[2].split(' ')]  # ground-truth
            # cands = [label_to_ans_text[t] for t in d[3].split(' ')]  # candidate-pool
            poss = d[2].split(' ')   # ground-truth labels
            cands = d[3].split(' ')  # candidate-pool labels
            data.append((q, poss, cands))
    return data
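
# Unlike load_data, load_data2 keeps raw label strings for the positives and
# the full candidate pool, which fits evaluation: rank every label in cands
# and check whether the top-ranked one is in poss. A sketch (file name
# illustrative):
#   dev = load_data2('dev.txt', id_to_word, label_to_ans_text)
#   q, poss, cands = dev[0]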


def to_var(x):
    # Variable is a no-op wrapper on PyTorch >= 0.4; kept here for
    # compatibility with the rest of the code base
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def to_np(x):
    return x.data.cpu().numpy()


def load_embd_weights(word2vec, vocab_size, embd_size, w2i):
    embedding_matrix = np.zeros((vocab_size, embd_size))
    print('embed_matrix.shape', embedding_matrix.shape)
    found_ct = 0
    for word, idx in w2i.items():
        # words not found in the embedding index stay all-zeros
        if word in word2vec.wv:
            embedding_matrix[idx] = word2vec.wv[word]
            found_ct += 1
    print(found_ct, 'words are found in word2vec. vocab_size is', vocab_size)
    return torch.from_numpy(embedding_matrix).type(torch.FloatTensor)
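
# A sketch of wiring this up with gensim (the model path is an assumption;
# any gensim model exposing a .wv attribute works here):
#   from gensim.models import Word2Vec
#   word2vec = Word2Vec.load('word2vec.model')
#   weights = load_embd_weights(word2vec, len(w2i), word2vec.wv.vector_size, w2i)
#   embedding = torch.nn.Embedding.from_pretrained(weights, freeze=False)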


def padding(data, max_sent_len, pad_token):
    # pad to max_sent_len, then truncate anything longer
    pad_len = max(0, max_sent_len - len(data))
    data += [pad_token] * pad_len
    data = data[:max_sent_len]
    return data


def make_vector(data, w2i, seq_len):
    ret_data = [padding([w2i[w] for w in d], seq_len, w2i[PAD]) for d in data]
    return to_var(torch.LongTensor(ret_data))
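
# Example (assumes w2i maps '<PAD>' and every token in the batch to an int id):
#   batch = make_vector([['hello', 'world'], ['hi']], w2i, seq_len=4)
#   # -> LongTensor of shape (2, 4); the shorter sentence is right-padded
#   #    with w2i['<PAD>']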


class Config(object):
    def __init__(self, **entries):
        self.__dict__.update(entries)
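
# Config turns a dict of settings into attribute access; the keys below are
# illustrative, not required by the class:
#   args = Config(**{'embd_size': 100, 'batch_size': 32})
#   args.embd_size  # -> 100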