-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathjcs_main.py
More file actions
executable file
·154 lines (112 loc) · 6.79 KB
/
jcs_main.py
File metadata and controls
executable file
·154 lines (112 loc) · 6.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
'''
Run lexical substitution experiments
'''
import sys
import time
import argparse
import re
import numpy
from jcs.jcs_io import extract_word_weight
from jcs.data.context_instance import ContextInstance
from jcs.jcs_io import vec_to_str
from jcs.jcs_io import vec_to_str_generated
from jcs.cs_embedding_inferrer import CsEmbeddingInferrer
from jcs.context2vec_inferrer import Context2vecInferrer
# Extracts the target word from a context line where it is marked as __word__.
target_re = re.compile(".*__(.*)__.*")
def read_candidates(candidates_file):
    """
    Parse a substitution-candidates file into a {target: set(candidates)} dict.

    Each line has the form 'target.pos::cand1;cand2;...', for example:
        finally.r::eventually;ultimately
    """
    target2candidates = {}
    with open(candidates_file, 'r') as fin:
        for raw_line in fin:
            parts = raw_line.split('::')
            target2candidates[parts[0]] = set(parts[1].strip().split(';'))
    return target2candidates
def run_test(inferrer):
if args.candidatesfile != None:
target2candidates = read_candidates(args.candidatesfile)
else:
target2candidates = None
tfi = open(args.testfile, 'r')
tfo = open(args.resultsfile, 'w')
tfo_ranked = open(args.resultsfile+'.ranked', 'w')
tfo_generated_oot = open(args.resultsfile+'.generated.oot', 'w')
tfo_generated_best = open(args.resultsfile+'.generated.best', 'w')
lines = 0
while True:
context_line = tfi.readline()
if not context_line:
break;
lst_instance = ContextInstance(context_line, args.no_pos)
lines += 1
if (args.debug == True):
tfo.write("\nTest context:\n")
tfo.write("***************\n")
tfo.write(lst_instance.decorate_context())
result_vec = inferrer.find_inferred(lst_instance, tfo)
generated_results = inferrer.generate_inferred(result_vec, lst_instance.target, lst_instance.target_lemma, lst_instance.pos)
tfo.write("\nGenerated lemmatized results\n")
tfo.write("***************\n")
tfo.write("GENERATED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + " ::: " + vec_to_str_generated(generated_results.iteritems(), args.topgenerated)+"\n")
tfo_generated_oot.write(' '.join([lst_instance.full_target_key, lst_instance.target_id]) + " ::: " + vec_to_str_generated(generated_results.iteritems(), args.topgenerated)+"\n")
tfo_generated_best.write(' '.join([lst_instance.full_target_key, lst_instance.target_id]) + " :: " + vec_to_str_generated(generated_results.iteritems(), 1)+"\n")
filtered_results = inferrer.filter_inferred(result_vec, target2candidates[lst_instance.target_key], lst_instance.pos)
tfo.write("\nFiltered results\n")
tfo.write("***************\n")
tfo.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + "\t" + vec_to_str(filtered_results.iteritems(), len(filtered_results))+"\n")
tfo_ranked.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + "\t" + vec_to_str(filtered_results.iteritems(), len(filtered_results))+"\n")
# print "end %f" % time.time()
if lines % 10 == 0:
print "Read %d lines" % lines
print "Read %d lines in total" % lines
print "Time per word: %f msec" % inferrer.msec_per_word()
tfi.close()
tfo.close()
tfo_ranked.close()
tfo_generated_oot.close()
tfo_generated_best.close()
def run(args):
print time.asctime(time.localtime(time.time()))
if args.inferrer == 'emb':
inferrer = CsEmbeddingInferrer(args.vocabfile, args.ignoretarget, args.contextmath, args.embeddingpath, args.embeddingpathc, args.testfileconll, args.bow_size, 10)
print "Using CsEmbeddingInferrer"
elif args.inferrer == 'lstm':
inferrer = Context2vecInferrer(args.lstm_config, args.ignoretarget, args.contextmath, 10)
print "Using Context2vecInferrer"
else:
raise Exception("Unknown inferrer type: " + args.inferrer)
print time.asctime(time.localtime(time.time()))
run_test(inferrer)
print "Finished"
print time.asctime(time.localtime(time.time()))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='JCS utility')
    parser.add_argument('--inferrer', choices=['lstm', 'emb'],
                        default='lstm',
                        help='context type ("lstm", "emb")')
    # Only for Context2vecInferrer
    parser.add_argument('-lstm_config', action="store", dest="lstm_config", default=None, help="config file of lstm context model and respective word embeddings")
    # Only for CsEmbeddingInferrer
    parser.add_argument('-embeddingpath', action="store", dest="embeddingpath", default=None, help="prefix to files containing word embeddings")
    parser.add_argument('-embeddingpathc', action="store", dest="embeddingpathc", default=None, help="prefix to files containing context word embeddings")
    parser.add_argument('-vocabfile', action="store", dest="vocabfile")
    parser.add_argument('-bow', action='store', dest='bow_size', default=-1, type=int, help="context bag-of-words window size. 0 means entire sentence. -1 means syntactic dependency contexts.")
    # Common
    parser.add_argument('-targetsfile', action="store", dest="targetsfile", default=None)
    parser.add_argument('-testfile', action="store", dest="testfile")
    parser.add_argument('-testfileconll', action="store", dest="testfileconll", default=None, help="test file with sentences parsed in conll format")
    parser.add_argument('-candidatesfile', action="store", dest="candidatesfile", default=None)
    parser.add_argument('-resultsfile', action="store", dest="resultsfile")
    parser.add_argument('-contextmath', action="store", dest="contextmath", default=None, help="arithmetics used to consider context [add|mult|geomean|none]")
    parser.add_argument('--ignoretarget', action="store_true", dest="ignoretarget", default=False, help="ignore lhs target. compute only context compatibility.")
    parser.add_argument('--nopos', action='store_true', dest='no_pos', default=False, help="ignore part-of-speech of target word")
    parser.add_argument('-topgenerated', action="store", dest="topgenerated", type=int, default=10, help="top entries to print in generated parvecs")
    parser.add_argument('--debug', action='store_true', dest='debug')
    args = parser.parse_args(sys.argv[1:])

    # Record the exact command line next to the results, for reproducibility.
    config_file_name = args.resultsfile + ".CONFIG"
    with open(config_file_name, 'w') as cf:
        cf.write(' '.join(sys.argv) + '\n')

    # Fail fast on any floating-point anomaly during inference.
    numpy.seterr(all='raise', divide='raise', over='raise', under='raise', invalid='raise')
    run(args)