-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathcs_inferrer.py
More file actions
executable file
·104 lines (78 loc) · 4.09 KB
/
cs_inferrer.py
File metadata and controls
executable file
·104 lines (78 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
'''
Base class for context sensitive inference modules
'''
import re
from nltk.stem.wordnet import WordNetLemmatizer
from jcs.data.pos import to_wordnet_pos
# Fallback lemmas (common high-frequency English words) used to pad the
# result dict when the inferrer generated fewer results than needed.
default_generated_results = ['time', 'people', 'information', 'work', 'first', 'like', 'year', 'make', 'day', 'service']
# Accept purely alphabetic tokens only; the commented-out variant below also
# allowed internal hyphens (e.g. "well-known").
#generated_word_re = re.compile('^[a-zA-Z]+(-[a-zA-Z]+)*$')
generated_word_re = re.compile('^[a-zA-Z]+$')
class CsInferrer(object):
    '''
    Base class for context-sensitive inference modules.

    Tracks per-word inference timing and post-processes a result vector of
    (word, weight) pairs into dicts keyed by lemma (or candidate token).
    '''

    def __init__(self):
        '''
        Constructor
        '''
        # time[0] = total seconds spent, time[1] = number of words processed
        self.time = [0.0, 0]

    def inference_time(self, seconds):
        # Accumulate the processing time for one inferred word.
        self.time[0] += seconds
        self.time[1] += 1

    def msec_per_word(self):
        # Average processing time per word in milliseconds (0.0 if no words yet).
        return 1000 * self.time[0] / self.time[1] if self.time[1] > 0 else 0.0

    def generate_inferred(self, result_vec, target_word, target_lemma, pos):
        '''
        Convert result_vec, an iterable of (word, weight) pairs, into a dict
        mapping lemma -> best weight.

        Junk tokens (non-alphabetic) and the target word/lemma itself are
        excluded.  Duplicate lemmas keep their maximum weight.  If fewer
        results than len(default_generated_results) remain, the dict is
        padded with default lemmas at weights strictly below every real
        result, so downstream ranking always has enough entries.
        '''
        generated_results = {}
        min_weight = None
        if result_vec is not None:
            # Hoist loop invariants: the POS mapping and the lemmatizer
            # instance (the original rebuilt WordNetLemmatizer per word).
            wn_pos = to_wordnet_pos[pos]
            lemmatizer = WordNetLemmatizer()
            for word, weight in result_vec:
                if generated_word_re.match(word) is not None:  # make sure this is not junk
                    lemma = lemmatizer.lemmatize(word, wn_pos)
                    if word != target_word and lemma != target_lemma:
                        if lemma in generated_results:
                            weight = max(weight, generated_results[lemma])
                        generated_results[lemma] = weight
                        min_weight = weight if min_weight is None else min(min_weight, weight)
        if min_weight is None:
            min_weight = 0.0
        # Pad with defaults; each gets a progressively lower weight so they
        # always rank below every genuinely inferred result.
        i = 0.0
        for lemma in default_generated_results:
            if len(generated_results) >= len(default_generated_results):
                break
            if lemma in generated_results:
                # BUGFIX: the original overwrote a real result's weight with
                # an artificial negative one when a default lemma had already
                # been inferred; skip it instead.
                continue
            i -= 1.0
            generated_results[lemma] = min_weight + i
        return generated_results

    def filter_inferred(self, result_vec, candidates, pos):
        '''
        Keep only results matching the candidate set.

        For each (word, weight) in result_vec, its lemma, the raw word, and
        their title-cased variants are checked against candidates (some few
        candidate lists are not lemmatized and/or are capitalized).  Each
        matched candidate maps to its maximum observed weight.
        '''
        filtered_results = {}
        candidates_found = set()
        if result_vec is not None:
            # Loop-invariant hoists, as in generate_inferred.
            wn_pos = to_wordnet_pos[pos]
            lemmatizer = WordNetLemmatizer()
            for word, weight in result_vec:
                lemma = lemmatizer.lemmatize(word, wn_pos)
                for token in (lemma, lemma.title(), word, word.title()):
                    if token in candidates:
                        self.add_inference_result(token, weight, filtered_results, candidates_found)
        # assign negative weights for candidates with no score
        # they will appear last sorted according to their unigram count
        # candidates_left = candidates - candidates_found
        # for candidate in candidates_left:
        #     count = self.w2counts[candidate] if candidate in self.w2counts else 1
        #     score = -1 - (1.0/count)  # between (-1,-2]
        #     filtered_results[candidate] = score
        return filtered_results

    def add_inference_result(self, token, weight, filtered_results, candidates_found):
        # Record the token, keeping the maximum weight seen so far.
        candidates_found.add(token)
        previous = filtered_results.get(token)
        if previous is None or weight > previous:
            filtered_results[token] = weight