-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
194 lines (143 loc) · 4.83 KB
/
Makefile
File metadata and controls
194 lines (143 loc) · 4.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Targets that name actions rather than files.
.PHONY: clean all copy_cmudict copy_kilgarriff download_cmudict download_kilgarriff test

# Most recipes in this file write their target through a shell redirection.
# Without this, a step that fails midway leaves a truncated file behind that
# Make would treat as up to date on the next run.
.DELETE_ON_ERROR:

# Files built by the default goal.
ALL := intermediate/correlated_ipa
ALL += intermediate/correlated_ipa_no_spaces
ALL += target/q1_frequencies
ALL += target/q2_post_w_frequencies

all: $(ALL)
# Remove everything under source/, intermediate/ and target/.
# $(RM) is Make's built-in `rm -f`.
clean:
	$(RM) source/* intermediate/* target/*
# Copy a data set from local_source/ into source/. The file name is the
# target name with its `copy_` prefix removed (copy_cmudict -> cmudict).
copy_cmudict copy_kilgarriff:
	cp local_source/$(@:copy_%=%) source/$(@:copy_%=%)
# Fetch CMUdict 0.7b into source/cmudict.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as if it were the dictionary; --location follows
# redirects.
download_cmudict:
	curl --fail --location \
		http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b \
		> source/cmudict
# Fetch the gzipped Kilgarriff BNC list and store it uncompressed as
# source/kilgarriff.
# --fail makes curl exit non-zero on an HTTP error instead of piping the
# server's error page into gunzip; --location follows redirects.
download_kilgarriff:
	curl --fail --location \
		http://www.kilgarriff.co.uk/BNClists/all.num.gz \
		| gunzip \
		> source/kilgarriff
# Diff the built outputs against the copies kept under local_intermediate/
# and local_target/. The built files are listed as prerequisites so that
# `make test` builds them first instead of failing on a missing file, and so
# that `make -j all test` cannot diff a file that is still being written.
test: intermediate/correlated_ipa target/q1_frequencies target/q2_post_w_frequencies
	diff local_intermediate/correlated_ipa intermediate/correlated_ipa
	diff local_target/q1_frequencies target/q1_frequencies
	diff local_target/q2_post_w_frequencies target/q2_post_w_frequencies
############################################################
# Populate source/ from the copies under local_source/. The prerequisites are
# the real input files rather than the phony copy_* targets: a phony
# prerequisite is always out of date, which forced the entire pipeline to
# rerun on every invocation.
source/cmudict: local_source/cmudict
	cp $< $@
source/kilgarriff: local_source/kilgarriff
	cp $< $@
# Alternative: uncomment these (and comment out the two rules above) to
# fetch from upstream on every run instead of copying.
# source/cmudict: download_cmudict
# source/kilgarriff: download_kilgarriff
############################################################
# ;;; # CMUdict
# CAT K AE1 T
# READ R EH1 D
# READ(1) R IY1 D
# THE DH AH0
# ZOO Z UW1
# Drop the lines that start with `;;;`. --text makes grep treat the input as
# text even if it contains bytes that look binary.
intermediate/cmudict_05_no_comments: source/cmudict
	grep --text -v '^;;;' $< > $@
# CAT K AE1 T
# READ R EH1 D
# READ(1) R IY1 D
# THE DH AH0
# ZOO Z UW1
# There's what looks like bad unicode in exactly one entry in
# cmudict. (Something like "DEJA". "DEJA" in all ASCII is in fact
# present.) It's probably OK to just remove this entry entirely.
# Remove every entry whose pronunciation contains `D EY2 JH AA1`.
intermediate/cmudict_07_remove_bad_unicode: intermediate/cmudict_05_no_comments
	grep --text -v 'D EY2 JH AA1' $< > $@
# Keep only the first pronunciation of each word by dropping the alternate
# entries, which are tagged WORD(n), e.g. READ(1).
# The parentheses are deliberately unescaped: in grep's basic regex syntax
# `\(` and `\)` delimit a group, so the escaped form matched any headword
# containing a digit rather than the literal "(n)" marker.
intermediate/cmudict_10_first_only: intermediate/cmudict_07_remove_bad_unicode
	grep --text -v '^[^ ]*([0-9])' $< > $@
# CAT K AE1 T
# READ R EH1 D
# THE DH AH0
# ZOO Z UW1
# Filter the dictionary through scripts/discard_stress.py (stdin -> stdout).
intermediate/cmudict_20_discard_stress: intermediate/cmudict_10_first_only
	scripts/discard_stress.py < $< > $@
# CAT K AE T
# READ R EH D
# THE DH AH
# ZOO Z UW
# Publish the last cmudict stage under a stable name for later rules.
intermediate/cmudict_processed: intermediate/cmudict_20_discard_stress
	cp $< $@
# CAT K AE T
# READ R EH D
# THE DH AH
# ZOO Z UW
############################################################
# 36 !!WHOLE_CORPUS !!ANY 10
# 20 the at0 5
# 10 read vvx 3
# 5 read vvy 2
# 1 zoo nn1 1
# Keep only the first two whitespace-separated fields of every line.
# `$$` is Make's escape for a literal `$` in the awk program.
intermediate/kilgarriff_05_discard_fields: source/kilgarriff
	awk '{print $$1, $$2}' $< > $@
# 36 !!WHOLE_CORPUS
# 20 the
# 10 read
# 5 read
# 1 zoo
# Delete the first line of the list (the whole-corpus total).
intermediate/kilgarriff_07_discard_total: intermediate/kilgarriff_05_discard_fields
	sed '1d' $< > $@
# 20 the
# 10 read
# 5 read
# 1 zoo
# Filter the list through scripts/squash_kilgarriff.py (stdin -> stdout).
intermediate/kilgarriff_10_squashed: intermediate/kilgarriff_07_discard_total
	scripts/squash_kilgarriff.py < $< > $@
# 20 the
# 15 read
# 1 zoo
# Order the lines by their leading number, largest first.
intermediate/kilgarriff_20_sorted: intermediate/kilgarriff_10_squashed
	sort -n -r $< > $@
# Publish the last kilgarriff stage under a stable name for later rules.
intermediate/kilgarriff_processed: intermediate/kilgarriff_20_sorted
	cp $< $@
# 20 the
# 15 read
# 1 zoo
############################################################
# Run scripts/make_correlated.py on the two processed inputs. Besides its
# target, this recipe also generates the `uncorrelated` file (last argument).
intermediate/correlated_arpa: intermediate/kilgarriff_processed intermediate/cmudict_processed
	scripts/make_correlated.py \
		intermediate/cmudict_processed \
		intermediate/kilgarriff_processed \
		$@ \
		intermediate/uncorrelated
# `uncorrelated` is a by-product of the rule above. Declaring it with an
# empty recipe lets `make intermediate/uncorrelated` work by (re)building
# correlated_arpa when that is out of date. This form is portable to Make
# versions that lack grouped targets (`&:`).
intermediate/uncorrelated: intermediate/correlated_arpa ;
# 20 the DH AH
# 15 read R EH D
# 1 zoo Z UW
# Filter the correlated list through scripts/to_ipa.py (stdin -> stdout).
intermediate/correlated_ipa: intermediate/correlated_arpa
	scripts/to_ipa.py < $< > $@
# Filter the IPA list through scripts/unspace_ipa.py (stdin -> stdout).
intermediate/correlated_ipa_no_spaces: intermediate/correlated_ipa
	scripts/unspace_ipa.py < $< > $@
############################################################
# Feed the IPA list to scripts/compute_frequencies.py and save its output.
target/q1_frequencies: intermediate/correlated_ipa
	scripts/compute_frequencies.py < $< > $@
# Feed the IPA list to scripts/compute_post_w_frequencies.py and save its
# output.
target/q2_post_w_frequencies: intermediate/correlated_ipa
	scripts/compute_post_w_frequencies.py < $< > $@