Skip to content

Commit ff7cc9b

Browse files
committed
3.0draft1 - smoothing out the writeup
1 parent 3c887e1 commit ff7cc9b

File tree

18 files changed

+194
-110
lines changed

18 files changed

+194
-110
lines changed

Makefile

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ bin/smasher:
9595

9696
# The open tree taxonomy
9797

98-
ott: tax/ott/log.tsv tax/ott/version.txt
98+
ott: tax/ott/log.tsv tax/ott/version.txt tax/ott/README.html
9999
tax/ott/log.tsv: $(CLASS) make-ott.py assemble_ott.py adjustments.py amendments.py \
100100
tax/silva/taxonomy.tsv \
101101
tax/fung/taxonomy.tsv tax/713/taxonomy.tsv \
@@ -114,14 +114,17 @@ tax/ott/log.tsv: $(CLASS) make-ott.py assemble_ott.py adjustments.py amendments.
114114
@date
115115
@rm -f *py.class
116116
@mkdir -p tax/ott
117-
@echo Writing transcript to tax/ott/transcript.out.new
118-
time bin/jython make-ott.py 2>&1 | tee tax/ott/transcript.out.new
117+
@echo Writing transcript to tax/ott/transcript.out
118+
time bin/jython make-ott.py $(WHICH) 2>&1 | tee tax/ott/transcript.out.new
119119
mv tax/ott/transcript.out.new tax/ott/transcript.out
120120
echo $(WHICH) >tax/ott/version.txt
121121

122122
tax/ott/version.txt:
123123
echo $(WHICH) >tax/ott/version.txt
124124

125+
tax/ott/README.html: tax/ott/about.json util/make_readme.py
126+
python util/make_readme.py tax/ott/ >$@
127+
125128
# ----- Taxonomy inputs
126129

127130
# Input: Index Fungorum
@@ -536,8 +539,11 @@ t/tax/aster/taxonomy.tsv: compile t/aster.py \
536539
@mkdir -p `dirname $@`
537540
bin/jython t/aster.py
538541

542+
t/tax/aster/README.html: t/tax/aster/about.json util/make_readme.py
543+
python util/make_readme.py t/tax/aster/ >$@
544+
539545
test: aster
540-
aster: t/tax/aster/taxonomy.tsv
546+
aster: t/tax/aster/taxonomy.tsv t/tax/aster/README.html
541547

542548
aster-tarball: t/tax/aster/taxonomy.tsv
543549
(mkdir -p $(TARDIR) && \

amendments.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from org.opentreeoflife.taxa import TsvEdits
44
from claim import *
5-
from chromista_spreadsheet import fixChromista
5+
#from chromista_spreadsheet import fixChromista
66

77
# ----- Final patches -----
88

@@ -101,8 +101,12 @@ def patch_ott(ott):
101101

102102
# Patches from the Katz lab to give decent parents to taxa classified
103103
# as Chromista or Protozoa
104-
print '-- Chromista/Protozoa spreadsheet from Katz lab --'
105-
fixChromista(ott)
104+
# DISABLED: as of 2017-02-19, all but one of the changes listed on the spreadsheet
105+
# either were already there, or else the taxon was missing. So it doesn't
106+
# make much sense to continue using it.
107+
#
108+
# print '-- Chromista/Protozoa spreadsheet from Katz lab --'
109+
# fixChromista(ott)
106110
# 2016-06-30 deleted from spreadsheet because ambiguous:
107111
# Enigma,Protozoa,Polychaeta ,,,,, -
108112
# Acantharia,Protozoa,Radiozoa,,,,,

assemble_ott.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@
2525
additions_clone_path = 'feed/amendments/amendments-1'
2626
new_taxa_path = 'new_taxa'
2727

28-
def create_ott():
28+
def create_ott(version):
2929

3030
ott = UnionTaxonomy.newTaxonomy('ott')
31+
ott.version = version;
3132

3233
# Would be nice if there were tests for all of these...
3334
for name in names_of_interest:
@@ -245,6 +246,8 @@ def get_default_extinct_info_from_gbif(gbif, gbif_to_ott):
245246
# It's OK if it's also in IRMNG
246247
flagged += 1
247248
taxon.extinct()
249+
else:
250+
print '| PaleoDB taxon %s appears to be extant' % taxon
248251
infile.close()
249252
print '| Flagged %s of %s taxa from paleodb\n' % (flagged, paleos)
250253

doc/method/method-details.md

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -236,8 +236,9 @@ follows:
236236
1. Let C' = those members of C that have score Z
237237
1. If Z > 0 and C' contains only one candidate, we are done (match is that candidate)
238238
1. Otherwise, replace C with C' and proceed to the next heuristic
239-
4. If C is singleton, its member is taken to be the correct match.
240-
5. Otherwise, the source node does not match unambiguously.
239+
4. If C is singleton after all heuristics are exhausted, its
240+
member is taken to be the correct match.
241+
5. Otherwise, the source node does not match unambiguously; alignment fails.
241242

242243
### Failure to choose
243244

@@ -247,16 +248,18 @@ it is dropped, which is OK because it probably corresponds to one of
247248
the existing candidates and therefore would make no new contribution
248249
to the workspace. If the ambiguous source node has children, it is
249250
treated as unaligned and therefore new, possibly turning an N-way
250-
homonym into an N+1-way homonym, which could easily be wrong.
251+
homonym into an N+1-way homonym. This could easily be wrong because
252+
it is so unlikely that the source node really represents a distinct taxon.
251253
Usually, the subsequent merge phase determines that the grouping is
252254
not needed because it is inconsistent or can be 'absorbed', and it is
253255
dropped. If it is not dropped, then there is a troublesome situation
254256
that calls for manual review.
255257

256-
For example, for GBIF _Katoella pulchra_, the candidates are NCBI
258+
As an example of an unaligned tip, consider GBIF _Katoella pulchra_.
259+
The candidates are NCBI
257260
_Davallodes pulchra_ and _Davallodes yunnanensis_. (There is no
258-
_Katoella pulchra_ in the workspace at the time of the alignment and
259-
the two candidates come from synonymies with _Katoella pulchra_
261+
_Katoella pulchra_ in the workspace at the time of alignment.
262+
The two candidates come from synonymies with _Katoella pulchra_
260263
declared by GBIF.)
261264
Neither candidate is preferable to the other, so
262265
_Katoella pulchra_ is left unaligned and
@@ -338,6 +341,11 @@ and the new source, we retain the workspace.
338341

339342
So that we have a term for this situation, say that x is _absorbed_ into z.
340343

344+
[KC: I couldn't find an example that looked like case number 6. We could replace
345+
what was there with a new tree showing conflict, but it would have to
346+
be very simple. The only two cases I've found so far (Pisces and
347+
Archaeognatha) have the form ((a,b)c) + (a,(b,c)). Thoughts?]
348+
341349
## Finishing the assembly
342350

343351
After all source taxonomies are aligned and merged, we apply general ad hoc

doc/method/method-sources.md

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ linked from the OTT taxonomy files and user interfaces so that
1717
provenance is always available.
1818

1919
**Separation taxa**
20-
This is a small curated tree containing 27 major groups such
20+
This is a small curated tree containing 29 major groups such
2121
as animals, plants, and fungi. Its purpose is to assist
2222
in separating homonyms. If a node
2323
is found in one of these separation groups, then it will not match a
@@ -37,8 +37,8 @@ Metazoa, compared with over 500,000 taxa under Metazoa in NCBI Taxonomy.
3737

3838
**Extinct / extant annotations**
3939
Curators requested information about whether taxa were extinct
40-
vs. extant. (See below for the reason this was so important.) This
41-
information was not explicitly present in any of our other sources, so we imported IRMNG,
40+
vs. extant. With the exception of limited data from WoRMS and Index Fungorum, this
41+
information was not explicitly present in our other sources, so we imported IRMNG,
4242
which logs the extinct / extant status of taxa.
4343

4444
As a secondary heuristic, records from GBIF that originate from
@@ -59,10 +59,11 @@ We suppress the following source taxonomy records:
5959
sequences', or any of about 15 similar designations
6060

6161
The IPNI and IRMNG records are suppressed because they include many
62-
invalid names. Although the original taxonomic sources indicate which
63-
names are known to be invalid, this information is not preserved when
64-
the records are exported by GBIF, since Darwin Core does not provide a
65-
standard way to express it. Note that the GBIF taxonomy might import
66-
the same name from more than one source, but its export file only
67-
lists one of the sources. We suppress the record if that source is
68-
IPNI or IRMNG, but not if it is some other source.
62+
invalid names. We pick up most of the valid names from other sources,
63+
such as directly from IRMNG, so this is not a great loss. Although
64+
the original taxonomic sources indicate which names are known to be
65+
invalid, this information is not preserved when the records are
66+
exported by the GBIF backbone. Note that the GBIF backbone might
67+
import the same name from more than one source, but its provenance
68+
information only lists one of the sources. We suppress the record if
69+
that source is IPNI or IRMNG, but not if it is some other source.

doc/method/sources_table.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11

2+
import sys, csv
3+
24
# Name
35
# Release date / download date / version
46
# Number of taxon records in source (terminal only?)
@@ -9,13 +11,13 @@
911
# Reference number(s) - full reference in article's reference list
1012
# Maximum depth
1113

12-
table = [
14+
properties = [
1315
{'name': 'separation taxa',
1416
'reference': 'see code',
1517
'version': '4b3ba1a',
1618
'priority': 1,
17-
'focus': '',
18-
'taxa': 28,
19+
'focus': 'life',
20+
'taxa': 29,
1921
'synonyms': 8,
2022
'goals': ''},
2123
{'name': 'ARB-SILVA',
@@ -88,7 +90,7 @@
8890
'taxa': 1706655, # Boils down to 1685134
8991
'synonyms': 685983,
9092
'goals': 'T'}, # Boils down to 659851
91-
{'name': 'OpenTree curation',
93+
{'name': 'Open Tree curation',
9294
'reference': 'see code',
9395
'version': '4b3ba1a',
9496
'priority': 10,
@@ -98,32 +100,55 @@
98100
'goals': 'O'}
99101
]
100102

101-
def cell(val):
102-
print ' <td>'
103-
print ' ', val
104-
print ' </td>'
103+
header = ['name', 'reference',
104+
#'version', #Removed at KC's request
105+
'focus', 'taxa', 'synonyms', 'priority', 'reasons']
106+
107+
table = []
108+
table.append(header)
105109

110+
for plist in properties:
111+
table.append([plist['name'], plist['reference'],
112+
#plist['version'],
113+
plist['focus'], plist['taxa'], plist['synonyms'], plist['priority'], plist['goals']])
114+
115+
116+
def show_table_csv(table):
117+
print
118+
print '```'
119+
writer = csv.writer(sys.stdout, lineterminator=' \n')
120+
for row in table:
121+
writer.writerow(row)
122+
print '```'
123+
124+
def show_table_html(table):
125+
print '<table>'
126+
for row in table:
127+
do_row(row)
128+
print '</table>'
106129

107130
def do_row(cells):
108131
print ' <tr>'
109132
for val in cells:
110133
cell(val)
111134
print ' </tr>'
112135

113-
header = ['name', 'reference', 'version', 'focus', 'taxa', 'synonyms', 'priority', 'reasons']
136+
def cell(val):
137+
print ' <td>'
138+
print ' ', val
139+
print ' </td>'
114140

115141

116142
print '<!--**** THIS FILE IS AUTOMATICALLY GENERATED - DO NOT EDIT ****-->'
117143
print '### (Table 1)'
118144

119-
print '<table>'
120-
do_row(header)
121-
for row in table:
122-
do_row([row['name'], row['reference'], row['version'], row['focus'], row['taxa'], row['synonyms'], row['priority'], row['goals']])
123-
print '</table>'
145+
show_table_html(table)
146+
show_table_csv(table)
147+
148+
print
149+
print """[JAR: get final curation counts. `cat amendments/*.json | grep original_label | wc` plus `grep "^add" feed/ott/edits/*.tsv | wc`]"""
124150

125151
"""
126-
The root clade could be a column in the table? No.
127152
128153
Maybe put number of binomials in the table?
129154

feed/ott/edits/flag-test.tsv

Lines changed: 0 additions & 2 deletions
This file was deleted.

make-ott.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
# Called from Makefile
22

3+
import sys
34
import assemble_ott
45

5-
ott = assemble_ott.create_ott()
6+
version = sys.argv[1]
7+
8+
ott = assemble_ott.create_ott(version)
69

710
ott.dump('tax/ott/')
811
assemble_ott.report(ott)

org/opentreeoflife/smasher/MergeMachine.java

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ void report(Taxonomy source, int startroots, int startcount) {
133133
that has one.
134134
*/
135135
void augment(Taxon node, Taxon sink) {
136+
if (node.prunedp) return;
136137
Taxon unode = alignment.getTaxon(node);
137138

138139
if (node.children == null) {
@@ -147,7 +148,9 @@ else if (a.value <= Answer.HECK_NO)
147148
// (weak no) or ambiguous (noinfo)
148149
// YES > NOINFO > NO > HECK_NO (sorry)
149150
acceptNew(node, "new/polysemy");
150-
}
151+
else
152+
tick("ambiguous/redundant");
153+
}
151154
} else {
152155
if (unode != null) {
153156
for (Taxon child: node.children)
@@ -159,7 +162,7 @@ else if (a.value <= Answer.HECK_NO)
159162
augment(child, sink);
160163
// Examine mapped parents of the children
161164
boolean consistentp = true;
162-
Taxon commonParent = null; // should end up being targetMrca(node)
165+
Taxon commonParent = null;
163166
Taxon child1 = null, child2 = null; // for inconsistency reporting
164167
int count = 0;
165168
for (Taxon child : node.children) {
@@ -186,18 +189,12 @@ else if (a.value <= Answer.HECK_NO)
186189
} else if (!consistentp) {
187190
inconsistent(node, child1, child2, sink);
188191
} else if (!commonParent.descendsFrom(sink)) {
189-
// This is the philosophically troublesome case.
190-
// Could be either an outlier/mistake, or something serious.
191-
if (node.markEvent("sibling-sink mismatch"))
192-
System.out.format("* Parent of %s's children's images, %s, is not a descendant of %s\n",
193-
node, commonParent, sink);
194-
inconsistent(node, child1, child2, sink);
192+
overtake(node, commonParent, sink);
195193
} else if (refinementp(node, sink)) {
196194
Taxon newnode = acceptNew(node, "new/refinement");
197195
takeOld(node, newnode);
198196
takeOn(node, newnode, 0); // augmentation
199197
} else {
200-
// 'trouble' = paraphyly risk - plain merge.
201198
takeOn(node, commonParent, 0);
202199
// should include a witness for debugging purposes - merged to/from what?
203200
reject(node, "reject/merged", commonParent, Taxonomy.MERGED);
@@ -215,7 +212,8 @@ else if (a.value <= Answer.HECK_NO)
215212

216213
void inconsistent(Taxon node, Taxon child1, Taxon child2, Taxon sink) {
217214
// Paraphyletic / conflicted.
218-
// Put the new children unplaced under the mrca of the placed children.
215+
// Put the new children unplaced under the sink, or the mrca of the
216+
// placed children, whichever is smaller.
219217
reportConflict(node, child1, child2, sink);
220218
// Tighten it if possible... does this always make sense?
221219
Taxon unode = alignment.getTargetMrca(node);
@@ -225,6 +223,35 @@ void inconsistent(Taxon node, Taxon child1, Taxon child2, Taxon sink) {
225223
reject(node, "reject/inconsistent", sink, Taxonomy.INCONSISTENT);
226224
}
227225

226+
private final static boolean MORE_SENSIBLE_BUT_DOESNT_WORK = false;
227+
228+
// The symptom of getting this wrong is the creation of a cycle.
229+
230+
void overtake(Taxon node, Taxon commonParent, Taxon sink) {
231+
// This is a troublesome case.
232+
// Workspace says children are under sink, but source says they're not.
233+
if (node.markEvent("sibling-sink mismatch"))
234+
System.out.format("* Parent of %s's children's images, %s, is an ancestor of %s\n",
235+
node,
236+
commonParent,
237+
sink);
238+
239+
if (MORE_SENSIBLE_BUT_DOESNT_WORK) {
240+
takeOn(node, commonParent, 0);
241+
reject(node, "reject/overtaken", commonParent, Taxonomy.MERGED);
242+
} else {
243+
// was: inconsistent(node, child1, child2, sink);
244+
Taxon point;
245+
Taxon unode = alignment.getTargetMrca(node);
246+
if (unode != null && unode.descendsFrom(sink))
247+
point = unode;
248+
else
249+
point = sink;
250+
takeOn(node, point, Taxonomy.UNPLACED);
251+
reject(node, "reject/overtaken", point, Taxonomy.MERGED);
252+
}
253+
}
254+
228255
/* Refinement: feature necessary for merging Silva into the
229256
skeleton and NCBI into Silva. This lets an internal "new" node
230257
(in the "new" taxonomy) be inserted in between internal "old"

0 commit comments

Comments
 (0)