Skip to content

Commit 3022702

Browse files
author
HannesK
committed
added classifier.check_csv
1 parent 0372316 commit 3022702

File tree

4 files changed

+63
-18
lines changed

4 files changed

+63
-18
lines changed

classifier.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,24 @@ def csv2rows(FnameCSV):
4949

5050
with open(FnameCSV, 'rb') as f:
5151
reader = csv.reader(f, delimiter=",")
52-
header = reader.next()
53-
header = [x.strip() for x in header]
52+
for x in reader:
53+
if not x:continue
54+
header = x
55+
header = [x.strip() for x in header]
56+
break
57+
5458
miRNAs = [x for x in header if not x in ["ID", "Annots"]]
5559

60+
IDs = set([])
5661
rows = []
5762
for x in reader:
5863
if not x: continue
64+
if not x[0].strip(): continue
5965
newrow = dict(zip(header,[y.strip() for y in x]))
66+
if newrow["ID"] in IDs:
67+
print "\n***ERROR: row IDs must be unique, found duplicate (%s)."%newrow["ID"]
68+
raise Exception
69+
IDs.add(newrow["ID"])
6070
rows.append(newrow)
6171

6272
return miRNAs, rows
@@ -368,9 +378,6 @@ def function(SampleDict):
368378

369379
return function
370380

371-
372-
373-
374381

375382

376383
def check_classifier(FnameCSV, GateInputs):
@@ -407,8 +414,37 @@ def check_classifier(FnameCSV, GateInputs):
407414
print " result = classifier and data are consistent"
408415

409416

410-
411-
417+
418+
def check_csv(FnameCSV):
    """
    Prints a short consistency report for the data in a CSV file.

    The file is read with csv2rows. The report lists

      * the number of miRNAs and the number of samples,
      * inconsistencies: the IDs of samples whose miRNA profile is identical
        to the profile of an earlier sample while the value in the "Annots"
        column differs. Each such ID is listed exactly once, even if the
        sample conflicts with several earlier samples,
      * constants: the miRNAs that have the same value in all samples.
        If the file contains no samples then no constants are reported.

    FnameCSV: name of the CSV file, passed on to csv2rows.

    The function only prints, it returns nothing.
    """

    # single-argument print(...) produces the same output as the print
    # statement in Python 2 and is also valid syntax in Python 3
    print("\n--- check_csv")

    miRNAs, rows = csv2rows(FnameCSV)
    print(" miRNAs:  %i" % len(miRNAs))
    print(" samples: %i" % len(rows))

    # group the annotations seen so far by miRNA profile, this replaces the
    # pairwise comparison of all samples by a single pass over the rows
    inconsistencies = []
    annots_by_profile = {}
    for row in rows:
        profile = tuple(row[rna] for rna in miRNAs)
        seen_annots = annots_by_profile.setdefault(profile, set())

        # inconsistent if an earlier sample with the same profile carries a
        # different annotation; appended once, regardless of how many do
        if seen_annots - set([row["Annots"]]):
            inconsistencies.append(row["ID"])

        seen_annots.add(row["Annots"])

    constants = []
    # without samples there is no reference value to compare against,
    # rows[0] would raise an IndexError
    if rows:
        for rna in miRNAs:
            value = rows[0][rna]
            if all(x[rna]==value for x in rows):
                constants.append(rna)

    print(" inconsistencies (%i): %s"%(len(inconsistencies),",".join(inconsistencies) or "-"))
    print(" constants (%i): %s"%(len(constants),",".join(constants) or "-"))
447+
412448

413449
def mat2csv(FnameMAT, Threshold):
414450
"""

readme.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,11 @@ The output informs you of all encountered inconsistencies (malfunctions):
207207
result = 1 inconsistencies set(['2'])
208208
```
209209

210-
210+
#### check csv data
211+
To check whether there are _inconsistencies_ or _constants_ in the data, call the function `classifier.check_csv`.
212+
It lists all miRNAs that are constant across all samples, i.e., always _high_ or always _low_,
213+
and checks whether there are samples that have identical miRNA profiles but differ in their annotation.
214+
If there are inconsistencies, then no classifier exists for the data.
211215

212216

213217
## Files

toy.csv

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
ID, Annots, g1, g2, g3
2-
1, 0, 1, 1, 0
3-
2, 0, 0, 0, 1
41

2+
ID, Annots, g1, g2, g3
3+
4+
1, 0, 1, 1, 0
5+
2, 0, 0, 0, 1
56
3, 1, 0, 1, 0
67

78

toy.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
import classifier
4444

4545
if __name__=="__main__":
46-
if 1 :
46+
if 1:
4747
classifier.csv2asp(
4848
FnameCSV,
4949
FnameASP,
@@ -53,18 +53,22 @@
5353
EfficiencyConstraint,
5454
OptimizationStrategy)
5555

56-
if 1 :
56+
if 1:
5757
GateInputs = "gate_input(1,positive,g2) gate_input(2,positive,g3) gate_input(2,negative,g1)"
5858
classifier.check_classifier(FnameCSV, GateInputs)
5959

60-
if 1 :
60+
if 1:
6161
GateInputs = "gate_input(1,positive,g2) gate_input(2,positive,g3) gate_input(2,negative,g1)"
6262
FnamePDF = "toy.pdf"
6363
classifier.gateinputs2pdf(FnamePDF, GateInputs)
6464

65-
if 0 :
66-
FnameMAT = "toy.mat"
67-
Threshold = 250
68-
classifier.mat2csv(FnameMAT, Threshold)
65+
if 0:
66+
FnameMAT = "toy.mat"
67+
Threshold = 250
68+
classifier.mat2csv(FnameMAT, Threshold)
69+
70+
if 1:
71+
classifier.check_csv(FnameCSV)
72+
6973

7074

0 commit comments

Comments
 (0)