@@ -49,14 +49,24 @@ def csv2rows(FnameCSV):
4949
5050 with open (FnameCSV , 'rb' ) as f :
5151 reader = csv .reader (f , delimiter = "," )
52- header = reader .next ()
53- header = [x .strip () for x in header ]
52+ for x in reader :
53+ if not x :continue
54+ header = x
55+ header = [x .strip () for x in header ]
56+ break
57+
5458 miRNAs = [x for x in header if not x in ["ID" , "Annots" ]]
5559
60+ IDs = set ([])
5661 rows = []
5762 for x in reader :
5863 if not x : continue
64+ if not x [0 ].strip (): continue
5965 newrow = dict (zip (header ,[y .strip () for y in x ]))
66+ if newrow ["ID" ] in IDs :
67+ print "\n ***ERROR: row IDs must be unique, found duplicate (%s)." % newrow ["ID" ]
68+ raise Exception
69+ IDs .add (newrow ["ID" ])
6070 rows .append (newrow )
6171
6272 return miRNAs , rows
@@ -368,9 +378,6 @@ def function(SampleDict):
368378
369379 return function
370380
371-
372-
373-
374381
375382
376383def check_classifier (FnameCSV , GateInputs ):
@@ -407,8 +414,37 @@ def check_classifier(FnameCSV, GateInputs):
407414 print " result = classifier and data are consistent"
408415
409416
410-
411-
417+
def check_csv(FnameCSV):
    """
    Prints a short sanity report for a data CSV file.

    Counts how many miRNAs are constant across all samples, and checks if
    there are inconsistencies in the data (identical miRNA profile but
    different annotation).

    FnameCSV is passed unchanged to csv2rows, which supplies the list of
    miRNA column names and the list of row dicts.  Each row dict is read
    via the keys "ID" and "Annots" and via one key per miRNA name.

    A row is reported as inconsistent if an earlier row has exactly the
    same miRNA values but a different "Annots" value.  Each such row ID is
    reported once, no matter how many earlier rows it conflicts with.

    Nothing is returned; the report is printed.
    """

    # Single-argument print(...) produces identical output as a Python 2
    # print statement and is also valid Python 3 syntax.
    print("\n --- check_csv")

    miRNAs, rows = csv2rows(FnameCSV)
    print(" miRNAs:  %i" % len(miRNAs))
    print(" samples: %i" % len(rows))

    # Group the annotations seen so far by miRNA profile, so each row is
    # compared against its profile group only instead of every earlier row.
    annots_by_profile = {}
    inconsistencies = []
    for row in rows:
        profile = tuple(row[rna] for rna in miRNAs)
        earlier_annots = annots_by_profile.setdefault(profile, set())
        if any(annot != row["Annots"] for annot in earlier_annots):
            inconsistencies.append(row["ID"])
        earlier_annots.add(row["Annots"])

    # Without any sample there is no reference value to compare against,
    # so no miRNA is reported as constant (instead of failing on rows[0]).
    constants = []
    if rows:
        first = rows[0]
        constants = [rna for rna in miRNAs
                     if all(row[rna] == first[rna] for row in rows)]

    print(" inconsistencies (%i): %s" % (len(inconsistencies), ",".join(inconsistencies) or "-"))
    print(" constants (%i): %s" % (len(constants), ",".join(constants) or "-"))
447+
412448
413449def mat2csv (FnameMAT , Threshold ):
414450 """
0 commit comments