Skip to content

Commit 45513dd

Browse files
committed
Merge pull request #66 from lennax/lenna
Filter by sample
2 parents d1a9fdc + cbe8d90 commit 45513dd

File tree

6 files changed

+211
-7
lines changed

6 files changed

+211
-7
lines changed

scripts/vcf_sample_filter.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python
2+
3+
# Author: Lenna X. Peterson
4+
# github.com/lennax
5+
# arklenna at gmail dot com
6+
7+
import argparse
8+
import logging
9+
10+
from vcf import SampleFilter
11+
12+
13+
def build_parser():
    """Build the command-line parser for the sample filter script.

    Returns:
        An ``argparse.ArgumentParser`` accepting the positional ``file``
        plus the ``-o``, ``-f``, ``-i/--invert`` and ``-q/--quiet`` options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="VCF file to filter")
    parser.add_argument("-o", metavar="outfile",
                        help="File to write out filtered samples")
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the source indentation in the
    # help text.
    parser.add_argument("-f", metavar="filters",
                        help="Comma-separated list of sample indices or "
                             "names to filter")
    parser.add_argument("-i", "--invert", action="store_true",
                        help="Keep rather than discard the filtered samples")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Less output")
    return parser


def main(argv=None):
    """Run the sample filter command-line interface.

    Args:
        argv: Optional list of argument strings. When ``None`` it is passed
            through to ``parse_args`` unchanged, so argparse falls back to
            the process command line.
    """
    args = build_parser().parse_args(argv)

    # --quiet raises the threshold so the INFO progress messages are hidden.
    log_level = logging.WARNING if args.quiet else logging.INFO
    logging.basicConfig(format='%(message)s', level=log_level)

    # When filters are supplied, SampleFilter applies them and writes the
    # output from its constructor; nothing more is needed here.
    sample_filter = SampleFilter(infile=args.file, outfile=args.o,
                                 filters=args.f, invert=args.invert)
    if args.f is None:
        # No filters given: list the available samples with their indices.
        # Single-argument print(...) is valid on both Python 2 and 3.
        print("Samples:")
        for idx, name in enumerate(sample_filter.samples):
            print("{0}: {1}".format(idx, name))


if __name__ == "__main__":
    main()

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@
4747
setup(
4848
name='PyVCF',
4949
packages=['vcf', 'vcf.test'],
50-
scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py'],
50+
scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py',
51+
'scripts/vcf_sample_filter.py'],
5152
author='James Casbon and @jdoughertyii',
5253
author_email='casbon@gmail.com',
5354
description='Variant Call Format (VCF) parser for Python',

vcf/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
>>> print record.INFO['AF']
6060
[0.5]
6161
62-
There are a number of convienience methods and properties for each ``Record`` allowing you to
62+
There are a number of convenience methods and properties for each ``Record`` allowing you to
6363
examine properties of interest::
6464
6565
>>> print record.num_called, record.call_rate, record.num_unknown
@@ -176,5 +176,6 @@
176176
from vcf.parser import VCFReader, VCFWriter
177177
from vcf.filters import Base as Filter
178178
from vcf.parser import RESERVED_INFO, RESERVED_FORMAT
179+
from vcf.sample_filter import SampleFilter
179180

180181
VERSION = '0.6.7'

vcf/parser.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
import codecs
12
import collections
2-
import re
33
import csv
44
import gzip
5-
import sys
65
import itertools
7-
import codecs
6+
import os
7+
import re
8+
import sys
89

910
try:
1011
from collections import OrderedDict
@@ -430,7 +431,6 @@ def _parse_samples(self, samples, samp_fmt, site):
430431
# check whether we already know how to parse this format
431432
if samp_fmt not in self._format_cache:
432433
self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
433-
434434
samp_fmt = self._format_cache[samp_fmt]
435435

436436
if cparse:
@@ -601,7 +601,7 @@ def fetch(self, chrom, start, end=None):
601601

602602

603603
class Writer(object):
604-
""" VCF Writer """
604+
"""VCF Writer. On Windows Python 2, open stream with 'wb'."""
605605

606606
# Reverse keys and values in header field count dictionary
607607
counts = dict((v,k) for k,v in field_counts.iteritems())

vcf/sample_filter.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Author: Lenna X. Peterson
2+
# github.com/lennax
3+
# arklenna at gmail dot com
4+
5+
import logging
6+
import sys
7+
import warnings
8+
9+
10+
from parser import Reader, Writer
11+
12+
13+
class SampleFilter(object):
    """
    Modifies the vcf Reader to filter each row by sample as it is parsed.

    Creating an instance monkey-patches the ``Reader`` *class*: it adds a
    ``sample_filter`` property and wraps ``Reader._parse_samples``. The patch
    is class-wide, so it applies to every ``Reader`` while it is active; it
    is reverted by ``_undo_monkey_patch``, which ``__del__`` calls.

    Attributes set by ``__init__``:
        parser: the ``Reader`` opened on ``infile``.
        samples: the sample names as initially read (never filtered).
        smp_idx: mapping of sample name -> index in ``samples``.
        outfile, invert, filters: current output target and filter settings.
    """

    def __init__(self, infile, outfile=None, filters=None, invert=False):
        """Patch ``Reader``, open ``infile`` and optionally filter and write.

        Args:
            infile: passed to ``Reader`` as ``filename``.
            outfile: output path or object with a ``write`` attribute;
                ``None`` means standard output.
            filters: comma-separated sample indices or names. When not
                ``None`` the filters are applied and the output is written
                immediately.
            invert: if true, keep the listed samples instead of discarding
                them.
        """
        # Methods to add to Reader
        def get_filter(self):
            return self._samp_filter

        def set_filter(self, filt):
            self._samp_filter = filt
            if filt:
                # Build the lookup set once rather than once per sample.
                excluded = set(filt)
                self.samples = [val for idx, val in enumerate(self.samples)
                                if idx not in excluded]

        def filter_samples(fn):
            """Decorator function to filter sample parameter"""
            def filt(self, samples, *args):
                excluded = set(self.sample_filter)
                samples = [val for idx, val in enumerate(samples)
                           if idx not in excluded]
                return fn(self, samples, *args)
            return filt

        # Add property to Reader for filter list
        Reader.sample_filter = property(get_filter, set_filter)
        Reader._samp_filter = []
        # Modify Reader._parse_samples to filter samples
        self._orig_parse_samples = Reader._parse_samples
        Reader._parse_samples = filter_samples(Reader._parse_samples)
        self.parser = Reader(filename=infile)
        # Store initial samples and indices
        self.samples = self.parser.samples
        self.smp_idx = dict((name, idx)
                            for idx, name in enumerate(self.samples))
        # Properties for filter/writer
        self.outfile = outfile
        self.invert = invert
        self.filters = filters
        if filters is not None:
            self.set_filters()
            self.write()

    def __del__(self):
        try:
            self._undo_monkey_patch()
        except AttributeError:
            # Either __init__ never got as far as patching, or the patch
            # has already been removed; nothing left to undo.
            pass

    def _filter_to_index(self, item):
        """Convert one filter token to a valid sample index.

        Args:
            item: a string holding either a sample index or a sample name.

        Returns:
            The index into ``self.samples``, or ``None`` if the token is
            neither an in-range index nor a known sample name.
        """
        try:
            index = int(item)
        except ValueError:
            # not an idx, check if it's a sample name
            return self.smp_idx.get(item)
        # Negative values are rejected: they never match an enumerate()
        # position, so accepting them would silently filter nothing (and
        # corrupt the inverted set).
        if 0 <= index < len(self.samples):
            return index
        # Out-of-range number: it may still be a purely numeric sample name.
        return self.smp_idx.get(item)

    def set_filters(self, filters=None, invert=False):
        """Convert filters from string to list of indices, set on Reader

        Args:
            filters: comma-separated sample indices or names; when ``None``
                the previously stored ``self.filters`` is used.
            invert: when true, switch the filter to keep the listed samples.
                A false value leaves the stored ``self.invert`` unchanged.

        Returns:
            The reader's sample list after filtering.

        Emits ``RuntimeWarning`` for duplicate tokens, for tokens that match
        no sample, and when no samples remain.
        """
        if filters is not None:
            self.filters = filters
        if invert:
            self.invert = invert
        tokens = self.filters.split(",")
        requested = set(tokens)
        if len(requested) < len(tokens):
            warnings.warn("Non-unique filters, ignoring", RuntimeWarning)

        resolved = [self._filter_to_index(item) for item in requested]
        indices = set(idx for idx in resolved if idx is not None)
        # Warn only for tokens that resolved to nothing; a name and an index
        # that refer to the same sample are not invalid.
        if any(idx is None for idx in resolved):
            # TODO print the filters that were ignored
            warnings.warn("Invalid filters, ignoring", RuntimeWarning)

        if self.invert:
            indices = set(range(len(self.samples))).difference(indices)

        # The indices refer to the original sample order, so restore the
        # reader's full sample list first; otherwise a second call would
        # apply them to an already filtered list.
        self.parser.samples = list(self.samples)
        # `sample_filter` setter updates `samples`
        self.parser.sample_filter = indices
        if not self.parser.samples:
            warnings.warn("Number of samples to keep is zero", RuntimeWarning)
        logging.info("Keeping these samples: {0}\n".format(self.parser.samples))
        return self.parser.samples

    def write(self, outfile=None):
        """Write every record from the reader to the output.

        Args:
            outfile: optional new output target, stored on ``self.outfile``.
                ``None`` keeps the current one. A target of ``None`` means
                standard output, an object with a ``write`` attribute is
                used as-is, and anything else is opened as a path in
                ``"wb"`` mode.
        """
        if outfile is not None:
            self.outfile = outfile
        opened_here = False
        if self.outfile is None:
            _out = sys.stdout
        elif hasattr(self.outfile, 'write'):
            _out = self.outfile
        else:
            _out = open(self.outfile, "wb")
            opened_here = True
        try:
            logging.info("Writing to '{0}'\n".format(self.outfile))
            writer = Writer(_out, self.parser)
            for row in self.parser:
                writer.write_record(row)
        finally:
            # Only close what this method opened; caller-supplied streams
            # and stdout stay open for the caller.
            if opened_here:
                _out.close()

    def _undo_monkey_patch(self):
        """Restore ``Reader._parse_samples`` and drop the added property."""
        # Read the saved method first so an unpatched instance fails with
        # AttributeError before Reader is touched.
        original = self._orig_parse_samples
        Reader._parse_samples = original
        delattr(Reader, 'sample_filter')

vcf/test/test_vcf.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import commands
66
import cPickle
77
from StringIO import StringIO
8+
import subprocess
89

910
import vcf
1011
from vcf import utils
@@ -870,6 +871,52 @@ def testOpenFilenameGzipped(self):
870871
self.assertEqual(self.samples, r.samples)
871872

872873

874+
class TestSampleFilter(unittest.TestCase):
    """Tests for the sample filter script and the ``vcf.SampleFilter`` class.

    The paths are relative, so the tests expect to be run from the
    repository root.
    """

    def _run_cli(self, *cli_args):
        """Run the sample filter script and return (returncode, out, err).

        The command is passed as an argument list, so no shell is involved.
        """
        command = ['python', 'scripts/vcf_sample_filter.py',
                   'vcf/test/example-4.1.vcf']
        command.extend(cli_args)
        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        out, err = proc.communicate()
        return proc.returncode, out, err

    def testCLIListSamples(self):
        # Without -f the script only lists the samples with their indices.
        returncode, out, err = self._run_cli()
        self.assertEqual(returncode, 0)
        self.assertFalse(err)
        expected_out = ['Samples:', '0: NA00001', '1: NA00002', '2: NA00003']
        self.assertEqual(out.splitlines(), expected_out)

    def testCLIWithFilter(self):
        # Discarding samples 1 and 2 must leave only the first sample.
        returncode, out, err = self._run_cli('-f', '1,2', '--quiet')
        self.assertEqual(returncode, 0)
        self.assertTrue(out)
        self.assertFalse(err)
        reader = vcf.Reader(StringIO(out))
        self.assertEqual(reader.samples, ['NA00001'])
        rec = next(reader)
        self.assertEqual(len(rec.samples), 1)

    def testSampleFilterModule(self):
        # init filter with filename, get list of samples
        filt = vcf.SampleFilter('vcf/test/example-4.1.vcf')
        self.assertEqual(filt.samples, ['NA00001', 'NA00002', 'NA00003'])
        # set filter, check which samples will be kept
        filtered = filt.set_filters(filters="0", invert=True)
        self.assertEqual(filtered, ['NA00001'])
        # write filtered file to StringIO
        buf = StringIO()
        filt.write(buf)
        buf.seek(0)
        # undo monkey patch by destroying instance
        del filt
        self.assertTrue('sample_filter' not in dir(vcf.Reader))
        # read output
        reader = vcf.Reader(buf)
        self.assertEqual(reader.samples, ['NA00001'])
        rec = next(reader)
        self.assertEqual(len(rec.samples), 1)
918+
919+
873920
class TestFilter(unittest.TestCase):
874921

875922

@@ -1033,6 +1080,7 @@ def test_meta(self):
10331080
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall))
10341081
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix))
10351082
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods))
1083+
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter))
10361084
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter))
10371085
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression))
10381086
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUtils))

0 commit comments

Comments
 (0)