Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
75cf8f8
Initial per-sample line filtering.
Jul 2, 2012
18deb2a
Improved samp filter performance, allow invert.
Jul 2, 2012
8477e6f
Args can be provided all at once or in sequence.
Jul 2, 2012
73376c8
Reduced amount of sample filter code in parser.
Jul 2, 2012
362bbab
Actually write out sample-filtered file.
Jul 3, 2012
d71b2cd
Switched Writer \r\n to os.linesep.
Jul 3, 2012
bce2c47
Fixed sample name list update/printing.
Jul 3, 2012
67744c0
Moved all sample filtering to filter script.
Jul 6, 2012
a048ec0
Implemented argparse.
Jul 7, 2012
19ce645
Tweak args, pep8, move empty outfile warning.
Jul 7, 2012
95fc70b
Fixed argparse arg names.
Jul 7, 2012
67afb27
Changed default out to sys.stdout
Jul 7, 2012
33d2b5c
Added unit test for sample filtering script.
Jul 7, 2012
792d685
Added authorship statement.
Jul 7, 2012
d78a945
Added sample filter to list of scripts in setup.
Jul 7, 2012
75c4775
Moved sample filter object to src dir.
Jul 9, 2012
0047032
Using logging for easy quiet mode.
Jul 9, 2012
6b1fa89
Unit test for sample filter module.
Jul 9, 2012
817f5e9
Docs/test for undo_monkey_patch
Jul 9, 2012
0b0d809
Changed tests to use subprocess returncode.
Jul 9, 2012
746ece9
Destructor undoes patch; warn if 0 samples kept
Jul 9, 2012
30321c5
Recommend explicit use of del.
Jul 9, 2012
49f8897
Added empty filter list; del is now less critical.
Jul 9, 2012
ba00d83
Merge branch 'master' of https://github.com/jamescasbon/PyVCF into lenna
Feb 22, 2014
cbe8d90
Restore subprocess import to test
Feb 22, 2014
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions scripts/vcf_sample_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python

# Author: Lenna X. Peterson
# github.com/lennax
# arklenna at gmail dot com

import argparse
import logging

from vcf import SampleFilter


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("file", help="VCF file to filter")
parser.add_argument("-o", metavar="outfile",
help="File to write out filtered samples")
parser.add_argument("-f", metavar="filters",
help="Comma-separated list of sample indices or names \
to filter")
parser.add_argument("-i", "--invert", action="store_true",
help="Keep rather than discard the filtered samples")
parser.add_argument("-q", "--quiet", action="store_true",
help="Less output")

args = parser.parse_args()

if args.quiet:
log_level = logging.WARNING
else:
log_level = logging.INFO
logging.basicConfig(format='%(message)s', level=log_level)

sf = SampleFilter(infile=args.file, outfile=args.o,
filters=args.f, invert=args.invert)
if args.f is None:
print "Samples:"
for idx, val in enumerate(sf.samples):
print "{0}: {1}".format(idx, val)
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@
setup(
name='PyVCF',
packages=['vcf', 'vcf.test'],
scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py'],
scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py',
'scripts/vcf_sample_filter.py'],
author='James Casbon and @jdoughertyii',
author_email='casbon@gmail.com',
description='Variant Call Format (VCF) parser for Python',
Expand Down
3 changes: 2 additions & 1 deletion vcf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
>>> print record.INFO['AF']
[0.5]

There are a number of convienience methods and properties for each ``Record`` allowing you to
There are a number of convenience methods and properties for each ``Record`` allowing you to
examine properties of interest::

>>> print record.num_called, record.call_rate, record.num_unknown
Expand Down Expand Up @@ -176,5 +176,6 @@
from vcf.parser import VCFReader, VCFWriter
from vcf.filters import Base as Filter
from vcf.parser import RESERVED_INFO, RESERVED_FORMAT
from vcf.sample_filter import SampleFilter

VERSION = '0.6.7'
10 changes: 5 additions & 5 deletions vcf/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import codecs
import collections
import re
import csv
import gzip
import sys
import itertools
import codecs
import os
import re
import sys

try:
from collections import OrderedDict
Expand Down Expand Up @@ -430,7 +431,6 @@ def _parse_samples(self, samples, samp_fmt, site):
# check whether we already know how to parse this format
if samp_fmt not in self._format_cache:
self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)

samp_fmt = self._format_cache[samp_fmt]

if cparse:
Expand Down Expand Up @@ -601,7 +601,7 @@ def fetch(self, chrom, start, end=None):


class Writer(object):
""" VCF Writer """
"""VCF Writer. On Windows Python 2, open stream with 'wb'."""

# Reverse keys and values in header field count dictionary
counts = dict((v,k) for k,v in field_counts.iteritems())
Expand Down
115 changes: 115 additions & 0 deletions vcf/sample_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Author: Lenna X. Peterson
# github.com/lennax
# arklenna at gmail dot com

import logging
import sys
import warnings


from parser import Reader, Writer


class SampleFilter(object):
"""
Modifies the vcf Reader to filter each row by sample as it is parsed.

"""

def __init__(self, infile, outfile=None, filters=None, invert=False):
# Methods to add to Reader
def get_filter(self):
return self._samp_filter

def set_filter(self, filt):
self._samp_filter = filt
if filt:
self.samples = [val for idx, val in enumerate(self.samples)
if idx not in set(filt)]

def filter_samples(fn):
"""Decorator function to filter sample parameter"""
def filt(self, samples, *args):
samples = [val for idx, val in enumerate(samples)
if idx not in set(self.sample_filter)]
return fn(self, samples, *args)
return filt

# Add property to Reader for filter list
Reader.sample_filter = property(get_filter, set_filter)
Reader._samp_filter = []
# Modify Reader._parse_samples to filter samples
self._orig_parse_samples = Reader._parse_samples
Reader._parse_samples = filter_samples(Reader._parse_samples)
self.parser = Reader(filename=infile)
# Store initial samples and indices
self.samples = self.parser.samples
self.smp_idx = dict([(v, k) for k, v in enumerate(self.samples)])
# Properties for filter/writer
self.outfile = outfile
self.invert = invert
self.filters = filters
if filters is not None:
self.set_filters()
self.write()

def __del__(self):
try:
self._undo_monkey_patch()
except AttributeError:
pass

def set_filters(self, filters=None, invert=False):
"""Convert filters from string to list of indices, set on Reader"""
if filters is not None:
self.filters = filters
if invert:
self.invert = invert
filt_l = self.filters.split(",")
filt_s = set(filt_l)
if len(filt_s) < len(filt_l):
warnings.warn("Non-unique filters, ignoring", RuntimeWarning)

def filt2idx(item):
"""Convert filter to valid sample index"""
try:
item = int(item)
except ValueError:
# not an idx, check if it's a value
return self.smp_idx.get(item)
else:
# is int, check if it's an idx
if item < len(self.samples):
return item
filters = set(filter(lambda x: x is not None, map(filt2idx, filt_s)))
if len(filters) < len(filt_s):
# TODO print the filters that were ignored
warnings.warn("Invalid filters, ignoring", RuntimeWarning)

if self.invert:
filters = set(xrange(len(self.samples))).difference(filters)

# `sample_filter` setter updates `samples`
self.parser.sample_filter = filters
if len(self.parser.samples) == 0:
warnings.warn("Number of samples to keep is zero", RuntimeWarning)
logging.info("Keeping these samples: {0}\n".format(self.parser.samples))
return self.parser.samples

def write(self, outfile=None):
if outfile is not None:
self.outfile = outfile
if self.outfile is None:
_out = sys.stdout
elif hasattr(self.outfile, 'write'):
_out = self.outfile
else:
_out = open(self.outfile, "wb")
logging.info("Writing to '{0}'\n".format(self.outfile))
writer = Writer(_out, self.parser)
for row in self.parser:
writer.write_record(row)

def _undo_monkey_patch(self):
Reader._parse_samples = self._orig_parse_samples
delattr(Reader, 'sample_filter')
48 changes: 48 additions & 0 deletions vcf/test/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import commands
import cPickle
from StringIO import StringIO
import subprocess

import vcf
from vcf import utils
Expand Down Expand Up @@ -870,6 +871,52 @@ def testOpenFilenameGzipped(self):
self.assertEqual(self.samples, r.samples)


class TestSampleFilter(unittest.TestCase):
def testCLIListSamples(self):
proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = proc.communicate()
self.assertEqual(proc.returncode, 0)
self.assertFalse(err)
expected_out = ['Samples:', '0: NA00001', '1: NA00002', '2: NA00003']
self.assertEqual(out.splitlines(), expected_out)

def testCLIWithFilter(self):
proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf -f 1,2 --quiet', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = proc.communicate()
self.assertEqual(proc.returncode, 0)
self.assertTrue(out)
self.assertFalse(err)
buf = StringIO()
buf.write(out)
buf.seek(0)
#print(buf.getvalue())
reader = vcf.Reader(buf)
self.assertEqual(reader.samples, ['NA00001'])
rec = reader.next()
self.assertEqual(len(rec.samples), 1)

def testSampleFilterModule(self):
# init filter with filename, get list of samples
filt = vcf.SampleFilter('vcf/test/example-4.1.vcf')
self.assertEqual(filt.samples, ['NA00001', 'NA00002', 'NA00003'])
# set filter, check which samples will be kept
filtered = filt.set_filters(filters="0", invert=True)
self.assertEqual(filtered, ['NA00001'])
# write filtered file to StringIO
buf = StringIO()
filt.write(buf)
buf.seek(0)
#print(buf.getvalue())
# undo monkey patch by destroying instance
del filt
self.assertTrue('sample_filter' not in dir(vcf.Reader))
# read output
reader = vcf.Reader(buf)
self.assertEqual(reader.samples, ['NA00001'])
rec = reader.next()
self.assertEqual(len(rec.samples), 1)


class TestFilter(unittest.TestCase):


Expand Down Expand Up @@ -1033,6 +1080,7 @@ def test_meta(self):
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUtils))
Expand Down