From 14e5d9e70d79361d99f691affe3e53b9dea4393d Mon Sep 17 00:00:00 2001 From: Leopold von Seckendorff Date: Fri, 29 Dec 2017 16:25:39 +0100 Subject: [PATCH] fixed a number of problems related to batch lookups. batch lookups were being limited by the var `max_variants_per_batch` to 200 variants per GET request, which was inconsistent with the recommended 10,000 variants per batch; this was corrected in client.py along with some small readability changes. batchRequestClient.py * now uses standard default values implemented in argparse, and takes advantage of the "chunking" mechanism already implemented in the client batch_lookup function. Note that this breaks the "chunk reporting" functionality of the previous implementation, where a notice was printed to STDOUT for each chunk. Additional messages were added for the request and writing the output file. * The option to pass parameters via CLI was added in a way which is consistent with run.py. * ref_genome was converted to use strings instead of integers, to be consistent with documentation. run.py was modified for readability and to be consistent with batchRequestClient. Note that the default values from argparse are separate from those found in the client. Calling the variantapi client from batchRequestClient.py overrides the default value for max_variants_per_batch. README was modified to include batchRequestClient. --- README.md | 17 +++++- batchRequestClient.py | 138 +++++++++++++++++++++++++----------------- run.py | 46 +++++++++++--- variantapi/client.py | 15 +++-- 4 files changed, 145 insertions(+), 71 deletions(-) mode change 100644 => 100755 batchRequestClient.py diff --git a/README.md b/README.md index 35e2ed9..ceba936 100644 --- a/README.md +++ b/README.md @@ -56,8 +56,9 @@ You may wish to catch this exception and proceed with your own code logic # proceed with your code flow e.g. 
print(e) # 404 (invalid reference genome) -### Example Usage +## Example Command Line Usage +### Single Variants or Small Batches You may download and run the run.py python file after installation of the package to test the api client directly e.g. @@ -75,6 +76,20 @@ Run for a list of available options +### Large Batches + +When retrieving information for large batches, you may use `batchRequestClient.py`. +It accepts a text file with one variant per line and outputs a json file. e.g. + + ./batchRequestClient.py -i vars.txt -o test.json -k 'your token' -n 10 + +Run + + ./batchRequestClient.py -h + +for a list of available options + +## Reference To view available request parameters (used in the params method parameter) refer to an example at [api.varsome.com](https://api.varsome.com) or the [api documentation](http://docs.varsome.apiary.io). diff --git a/batchRequestClient.py b/batchRequestClient.py old mode 100644 new mode 100755 index 03f7fad..548a429 --- a/batchRequestClient.py +++ b/batchRequestClient.py @@ -2,13 +2,15 @@ # A simple client application that does the following: # - Loads a text file containing one variant per row -# - Performs a batch lookups to the Saphetor Variant API using N variants at a time. +# - Performs a batch lookups to the Saphetor Variant API using n variants at a time. # - Saves the results in a new file. # # It uses the following module: -# variantapi.client (https://github.com/saphetor/variant-api-client-python) -# -# Note: To sort output json file execute: +# variantapi.client +# https://github.com/saphetor/variant-api-client-python +# +# Note: +# To sort output json file execute: # jq -S '.' 
output.txt > output_sorted.txt import argparse @@ -22,64 +24,90 @@ __author__ = 'stephanos-androutsellis' -# Declare reference genome as a global variable -_ref_genome = 1019 - - def main(argv): - # Read and parse arguments - infile = '' - outfile = '' - batch_size = 5000 - - parser = argparse.ArgumentParser(description='Simple batch lookup Client application') - parser.add_argument('-i', help='Input file', type=str, metavar='Input File', required=True) - parser.add_argument('-o', help='Output file', type=str, metavar='Output File', required=True) - parser.add_argument('-n', help="Number of variants to batch", type=int, metavar='Batch size', required=True,default=5000) - parser.add_argument('-k', help='Your key to the API', type=str, metavar='API Key', required=False) - parser.add_argument('-g', help='Reference genome either 1019 (default) or 1038', type=int, - metavar='Reference Genome', required=False, default=1019) - - args = parser.parse_args() - infile = args.i - outfile = args.o - batch_size = args.n if args.n is not None else batch_size - api_key = args.k - ref_genome = args.g if args.g is not None else _ref_genome + infile = '' + outfile = '' - # Open and load input file into list - print("Reading input file ", infile) - with open(infile) as fi: - variants = fi.readlines() - variants = [v.strip('\n') for v in variants] + parser = argparse.ArgumentParser( + description='Simple batch lookup Client application. 
' + ) + parser.add_argument('-i', + help='Input file', + type=str, + metavar='Input File', + required=True + ) + parser.add_argument('-o', + help='Output file', + type=str, + metavar='Output File', + required=True + ) + parser.add_argument('-n', + help="Number of variants per GET request", + type=int, + metavar='Batch size', + required=False, + default=10000 + ) + parser.add_argument('-k', + help='Your key to the API', + type=str, + metavar='API Key', + required=False + ) + parser.add_argument('-g', + help='Reference genome either hg19 (default) or hg38', + type=str, + metavar='Reference Genome', + required=False, + default='hg19' + ) + parser.add_argument('-p', + help='Request parameters ' + 'e.g. add-all-data=1 expand-pubmed-articles=0', + type=str, + metavar='Request Params', + required=False, + nargs='+' + ) - # Prepare output for writing. - print("Opening output file ", outfile) - fo = open(outfile,'w') + args = parser.parse_args() + infile = args.i + outfile = args.o + batch_size = args.n + api_key = args.k + ref_genome = args.g + request_parameters = None + if args.p: + request_parameters = {param[0]: param[1] for param in [ + param.split("=") for param in args.p + ] + } - # Initialize client connection to API - api = VariantAPIClient(api_key) - if (api is None): - print("Failed to connect to API") - sys.exit() + # Open and load input file into list + print("Reading input file ", infile) + with open(infile) as fi: + variants = fi.readlines() + variants = [v.strip('\n') for v in variants] - batch_counter = 0 - finished = False - while not finished: - start_index = batch_counter*batch_size - end_index = (batch_counter+1)*batch_size - if (end_index > len(variants)): - end_index = len(variants) - finished = True - print(start_index, ":", end_index-1) - batch_variants = variants[start_index:end_index] - print("Lookup for: ", batch_variants, "with ref_genome= ", ref_genome) - batch_data = api.batch_lookup(batch_variants, ref_genome=ref_genome) + # Initialize 
client connection to API + api = VariantAPIClient(api_key, max_variants_per_batch=batch_size) + if (api is None): + print("Failed to connect to API") + sys.exit() - fo.write(json.dumps(batch_data, indent=2)) - batch_counter += 1 + print("posting GET requests... ", end='') + results = api.batch_lookup( + variants, + params=request_parameters, + ref_genome=ref_genome + ) + print("done") - print ("Finished ", batch_counter, " batch lookups") + print("writing output file ", outfile) + with open (outfile, 'w') as fo: + fo.write(json.dumps(results, indent=4)) if __name__ == '__main__': main(argv) diff --git a/run.py b/run.py index d52e909..8cf3696 100755 --- a/run.py +++ b/run.py @@ -14,29 +14,55 @@ def main(argv): parser = argparse.ArgumentParser(description='Sample Variant API calls') - parser.add_argument('-k', help='Your key to the API', type=str, metavar='API Key', required=False) - parser.add_argument('-g', help='Reference genome either hg19 or hg38', type=str, metavar='Reference Genome', - required=False, default='hg19') + parser.add_argument('-k', + help='Your key to the API', + type=str, + metavar='API Key', + required=False + ) + parser.add_argument('-g', + help='Reference genome either hg19 or hg38', + type=str, + metavar='Reference Genome', + required=False, + default='hg19' + ) parser.add_argument('-q', - help='Query to lookup in the API e.g. chr19:20082943:1:G or in case of batch request ' - 'e.g. chr19:20082943:1:G rs113488022', - type=str, metavar='Query', required=True, nargs='+') + help='Query to lookup in the API e.g. chr19:20082943:1:G ' + 'or in case of batch request ' + 'e.g. chr19:20082943:1:G rs113488022', + type=str, + metavar='Query', + required=True, + nargs='+' + ) parser.add_argument('-p', - help='Request parameters e.g. add-all-data=1 expand-pubmed-articles=0', - type=str, metavar='Request Params', required=False, nargs='+') + help='Request parameters ' + 'e.g. 
add-all-data=1 expand-pubmed-articles=0', + type=str, + metavar='Request Params', + required=False, + nargs='+' + ) args = parser.parse_args() api_key = args.k query = args.q ref_genome = args.g request_parameters = None if args.p: - request_parameters = {param[0]: param[1] for param in [param.split("=") for param in args.p]} + request_parameters = {param[0]: param[1] for param in [ + param.split("=") for param in args.p + ] + } + api = VariantAPIClient(api_key) + if len(query) == 1: result = api.lookup(query[0], params=request_parameters, ref_genome=ref_genome) else: if api_key is None: - sys.exit("You need to pass an api key to perform batch requests") + sys.exit("You need to pass an api key to perform batch requests" + "consider using batchRequestClient.py for large batch lookups") result = api.batch_lookup(query, params=request_parameters, ref_genome=ref_genome) sys.stdout.write(json.dumps(result, indent=4, sort_keys=True) if result else "No result") sys.stdout.write("\n") diff --git a/variantapi/client.py b/variantapi/client.py index 3adbc84..61d7955 100644 --- a/variantapi/client.py +++ b/variantapi/client.py @@ -87,7 +87,7 @@ class VariantAPIClient(VariantAPIClientBase): lookup_path = "/lookup/%s/%s" batch_lookup_path = "/lookup/batch/%s" - def __init__(self, api_key=None, max_variants_per_batch=200): + def __init__(self, api_key=None, max_variants_per_batch=10000): super(VariantAPIClient, self).__init__(api_key) self.max_variants_per_batch = max_variants_per_batch @@ -106,7 +106,10 @@ def lookup(self, query, params=None, ref_genome='hg19'): return self.get(self.lookup_path % (query, ref_genome), params=params) def batch_lookup(self, variants, params=None, ref_genome='hg19'): - """ + """return list of query results for all variants. + + split variants into chunks of size max_variants_per_batch. post GET for each chunk, + but return combined results. 
:param variants: list of variant representations :param params: dictionary of key value pairs for http GET parameters. Refer to the api documentation @@ -115,9 +118,11 @@ def batch_lookup(self, variants, params=None, ref_genome='hg19'): :return: list of dictionaries with annotations per variant refer to https://api.varsome.com/lookup/schema for dictionary properties """ + n = self.max_variants_per_batch + chunks = [variants[i:i+n] for i in range(0, len(variants), n)] + results = [] - for queries in [variants[x:x + self.max_variants_per_batch] for x in range(0, len(variants), - self.max_variants_per_batch)]: - data = self.post(self.batch_lookup_path % ref_genome, params=params, json_data={'variants': queries}) + for chunk in chunks: + data = self.post(self.batch_lookup_path % ref_genome, params=params, json_data={'variants': chunk}) results.extend(data) return results