diff --git a/README.md b/README.md index 35e2ed9..ceba936 100644 --- a/README.md +++ b/README.md @@ -56,8 +56,9 @@ You may wish to catch this exception and proceed with your own code logic # proceed with your code flow e.g. print(e) # 404 (invalid reference genome) -### Example Usage +## Example Command Line Usage +### Single Variants or Small Batches You may download and run the run.py python file after installation of the package to test the api client directly e.g. @@ -75,6 +76,20 @@ Run for a list of available options +### Large Batches + +When retrieving information for large batches, you may use `batchRequestClient.py`. +It accepts a text file with one variant per line and outputs a json file. e.g. + + ./batchRequestClient.py -i vars.txt -o test.json -k 'your token' -n 10 + +Run + + ./batchRequestClient.py -h + +for a list of available options + +## Reference To view available request parameters (used in the params method parameter) refer to an example at [api.varsome.com](https://api.varsome.com) or the [api documentation](http://docs.varsome.apiary.io). diff --git a/batchRequestClient.py b/batchRequestClient.py old mode 100644 new mode 100755 index 03f7fad..548a429 --- a/batchRequestClient.py +++ b/batchRequestClient.py @@ -2,13 +2,15 @@ # A simple client application that does the following: # - Loads a text file containing one variant per row -# - Performs a batch lookups to the Saphetor Variant API using N variants at a time. +# - Performs batch lookups to the Saphetor Variant API using n variants at a time. # - Saves the results in a new file. # # It uses the following module: -# variantapi.client (https://github.com/saphetor/variant-api-client-python) -# -# Note: To sort output json file execute: +# variantapi.client +# https://github.com/saphetor/variant-api-client-python +# +# Note: +# To sort output json file execute: # jq -S '.' 
output.txt > output_sorted.txt import argparse @@ -22,64 +24,90 @@ __author__ = 'stephanos-androutsellis' -# Declare reference genome as a global variable -_ref_genome = 1019 - - def main(argv): - # Read and parse arguments - infile = '' - outfile = '' - batch_size = 5000 - - parser = argparse.ArgumentParser(description='Simple batch lookup Client application') - parser.add_argument('-i', help='Input file', type=str, metavar='Input File', required=True) - parser.add_argument('-o', help='Output file', type=str, metavar='Output File', required=True) - parser.add_argument('-n', help="Number of variants to batch", type=int, metavar='Batch size', required=True,default=5000) - parser.add_argument('-k', help='Your key to the API', type=str, metavar='API Key', required=False) - parser.add_argument('-g', help='Reference genome either 1019 (default) or 1038', type=int, - metavar='Reference Genome', required=False, default=1019) - - args = parser.parse_args() - infile = args.i - outfile = args.o - batch_size = args.n if args.n is not None else batch_size - api_key = args.k - ref_genome = args.g if args.g is not None else _ref_genome + infile = '' + outfile = '' - # Open and load input file into list - print("Reading input file ", infile) - with open(infile) as fi: - variants = fi.readlines() - variants = [v.strip('\n') for v in variants] + parser = argparse.ArgumentParser( + description='Simple batch lookup Client application. 
' + ) + parser.add_argument('-i', + help='Input file', + type=str, + metavar='Input File', + required=True + ) + parser.add_argument('-o', + help='Output file', + type=str, + metavar='Output File', + required=True + ) + parser.add_argument('-n', + help="Number of variants per GET request", + type=int, + metavar='Batch size', + required=False, + default=10000 + ) + parser.add_argument('-k', + help='Your key to the API', + type=str, + metavar='API Key', + required=False + ) + parser.add_argument('-g', + help='Reference genome either hg19 (default) or hg38', + type=str, + metavar='Reference Genome', + required=False, + default='hg19' + ) + parser.add_argument('-p', + help='Request parameters ' + 'e.g. add-all-data=1 expand-pubmed-articles=0', + type=str, + metavar='Request Params', + required=False, + nargs='+' + ) - # Prepare output for writing. - print("Opening output file ", outfile) - fo = open(outfile,'w') + args = parser.parse_args() + infile = args.i + outfile = args.o + batch_size = args.n + api_key = args.k + ref_genome = args.g + request_parameters = None + if args.p: + request_parameters = {param[0]: param[1] for param in [ + param.split("=") for param in args.p + ] + } - # Initialize client connection to API - api = VariantAPIClient(api_key) - if (api is None): - print("Failed to connect to API") - sys.exit() + # Open and load input file into list + print("Reading input file ", infile) + with open(infile) as fi: + variants = fi.readlines() + variants = [v.strip('\n') for v in variants] - batch_counter = 0 - finished = False - while not finished: - start_index = batch_counter*batch_size - end_index = (batch_counter+1)*batch_size - if (end_index > len(variants)): - end_index = len(variants) - finished = True - print(start_index, ":", end_index-1) - batch_variants = variants[start_index:end_index] - print("Lookup for: ", batch_variants, "with ref_genome= ", ref_genome) - batch_data = api.batch_lookup(batch_variants, ref_genome=ref_genome) + # Initialize 
client connection to API + api = VariantAPIClient(api_key, max_variants_per_batch=batch_size) + if (api is None): + print("Failed to connect to API") + sys.exit() - fo.write(json.dumps(batch_data, indent=2)) - batch_counter += 1 + print("posting GET requests... ", end='') + results = api.batch_lookup( + variants, + params=request_parameters, + ref_genome=ref_genome + ) + print("done") - print ("Finished ", batch_counter, " batch lookups") + print("writing output file ", outfile) + with open (outfile, 'w') as fo: + fo.write(json.dumps(results, indent=4)) if __name__ == '__main__': main(argv) diff --git a/run.py b/run.py index d52e909..8cf3696 100755 --- a/run.py +++ b/run.py @@ -14,29 +14,55 @@ def main(argv): parser = argparse.ArgumentParser(description='Sample Variant API calls') - parser.add_argument('-k', help='Your key to the API', type=str, metavar='API Key', required=False) - parser.add_argument('-g', help='Reference genome either hg19 or hg38', type=str, metavar='Reference Genome', - required=False, default='hg19') + parser.add_argument('-k', + help='Your key to the API', + type=str, + metavar='API Key', + required=False + ) + parser.add_argument('-g', + help='Reference genome either hg19 or hg38', + type=str, + metavar='Reference Genome', + required=False, + default='hg19' + ) parser.add_argument('-q', - help='Query to lookup in the API e.g. chr19:20082943:1:G or in case of batch request ' - 'e.g. chr19:20082943:1:G rs113488022', - type=str, metavar='Query', required=True, nargs='+') + help='Query to lookup in the API e.g. chr19:20082943:1:G ' + 'or in case of batch request ' + 'e.g. chr19:20082943:1:G rs113488022', + type=str, + metavar='Query', + required=True, + nargs='+' + ) parser.add_argument('-p', - help='Request parameters e.g. add-all-data=1 expand-pubmed-articles=0', - type=str, metavar='Request Params', required=False, nargs='+') + help='Request parameters ' + 'e.g. 
add-all-data=1 expand-pubmed-articles=0', + type=str, + metavar='Request Params', + required=False, + nargs='+' + ) args = parser.parse_args() api_key = args.k query = args.q ref_genome = args.g request_parameters = None if args.p: - request_parameters = {param[0]: param[1] for param in [param.split("=") for param in args.p]} + request_parameters = {param[0]: param[1] for param in [ + param.split("=") for param in args.p + ] + } + api = VariantAPIClient(api_key) + if len(query) == 1: result = api.lookup(query[0], params=request_parameters, ref_genome=ref_genome) else: if api_key is None: - sys.exit("You need to pass an api key to perform batch requests") + sys.exit("You need to pass an api key to perform batch requests" + "consider using batchRequestClient.py for large batch lookups") result = api.batch_lookup(query, params=request_parameters, ref_genome=ref_genome) sys.stdout.write(json.dumps(result, indent=4, sort_keys=True) if result else "No result") sys.stdout.write("\n") diff --git a/variantapi/client.py b/variantapi/client.py index 3adbc84..61d7955 100644 --- a/variantapi/client.py +++ b/variantapi/client.py @@ -87,7 +87,7 @@ class VariantAPIClient(VariantAPIClientBase): lookup_path = "/lookup/%s/%s" batch_lookup_path = "/lookup/batch/%s" - def __init__(self, api_key=None, max_variants_per_batch=200): + def __init__(self, api_key=None, max_variants_per_batch=10000): super(VariantAPIClient, self).__init__(api_key) self.max_variants_per_batch = max_variants_per_batch @@ -106,7 +106,10 @@ def lookup(self, query, params=None, ref_genome='hg19'): return self.get(self.lookup_path % (query, ref_genome), params=params) def batch_lookup(self, variants, params=None, ref_genome='hg19'): - """ + """return list of query results for all variants. + + split variants into chunks of size max_variants_per_batch. send one POST request per chunk, + but return combined results. 
:param variants: list of variant representations :param params: dictionary of key value pairs for http GET parameters. Refer to the api documentation @@ -115,9 +118,11 @@ def batch_lookup(self, variants, params=None, ref_genome='hg19'): :return: list of dictionaries with annotations per variant refer to https://api.varsome.com/lookup/schema for dictionary properties """ + n = self.max_variants_per_batch + chunks = [variants[i:i+n] for i in range(0, len(variants), n)] + results = [] - for queries in [variants[x:x + self.max_variants_per_batch] for x in range(0, len(variants), - self.max_variants_per_batch)]: - data = self.post(self.batch_lookup_path % ref_genome, params=params, json_data={'variants': queries}) + for chunk in chunks: + data = self.post(self.batch_lookup_path % ref_genome, params=params, json_data={'variants': chunk}) results.extend(data) return results