diff --git a/src/mavedb/data_providers/services.py b/src/mavedb/data_providers/services.py index 3b241befb..bff4bdb4c 100644 --- a/src/mavedb/data_providers/services.py +++ b/src/mavedb/data_providers/services.py @@ -1,10 +1,10 @@ import os -from datetime import date -from typing import Optional, TypedDict +from typing import Optional -import requests from cdot.hgvs.dataproviders import SeqFetcher, ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider +from mavedb.lib.mapping import VRSMap + GENOMIC_FASTA_FILES = [ "/data/GCF_000001405.39_GRCh38.p13_genomic.fna.gz", "/data/GCF_000001405.25_GRCh37.p13_genomic.fna.gz", @@ -21,29 +21,5 @@ def cdot_rest() -> RESTDataProvider: return RESTDataProvider(seqfetcher=seqfetcher()) -class VRSMap: - url: str - - class ScoreSetMappingResults(TypedDict): - metadata: Optional[dict[str, str]] - dcd_mapping_version: str - mapped_date_utc: date - computed_genomic_reference_sequence: Optional[dict[str, str]] - mapped_genomic_reference_sequence: Optional[dict[str, str]] - computed_protein_reference_sequence: Optional[dict[str, str]] - mapped_protein_reference_sequence: Optional[dict[str, str]] - mapped_scores: Optional[list[dict]] - error_message: Optional[str] - - def __init__(self, url: str) -> None: - self.url = url - - def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults: - uri = f"{self.url}/api/v1/map/{score_set_urn}" - response = requests.post(uri) - response.raise_for_status() - return response.json() - - def vrs_mapper(url: Optional[str] = None) -> VRSMap: return VRSMap(DCD_MAP_URL) if not url else VRSMap(url) diff --git a/src/mavedb/lib/mapping.py b/src/mavedb/lib/mapping.py new file mode 100644 index 000000000..fa608f8c2 --- /dev/null +++ b/src/mavedb/lib/mapping.py @@ -0,0 +1,31 @@ +from datetime import date +from typing import Optional, TypedDict, Union + +import requests + +ANNOTATION_LAYERS = { + "g": "genomic", + "p": "protein", + "c": "cdna", +} + + +class VRSMap: + url: str + + class ScoreSetMappingResults(TypedDict): + metadata: Optional[dict[str, str]] + dcd_mapping_version: str + mapped_date_utc: date + reference_sequences: Optional[dict[str, dict[str, dict[str, dict[str, Union[str, list[str]]]]]]] + mapped_scores: Optional[list[dict]] + error_message: Optional[str] + + def __init__(self, url: str) -> None: + self.url = url + + def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults: + uri = f"{self.url}/api/v1/map/{score_set_urn}" + response = requests.post(uri) + response.raise_for_status() + return response.json() diff --git a/src/mavedb/scripts/populate_mapped_variants.py b/src/mavedb/scripts/populate_mapped_variants.py index d1adbd421..ca4b251a4 100644 --- a/src/mavedb/scripts/populate_mapped_variants.py +++ b/src/mavedb/scripts/populate_mapped_variants.py @@ -8,11 +8,12 @@ from sqlalchemy.orm import Session from mavedb.data_providers.services import vrs_mapper +from mavedb.lib.exceptions import NonexistentMappingReferenceError from mavedb.lib.logging.context import format_raised_exception_info_as_dict +from mavedb.lib.mapping import ANNOTATION_LAYERS from mavedb.models.enums.mapping_state import MappingState from mavedb.models.score_set import ScoreSet from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.target_gene import TargetGene from mavedb.models.variant import Variant from mavedb.scripts.environment import script_environment, with_database_session @@ -91,47 +92,43 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all db.commit() logger.info(f"No mapped variants available for {score_set.urn}.") else: - computed_genomic_ref = mapped_scoreset.get("computed_genomic_reference_sequence") - mapped_genomic_ref = mapped_scoreset.get("mapped_genomic_reference_sequence") - computed_protein_ref = mapped_scoreset.get("computed_protein_reference_sequence") - mapped_protein_ref = mapped_scoreset.get("mapped_protein_reference_sequence") - - # assumes one target gene per score set, which is currently true in mavedb as of sept. 2024. - target_gene = db.scalars( - select(TargetGene) - .join(ScoreSet) - .where( - ScoreSet.urn == str(score_set.urn), + reference_metadata = mapped_scoreset.get("reference_sequences") + if not reference_metadata: + raise NonexistentMappingReferenceError() + + for target_gene_identifier in reference_metadata: + target_gene = next( + ( + target_gene + for target_gene in score_set.target_genes + if target_gene.name == target_gene_identifier + ), + None, ) - ).one() - - excluded_pre_mapped_keys = {"sequence"} - if computed_genomic_ref and mapped_genomic_ref: - pre_mapped_metadata = computed_genomic_ref - target_gene.pre_mapped_metadata = cast( - { - "genomic": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + if not target_gene: + raise ValueError( + f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}." + ) + # allow for multiple annotation layers + pre_mapped_metadata = {} + post_mapped_metadata = {} + excluded_pre_mapped_keys = {"sequence"} + for annotation_layer in reference_metadata[target_gene_identifier]: + layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "computed_reference_sequence" + ) + if layer_premapped: + pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = { + k: layer_premapped[k] + for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys } - }, - JSONB, - ) - target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB) - elif computed_protein_ref and mapped_protein_ref: - pre_mapped_metadata = computed_protein_ref - target_gene.pre_mapped_metadata = cast( - { - "protein": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys - } - }, - JSONB, - ) - target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB) - else: - raise ValueError(f"incomplete or inconsistent metadata for score set {score_set.urn}") + layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "mapped_reference_sequence" + ) + if layer_postmapped: + post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped + target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB) + target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB) mapped_variants = [ variant_from_mapping(db=db, mapping=mapped_score, dcd_mapping_version=dcd_mapping_version) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 74d645a04..3eced28a4 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -35,6 +35,7 @@ NonexistentMappingResultsError, ) from mavedb.lib.logging.context import format_raised_exception_info_as_dict +from mavedb.lib.mapping import ANNOTATION_LAYERS from mavedb.lib.score_sets import ( columns_for_dataset, create_variants, @@ -390,55 +391,43 @@ async def map_variants_for_score_set( score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": mapping_results.get("error_message")} else: - # TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support: - # this assumes single-target mapping, will need to be changed to support multi-target mapping - # just in case there are multiple target genes in the db for a score set (this point shouldn't be reached - # while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence. - # TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support: - # this also assumes that the score set is based on a target sequence, not a target accession - - computed_genomic_ref = mapping_results.get("computed_genomic_reference_sequence") - mapped_genomic_ref = mapping_results.get("mapped_genomic_reference_sequence") - computed_protein_ref = mapping_results.get("computed_protein_reference_sequence") - mapped_protein_ref = mapping_results.get("mapped_protein_reference_sequence") - - if computed_genomic_ref: - target_sequence = computed_genomic_ref["sequence"] # noqa: F841 - elif computed_protein_ref: - target_sequence = computed_protein_ref["sequence"] # noqa: F841 - else: + reference_metadata = mapping_results.get("reference_sequences") + if not reference_metadata: raise NonexistentMappingReferenceError() - # TODO(VariantEffect/dcd_mapping2#2): Handle variant mappings for score sets with more than 1 target. - target_gene = score_set.target_genes[0] - - excluded_pre_mapped_keys = {"sequence"} - if computed_genomic_ref and mapped_genomic_ref: - pre_mapped_metadata = computed_genomic_ref - target_gene.pre_mapped_metadata = cast( - { - "genomic": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys - } - }, - JSONB, + for target_gene_identifier in reference_metadata: + target_gene = next( + ( + target_gene + for target_gene in score_set.target_genes + if target_gene.name == target_gene_identifier + ), + None, ) - target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB) - elif computed_protein_ref and mapped_protein_ref: - pre_mapped_metadata = computed_protein_ref - target_gene.pre_mapped_metadata = cast( - { - "protein": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + if not target_gene: + raise ValueError( + f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}." + ) + # allow for multiple annotation layers + pre_mapped_metadata = {} + post_mapped_metadata = {} + excluded_pre_mapped_keys = {"sequence"} + for annotation_layer in reference_metadata[target_gene_identifier]: + layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "computed_reference_sequence" + ) + if layer_premapped: + pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = { + k: layer_premapped[k] + for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys } - }, - JSONB, - ) - target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB) - else: - raise NonexistentMappingReferenceError() + layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "mapped_reference_sequence" + ) + if layer_postmapped: + post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped + target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB) + target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB) total_variants = 0 successful_mapped_variants = 0 diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index 199ff1b47..c70ce6247 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -773,6 +773,147 @@ "officialCollections": [], } +TEST_MINIMAL_MULTI_TARGET_SCORESET = { + "title": "Test Multi Target Score Set Title", + "shortDescription": "Test multi target score set", + "abstractText": "Abstract", + "methodText": "Methods", + "licenseId": 1, + "targetGenes": [ + { + "name": "TEST3", + "category": "protein_coding", + "externalIdentifiers": [], + "targetSequence": { + "sequenceType": "dna", + "sequence": "ACGTTT", + "label": "TEST3", + "taxonomy": { + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + { + "name": "TEST4", + "category": "protein_coding", + "externalIdentifiers": [], + "targetSequence": { + "sequenceType": "dna", + "sequence": "TAATGCC", + "label": "TEST4", + "taxonomy": { + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + ], +} + +TEST_MINIMAL_MULTI_TARGET_SCORESET_RESPONSE = { + "recordType": "ScoreSet", + "title": "Test Multi Target Score Set Title", + "shortDescription": "Test multi target score set", + "abstractText": "Abstract", + "methodText": "Methods", + "createdBy": { + "recordType": "User", + "firstName": TEST_USER["first_name"], + "lastName": TEST_USER["last_name"], + "orcidId": TEST_USER["username"], + }, + "modifiedBy": { + "recordType": "User", + "firstName": TEST_USER["first_name"], + "lastName": TEST_USER["last_name"], + "orcidId": TEST_USER["username"], + }, + "creationDate": date.today().isoformat(), + "modificationDate": date.today().isoformat(), + "license": { + "recordType": "ShortLicense", + **{camelize(k): v for k, v in TEST_LICENSE.items() if k not in ("text",)}, + }, + "numVariants": 0, + "targetGenes": [ + { + "recordType": "TargetGene", + "name": "TEST3", + "category": "protein_coding", + "externalIdentifiers": [], + "id": 1, + "targetSequence": { + "recordType": "TargetSequence", + "sequenceType": "dna", + "sequence": "ACGTTT", + "label": "TEST3", + "taxonomy": { + "recordType": "Taxonomy", + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + { + "recordType": "TargetGene", + "name": "TEST4", + "category": "protein_coding", + "externalIdentifiers": [], + "id": 1, + "targetSequence": { + "recordType": "TargetSequence", + "sequenceType": "dna", + "sequence": "TAATGCC", + "label": "TEST4", + "taxonomy": { + "recordType": "Taxonomy", + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + ], + "metaAnalyzesScoreSetUrns": [], + "metaAnalyzedByScoreSetUrns": [], + "contributors": [], + "doiIdentifiers": [], + "primaryPublicationIdentifiers": [], + "secondaryPublicationIdentifiers": [], + "datasetColumns": {}, + "externalLinks": {}, + "private": True, + "experiment": TEST_MINIMAL_EXPERIMENT_RESPONSE, + # keys to be set after receiving response + "urn": None, + "processingState": ProcessingState.incomplete.name, + "officialCollections": [], +} + TEST_NT_CDOT_TRANSCRIPT = { "start_codon": 0, "stop_codon": 18, @@ -832,17 +973,47 @@ } } -TEST_VARIANT_MAPPING_SCAFFOLD = { +TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD = { "metadata": {}, - "computed_genomic_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.ref_test", - "sequence": "ACGTTT", + "reference_sequences": { + "TEST1": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test", + "sequence": "ACGTTT", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + } }, - "mapped_genomic_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.map_test", - "sequence_accessions": ["NC_000001.11"], + "mapped_scores": [], + "vrs_version": "2.0", + "dcd_mapping_version": "pytest.0.0", + "mapped_date_utc": datetime.isoformat(datetime.now()), +} + +TEST_ACC_SCORESET_VARIANT_MAPPING_SCAFFOLD = { + "metadata": {}, + "reference_sequences": { + "TEST2": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test", + "sequence": "ACGTTT", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + } }, "mapped_scores": [], "vrs_version": "2.0", @@ -850,6 +1021,43 @@ "mapped_date_utc": datetime.isoformat(datetime.now()), } +TEST_MULTI_TARGET_SCORESET_VARIANT_MAPPING_SCAFFOLD = { + "metadata": {}, + "reference_sequences": { + "TEST3": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test3", + "sequence": "ACGTTT", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + }, + "TEST4": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test4", + "sequence": "TAATGCC", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + }, + }, + "mapped_scores": [], + "vrs_version": "2.0", + "dcd_mapping_version": "pytest.0.0", + "mapped_date_utc": datetime.isoformat(datetime.now()), +} TEST_MINIMAL_MAPPED_VARIANT = { "pre_mapped": {}, diff --git a/tests/helpers/util/score_set.py b/tests/helpers/util/score_set.py index 69ff7ca5a..13c96f91d 100644 --- a/tests/helpers/util/score_set.py +++ b/tests/helpers/util/score_set.py @@ -16,6 +16,7 @@ from tests.helpers.constants import ( TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_SEQ_SCORESET, + TEST_MINIMAL_MULTI_TARGET_SCORESET, TEST_NT_CDOT_TRANSCRIPT, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK, @@ -67,6 +68,24 @@ def create_acc_score_set( return response_data +def create_multi_target_score_set( + client: TestClient, experiment_urn: Optional[str], update: Optional[Dict[str, Any]] = None +) -> Dict[str, Any]: + score_set_payload = deepcopy(TEST_MINIMAL_MULTI_TARGET_SCORESET) + if experiment_urn is not None: + score_set_payload["experimentUrn"] = experiment_urn + if update is not None: + score_set_payload.update(update) + jsonschema.validate(instance=score_set_payload, schema=ScoreSetCreate.schema()) + + response = client.post("/api/v1/score-sets/", json=score_set_payload) + assert response.status_code == 200, "Could not create sequence based score set" + + response_data = response.json() + jsonschema.validate(instance=response_data, schema=ScoreSet.schema()) + return response_data + + def create_seq_score_set_with_mapped_variants( client, db, data_provider, experiment_urn, scores_csv_path, update=None, counts_csv_path=None ): diff --git a/tests/worker/data/counts_multi_target.csv b/tests/worker/data/counts_multi_target.csv new file mode 100644 index 000000000..37a1f200d --- /dev/null +++ b/tests/worker/data/counts_multi_target.csv @@ -0,0 +1,4 @@ +hgvs_nt,c_0,c_1 +TEST3:n.1A>T,10,20 +TEST3:n.6T>A,90,2 +TEST4:n.2A>T,15,4 diff --git a/tests/worker/data/scores_multi_target.csv b/tests/worker/data/scores_multi_target.csv new file mode 100644 index 000000000..11dcc55fb --- /dev/null +++ b/tests/worker/data/scores_multi_target.csv @@ -0,0 +1,4 @@ +hgvs_nt,score +TEST3:n.1A>T,0.3 +TEST3:n.6T>A,-1.65 +TEST4:n.2A>T,0.1 diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index b7b350b0a..cdd66c901 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -40,6 +40,7 @@ from tests.helpers.constants import ( + TEST_ACC_SCORESET_VARIANT_MAPPING_SCAFFOLD, TEST_CLINGEN_SUBMISSION_RESPONSE, TEST_CLINGEN_SUBMISSION_BAD_RESQUEST_RESPONSE, TEST_CLINGEN_SUBMISSION_UNAUTHORIZED_RESPONSE, @@ -47,8 +48,10 @@ TEST_NT_CDOT_TRANSCRIPT, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_EXPERIMENT, + TEST_MINIMAL_MULTI_TARGET_SCORESET, TEST_MINIMAL_SEQ_SCORESET, - TEST_VARIANT_MAPPING_SCAFFOLD, + TEST_MULTI_TARGET_SCORESET_VARIANT_MAPPING_SCAFFOLD, + TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD, VALID_NT_ACCESSION, TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS1_X, TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, @@ -57,16 +60,18 @@ ) from tests.helpers.util.exceptions import awaitable_exception from tests.helpers.util.experiment import create_experiment -from tests.helpers.util.score_set import create_seq_score_set +from tests.helpers.util.score_set import create_acc_score_set, create_multi_target_score_set, create_seq_score_set @pytest.fixture def populate_worker_db(data_files, client): # create score set via API. In production, the API would invoke this worker job experiment = create_experiment(client) - score_set = create_seq_score_set(client, experiment["urn"]) + seq_score_set = create_seq_score_set(client, experiment["urn"]) + acc_score_set = create_acc_score_set(client, experiment["urn"]) + multi_target_score_set = create_multi_target_score_set(client, experiment["urn"]) - return score_set["urn"] + return [seq_score_set["urn"], acc_score_set["urn"], multi_target_score_set["urn"]] async def setup_records_and_files(async_client, data_files, input_score_set): @@ -85,8 +90,16 @@ async def setup_records_and_files(async_client, data_files, input_score_set): score_set = score_set_response.json() jsonschema.validate(instance=score_set, schema=ScoreSet.schema()) - scores_fp = "scores.csv" if "targetSequence" in score_set["targetGenes"][0] else "scores_acc.csv" - counts_fp = "counts.csv" if "targetSequence" in score_set["targetGenes"][0] else "counts_acc.csv" + scores_fp = ( + "scores_multi_target.csv" + if len(score_set["targetGenes"]) > 1 + else ("scores.csv" if "targetSequence" in score_set["targetGenes"][0] else "scores_acc.csv") + ) + counts_fp = ( + "counts_multi_target.csv" + if len(score_set["targetGenes"]) > 1 + else ("counts.csv" if "targetSequence" in score_set["targetGenes"][0] else "counts_acc.csv") + ) with ( open(data_files / scores_fp, "rb") as score_file, open(data_files / counts_fp, "rb") as count_file, @@ -150,10 +163,20 @@ async def sanitize_mapping_queue(standalone_worker_context, score_set): assert int(queued_job.decode("utf-8")) == score_set.id -async def setup_mapping_output(async_client, session, score_set, empty=False): +async def setup_mapping_output( + async_client, session, score_set, score_set_is_seq_based=True, score_set_is_multi_target=False, empty=False +): score_set_response = await async_client.get(f"/api/v1/score-sets/{score_set.urn}") - mapping_output = deepcopy(TEST_VARIANT_MAPPING_SCAFFOLD) + if score_set_is_seq_based: + if score_set_is_multi_target: + # If this is a multi-target sequence based score set, use the scaffold for that. + mapping_output = deepcopy(TEST_MULTI_TARGET_SCORESET_VARIANT_MAPPING_SCAFFOLD) + else: + mapping_output = deepcopy(TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD) + else: + # there is not currently a multi-target accession-based score set test + mapping_output = deepcopy(TEST_ACC_SCORESET_VARIANT_MAPPING_SCAFFOLD) mapping_output["metadata"] = score_set_response.json() if empty: @@ -194,6 +217,13 @@ async def setup_mapping_output(async_client, session, score_set, empty=False): ], }, ), + ( + TEST_MINIMAL_MULTI_TARGET_SCORESET, + { + "exception": "encountered 1 invalid variant strings.", + "detail": ["target sequence mismatch for 'n.1T>A' at row 0 for sequence TEST3"], + }, + ), ], ) async def test_create_variants_for_score_set_with_validation_error( @@ -210,8 +240,10 @@ async def test_create_variants_for_score_set_with_validation_error( if input_score_set == TEST_MINIMAL_SEQ_SCORESET: scores.loc[:, HGVS_NT_COLUMN].iloc[0] = "c.1T>A" - else: + elif input_score_set == TEST_MINIMAL_ACC_SCORESET: scores.loc[:, HGVS_NT_COLUMN].iloc[0] = f"{VALID_NT_ACCESSION}:c.1T>A" + elif input_score_set == TEST_MINIMAL_MULTI_TARGET_SCORESET: + scores.loc[:, HGVS_NT_COLUMN].iloc[0] = "TEST3:n.1T>A" with ( patch.object( @@ -242,7 +274,9 @@ async def test_create_variants_for_score_set_with_validation_error( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_caught_exception( input_score_set, setup_worker_db, @@ -276,7 +310,9 @@ async def test_create_variants_for_score_set_with_caught_exception( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_caught_base_exception( input_score_set, setup_worker_db, @@ -309,7 +345,9 @@ async def test_create_variants_for_score_set_with_caught_base_exception( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_existing_variants( input_score_set, setup_worker_db, @@ -365,7 +403,9 @@ async def test_create_variants_for_score_set_with_existing_variants( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_existing_exceptions( input_score_set, setup_worker_db, @@ -429,7 +469,9 @@ async def test_create_variants_for_score_set_with_existing_exceptions( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set( input_score_set, setup_worker_db, @@ -467,7 +509,9 @@ async def test_create_variants_for_score_set( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_enqueues_manager_and_successful_mapping( input_score_set, setup_worker_db, @@ -477,11 +521,13 @@ async def test_create_variants_for_score_set_enqueues_manager_and_successful_map arq_worker, arq_redis, ): + score_set_is_seq = all(["targetSequence" in target for target in input_score_set["targetGenes"]]) + score_set_is_multi_target = len(input_score_set["targetGenes"]) > 1 score_set_urn, scores, counts = await setup_records_and_files(async_client, data_files, input_score_set) score_set = session.scalars(select(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set_urn)).one() async def dummy_mapping_job(): - return await setup_mapping_output(async_client, session, score_set) + return await setup_mapping_output(async_client, session, score_set, score_set_is_seq, score_set_is_multi_target) async def dummy_submission_job(): return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] @@ -511,7 +557,7 @@ async def dummy_linking_job(): await arq_worker.run_check() # Call data provider _get_transcript method if this is an accession based score set, otherwise do not. - if all(["targetSequence" in target for target in input_score_set["targetGenes"]]): + if score_set_is_seq: hdp.assert_not_called() else: hdp.assert_called_once() @@ -533,7 +579,9 @@ async def dummy_linking_job(): @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_exception_skips_mapping( input_score_set, setup_worker_db,