From a68de0a79dfd33cf1d1da7f4808dac87c778c769 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 22 May 2025 11:06:06 -0700 Subject: [PATCH 1/6] Support mapper update for multi-target score sets --- src/mavedb/worker/jobs.py | 80 ++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 74d645a04..b3511fd75 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -50,6 +50,7 @@ from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.mapped_variant import MappedVariant from mavedb.models.published_variant import PublishedVariantsMV +from mavedb.models.target_gene import TargetGene from mavedb.models.score_set import ScoreSet from mavedb.models.user import User from mavedb.models.variant import Variant @@ -247,6 +248,12 @@ async def create_variants_for_score_set( # Mapping variants #################################################################################################### +ANNOTATION_LAYERS = { + "g": "genomic", + "p": "protein", + "c": "cdna", +} + @asynccontextmanager async def mapping_in_execution(redis: ArqRedis, job_id: str): @@ -397,48 +404,43 @@ async def map_variants_for_score_set( # TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support: # this also assumes that the score set is based on a target sequence, not a target accession - computed_genomic_ref = mapping_results.get("computed_genomic_reference_sequence") - mapped_genomic_ref = mapping_results.get("mapped_genomic_reference_sequence") - computed_protein_ref = mapping_results.get("computed_protein_reference_sequence") - mapped_protein_ref = mapping_results.get("mapped_protein_reference_sequence") - - if computed_genomic_ref: - target_sequence = computed_genomic_ref["sequence"] # noqa: F841 - elif computed_protein_ref: - target_sequence = computed_protein_ref["sequence"] # noqa: F841 - else: + reference_metadata = mapping_results.get("reference_sequences") + if not reference_metadata: raise NonexistentMappingReferenceError() - # TODO(VariantEffect/dcd_mapping2#2): Handle variant mappings for score sets with more than 1 target. - target_gene = score_set.target_genes[0] - - excluded_pre_mapped_keys = {"sequence"} - if computed_genomic_ref and mapped_genomic_ref: - pre_mapped_metadata = computed_genomic_ref - target_gene.pre_mapped_metadata = cast( - { - "genomic": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys - } - }, - JSONB, - ) - target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB) - elif computed_protein_ref and mapped_protein_ref: - pre_mapped_metadata = computed_protein_ref - target_gene.pre_mapped_metadata = cast( - { - "protein": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + for target_gene_identifier in reference_metadata: + target_gene = db.scalars( + select( + TargetGene.where( + TargetGene.name == target_gene_identifier, TargetGene.score_set_id == score_set.id + ) + ) + ).one_or_none() + if not target_gene: + raise ValueError( + f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}." + ) + # allow for multiple annotation layers + pre_mapped_metadata = {} + post_mapped_metadata = {} + excluded_pre_mapped_keys = {"sequence"} + for annotation_layer in reference_metadata[target_gene_identifier]: + layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "computed_reference_sequence" + ) + if layer_premapped: + pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = { + k: layer_premapped[k] + for k in set(list(layer_premapped.keys())) + - excluded_pre_mapped_keys # TODO does this work if no 'sequence' key? } - }, - JSONB, - ) - target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB) - else: - raise NonexistentMappingReferenceError() + layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "mapped_reference_sequence" + ) + if layer_postmapped: + post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped + target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB) + target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB) total_variants = 0 successful_mapped_variants = 0 From 3e165bce8f3777b6e8abc08ee2c1e8627512ef9b Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Jun 2025 11:54:52 -0700 Subject: [PATCH 2/6] Fix bugs in mapper job, and centralize some mapper resources variant_mapper_manager previously failed because of an incorrectly placed 'where' function and because we did two related db queries, one for a score set and one for target gene(s) within that score set, which resulted in an error when adding the score set changes to the db. Instead, query to select the score set, and then loop through the score set's target genes rather than querying the target genes table in the db. --- src/mavedb/data_providers/services.py | 30 +++-------------------- src/mavedb/lib/mapping.py | 34 +++++++++++++++++++++++++++ src/mavedb/worker/jobs.py | 33 ++++++++------------------ 3 files changed, 47 insertions(+), 50 deletions(-) create mode 100644 src/mavedb/lib/mapping.py diff --git a/src/mavedb/data_providers/services.py b/src/mavedb/data_providers/services.py index 3b241befb..bff4bdb4c 100644 --- a/src/mavedb/data_providers/services.py +++ b/src/mavedb/data_providers/services.py @@ -1,10 +1,10 @@ import os -from datetime import date -from typing import Optional, TypedDict +from typing import Optional -import requests from cdot.hgvs.dataproviders import SeqFetcher, ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider +from mavedb.lib.mapping import VRSMap + GENOMIC_FASTA_FILES = [ "/data/GCF_000001405.39_GRCh38.p13_genomic.fna.gz", "/data/GCF_000001405.25_GRCh37.p13_genomic.fna.gz", @@ -21,29 +21,5 @@ def cdot_rest() -> RESTDataProvider: return RESTDataProvider(seqfetcher=seqfetcher()) -class VRSMap: - url: str - - class ScoreSetMappingResults(TypedDict): - metadata: Optional[dict[str, str]] - dcd_mapping_version: str - mapped_date_utc: date - computed_genomic_reference_sequence: Optional[dict[str, str]] - mapped_genomic_reference_sequence: Optional[dict[str, str]] - computed_protein_reference_sequence: Optional[dict[str, str]] - mapped_protein_reference_sequence: Optional[dict[str, str]] - mapped_scores: Optional[list[dict]] - error_message: Optional[str] - - def __init__(self, url: str) -> None: - self.url = url - - def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults: - uri = f"{self.url}/api/v1/map/{score_set_urn}" - response = requests.post(uri) - response.raise_for_status() - return response.json() - - def vrs_mapper(url: Optional[str] = None) -> VRSMap: return VRSMap(DCD_MAP_URL) if not url else VRSMap(url) diff --git a/src/mavedb/lib/mapping.py b/src/mavedb/lib/mapping.py new file mode 100644 index 000000000..7727199fe --- /dev/null +++ b/src/mavedb/lib/mapping.py @@ -0,0 +1,34 @@ +from datetime import date +from typing import Optional, TypedDict + +import requests + +ANNOTATION_LAYERS = { + "g": "genomic", + "p": "protein", + "c": "cdna", +} + + +class VRSMap: + url: str + + class ScoreSetMappingResults(TypedDict): + metadata: Optional[dict[str, str]] + dcd_mapping_version: str + mapped_date_utc: date + computed_genomic_reference_sequence: Optional[dict[str, str]] + mapped_genomic_reference_sequence: Optional[dict[str, str]] + computed_protein_reference_sequence: Optional[dict[str, str]] + mapped_protein_reference_sequence: Optional[dict[str, str]] + mapped_scores: Optional[list[dict]] + error_message: Optional[str] + + def __init__(self, url: str) -> None: + self.url = url + + def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults: + uri = f"{self.url}/api/v1/map/{score_set_urn}" + response = requests.post(uri) + response.raise_for_status() + return response.json() diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index b3511fd75..3eced28a4 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -35,6 +35,7 @@ NonexistentMappingResultsError, ) from mavedb.lib.logging.context import format_raised_exception_info_as_dict +from mavedb.lib.mapping import ANNOTATION_LAYERS from mavedb.lib.score_sets import ( columns_for_dataset, create_variants, @@ -50,7 +51,6 @@ from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.mapped_variant import MappedVariant from mavedb.models.published_variant import PublishedVariantsMV -from mavedb.models.target_gene import TargetGene from mavedb.models.score_set import ScoreSet from mavedb.models.user import User from mavedb.models.variant import Variant @@ -248,12 +248,6 @@ async def create_variants_for_score_set( # Mapping variants #################################################################################################### -ANNOTATION_LAYERS = { - "g": "genomic", - "p": "protein", - "c": "cdna", -} - @asynccontextmanager async def mapping_in_execution(redis: ArqRedis, job_id: str): @@ -397,25 +391,19 @@ async def map_variants_for_score_set( score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": mapping_results.get("error_message")} else: - # TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support: - # this assumes single-target mapping, will need to be changed to support multi-target mapping - # just in case there are multiple target genes in the db for a score set (this point shouldn't be reached - # while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence. - # TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support: - # this also assumes that the score set is based on a target sequence, not a target accession - reference_metadata = mapping_results.get("reference_sequences") if not reference_metadata: raise NonexistentMappingReferenceError() for target_gene_identifier in reference_metadata: - target_gene = db.scalars( - select( - TargetGene.where( - TargetGene.name == target_gene_identifier, TargetGene.score_set_id == score_set.id - ) - ) - ).one_or_none() + target_gene = next( + ( + target_gene + for target_gene in score_set.target_genes + if target_gene.name == target_gene_identifier + ), + None, + ) if not target_gene: raise ValueError( f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}." @@ -431,8 +419,7 @@ async def map_variants_for_score_set( if layer_premapped: pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = { k: layer_premapped[k] - for k in set(list(layer_premapped.keys())) - - excluded_pre_mapped_keys # TODO does this work if no 'sequence' key? + for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys } layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get( "mapped_reference_sequence" From 0d6efc67af39a9b64ff2b4ea711c22edb49b79b4 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Jun 2025 11:59:00 -0700 Subject: [PATCH 3/6] Update tests to reflect multi-target mapper changes --- tests/helpers/constants.py | 48 +++++++++++++++++++++++++++++++------- tests/worker/test_jobs.py | 15 ++++++++---- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index 199ff1b47..af1ca17f5 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -832,17 +832,47 @@ } } -TEST_VARIANT_MAPPING_SCAFFOLD = { +TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD = { "metadata": {}, - "computed_genomic_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.ref_test", - "sequence": "ACGTTT", + "reference_sequences": { + "TEST1": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test", + "sequence": "ACGTTT", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + } }, - "mapped_genomic_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.map_test", - "sequence_accessions": ["NC_000001.11"], + "mapped_scores": [], + "vrs_version": "2.0", + "dcd_mapping_version": "pytest.0.0", + "mapped_date_utc": datetime.isoformat(datetime.now()), +} + +TEST_ACC_SCORESET_VARIANT_MAPPING_SCAFFOLD = { + "metadata": {}, + "reference_sequences": { + "TEST2": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test", + "sequence": "ACGTTT", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + } }, "mapped_scores": [], "vrs_version": "2.0", diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index b7b350b0a..117ebed1f 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -40,6 +40,7 @@ from tests.helpers.constants import ( + TEST_ACC_SCORESET_VARIANT_MAPPING_SCAFFOLD, TEST_CLINGEN_SUBMISSION_RESPONSE, TEST_CLINGEN_SUBMISSION_BAD_RESQUEST_RESPONSE, TEST_CLINGEN_SUBMISSION_UNAUTHORIZED_RESPONSE, @@ -48,7 +49,7 @@ TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_EXPERIMENT, TEST_MINIMAL_SEQ_SCORESET, - TEST_VARIANT_MAPPING_SCAFFOLD, + TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD, VALID_NT_ACCESSION, TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS1_X, TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, @@ -150,10 +151,13 @@ async def sanitize_mapping_queue(standalone_worker_context, score_set): assert int(queued_job.decode("utf-8")) == score_set.id -async def setup_mapping_output(async_client, session, score_set, empty=False): +async def setup_mapping_output(async_client, session, score_set, score_set_is_seq_based=True, empty=False): score_set_response = await async_client.get(f"/api/v1/score-sets/{score_set.urn}") - mapping_output = deepcopy(TEST_VARIANT_MAPPING_SCAFFOLD) + if score_set_is_seq_based: + mapping_output = deepcopy(TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD) + else: + mapping_output = deepcopy(TEST_ACC_SCORESET_VARIANT_MAPPING_SCAFFOLD) mapping_output["metadata"] = score_set_response.json() if empty: @@ -477,11 +481,12 @@ async def test_create_variants_for_score_set_enqueues_manager_and_successful_map arq_worker, arq_redis, ): + score_set_is_seq = all(["targetSequence" in target for target in input_score_set["targetGenes"]]) score_set_urn, scores, counts = await setup_records_and_files(async_client, data_files, input_score_set) score_set = session.scalars(select(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set_urn)).one() async def dummy_mapping_job(): - return await setup_mapping_output(async_client, session, score_set) + return await setup_mapping_output(async_client, session, score_set, score_set_is_seq) async def dummy_submission_job(): return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] @@ -511,7 +516,7 @@ async def dummy_linking_job(): await arq_worker.run_check() # Call data provider _get_transcript method if this is an accession based score set, otherwise do not. - if all(["targetSequence" in target for target in input_score_set["targetGenes"]]): + if score_set_is_seq: hdp.assert_not_called() else: hdp.assert_called_once() From 06d199630320867b5b4b5661d3694257a1739fcd Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 9 Jun 2025 14:14:13 -0700 Subject: [PATCH 4/6] Update VRSMap class and script for multi-target mapper changes --- src/mavedb/lib/mapping.py | 7 +- .../scripts/populate_mapped_variants.py | 77 +++++++++---------- 2 files changed, 39 insertions(+), 45 deletions(-) diff --git a/src/mavedb/lib/mapping.py b/src/mavedb/lib/mapping.py index 7727199fe..fa608f8c2 100644 --- a/src/mavedb/lib/mapping.py +++ b/src/mavedb/lib/mapping.py @@ -1,5 +1,5 @@ from datetime import date -from typing import Optional, TypedDict +from typing import Optional, TypedDict, Union import requests @@ -17,10 +17,7 @@ class ScoreSetMappingResults(TypedDict): metadata: Optional[dict[str, str]] dcd_mapping_version: str mapped_date_utc: date - computed_genomic_reference_sequence: Optional[dict[str, str]] - mapped_genomic_reference_sequence: Optional[dict[str, str]] - computed_protein_reference_sequence: Optional[dict[str, str]] - mapped_protein_reference_sequence: Optional[dict[str, str]] + reference_sequences: Optional[dict[str, dict[str, dict[str, dict[str, Union[str, list[str]]]]]]] mapped_scores: Optional[list[dict]] error_message: Optional[str] diff --git a/src/mavedb/scripts/populate_mapped_variants.py b/src/mavedb/scripts/populate_mapped_variants.py index d1adbd421..ca4b251a4 100644 --- a/src/mavedb/scripts/populate_mapped_variants.py +++ b/src/mavedb/scripts/populate_mapped_variants.py @@ -8,11 +8,12 @@ from sqlalchemy.orm import Session from mavedb.data_providers.services import vrs_mapper +from mavedb.lib.exceptions import NonexistentMappingReferenceError from mavedb.lib.logging.context import format_raised_exception_info_as_dict +from mavedb.lib.mapping import ANNOTATION_LAYERS from mavedb.models.enums.mapping_state import MappingState from mavedb.models.score_set import ScoreSet from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.target_gene import TargetGene from mavedb.models.variant import Variant from mavedb.scripts.environment import script_environment, with_database_session @@ -91,47 +92,43 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all db.commit() logger.info(f"No mapped variants available for {score_set.urn}.") else: - computed_genomic_ref = mapped_scoreset.get("computed_genomic_reference_sequence") - mapped_genomic_ref = mapped_scoreset.get("mapped_genomic_reference_sequence") - computed_protein_ref = mapped_scoreset.get("computed_protein_reference_sequence") - mapped_protein_ref = mapped_scoreset.get("mapped_protein_reference_sequence") - - # assumes one target gene per score set, which is currently true in mavedb as of sept. 2024. - target_gene = db.scalars( - select(TargetGene) - .join(ScoreSet) - .where( - ScoreSet.urn == str(score_set.urn), + reference_metadata = mapped_scoreset.get("reference_sequences") + if not reference_metadata: + raise NonexistentMappingReferenceError() + + for target_gene_identifier in reference_metadata: + target_gene = next( + ( + target_gene + for target_gene in score_set.target_genes + if target_gene.name == target_gene_identifier + ), + None, ) - ).one() - - excluded_pre_mapped_keys = {"sequence"} - if computed_genomic_ref and mapped_genomic_ref: - pre_mapped_metadata = computed_genomic_ref - target_gene.pre_mapped_metadata = cast( - { - "genomic": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + if not target_gene: + raise ValueError( + f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}." + ) + # allow for multiple annotation layers + pre_mapped_metadata = {} + post_mapped_metadata = {} + excluded_pre_mapped_keys = {"sequence"} + for annotation_layer in reference_metadata[target_gene_identifier]: + layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "computed_reference_sequence" + ) + if layer_premapped: + pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = { + k: layer_premapped[k] + for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys } - }, - JSONB, - ) - target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB) - elif computed_protein_ref and mapped_protein_ref: - pre_mapped_metadata = computed_protein_ref - target_gene.pre_mapped_metadata = cast( - { - "protein": { - k: pre_mapped_metadata[k] - for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys - } - }, - JSONB, - ) - target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB) - else: - raise ValueError(f"incomplete or inconsistent metadata for score set {score_set.urn}") + layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get( + "mapped_reference_sequence" + ) + if layer_postmapped: + post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped + target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB) + target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB) mapped_variants = [ variant_from_mapping(db=db, mapping=mapped_score, dcd_mapping_version=dcd_mapping_version) From deabbc28e15dab81326b1a577627240dc79784ca Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 10 Jun 2025 13:55:29 -0700 Subject: [PATCH 5/6] Create accession-based score set in db before mapping tests --- tests/worker/test_jobs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index 117ebed1f..05de6e339 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -58,16 +58,17 @@ ) from tests.helpers.util.exceptions import awaitable_exception from tests.helpers.util.experiment import create_experiment -from tests.helpers.util.score_set import create_seq_score_set +from tests.helpers.util.score_set import create_acc_score_set, create_seq_score_set @pytest.fixture def populate_worker_db(data_files, client): # create score set via API. In production, the API would invoke this worker job experiment = create_experiment(client) - score_set = create_seq_score_set(client, experiment["urn"]) + seq_score_set = create_seq_score_set(client, experiment["urn"]) + acc_score_set = create_acc_score_set(client, experiment["urn"]) - return score_set["urn"] + return [seq_score_set["urn"], acc_score_set["urn"]] async def setup_records_and_files(async_client, data_files, input_score_set): From ccbe20f77b4c8622ef31bebbb417447f641ae989 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 11 Jun 2025 08:19:06 -0700 Subject: [PATCH 6/6] Add worker tests for multi-target scoresets --- tests/helpers/constants.py | 178 ++++++++++++++++++++++ tests/helpers/util/score_set.py | 19 +++ tests/worker/data/counts_multi_target.csv | 4 + tests/worker/data/scores_multi_target.csv | 4 + tests/worker/test_jobs.py | 72 +++++++-- 5 files changed, 262 insertions(+), 15 deletions(-) create mode 100644 tests/worker/data/counts_multi_target.csv create mode 100644 tests/worker/data/scores_multi_target.csv diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index af1ca17f5..c70ce6247 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -773,6 +773,147 @@ "officialCollections": [], } +TEST_MINIMAL_MULTI_TARGET_SCORESET = { + "title": "Test Multi Target Score Set Title", + "shortDescription": "Test multi target score set", + "abstractText": "Abstract", + "methodText": "Methods", + "licenseId": 1, + "targetGenes": [ + { + "name": "TEST3", + "category": "protein_coding", + "externalIdentifiers": [], + "targetSequence": { + "sequenceType": "dna", + "sequence": "ACGTTT", + "label": "TEST3", + "taxonomy": { + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + { + "name": "TEST4", + "category": "protein_coding", + "externalIdentifiers": [], + "targetSequence": { + "sequenceType": "dna", + "sequence": "TAATGCC", + "label": "TEST4", + "taxonomy": { + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + ], +} + +TEST_MINIMAL_MULTI_TARGET_SCORESET_RESPONSE = { + "recordType": "ScoreSet", + "title": "Test Multi Target Score Set Title", + "shortDescription": "Test multi target score set", + "abstractText": "Abstract", + "methodText": "Methods", + "createdBy": { + "recordType": "User", + "firstName": TEST_USER["first_name"], + "lastName": TEST_USER["last_name"], + "orcidId": TEST_USER["username"], + }, + "modifiedBy": { + "recordType": "User", + "firstName": TEST_USER["first_name"], + "lastName": TEST_USER["last_name"], + "orcidId": TEST_USER["username"], + }, + "creationDate": date.today().isoformat(), + "modificationDate": date.today().isoformat(), + "license": { + "recordType": "ShortLicense", + **{camelize(k): v for k, v in TEST_LICENSE.items() if k not in ("text",)}, + }, + "numVariants": 0, + "targetGenes": [ + { + "recordType": "TargetGene", + "name": "TEST3", + "category": "protein_coding", + "externalIdentifiers": [], + "id": 1, + "targetSequence": { + "recordType": "TargetSequence", + "sequenceType": "dna", + "sequence": "ACGTTT", + "label": "TEST3", + "taxonomy": { + "recordType": "Taxonomy", + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + { + "recordType": "TargetGene", + "name": "TEST4", + "category": "protein_coding", + "externalIdentifiers": [], + "id": 1, + "targetSequence": { + "recordType": "TargetSequence", + "sequenceType": "dna", + "sequence": "TAATGCC", + "label": "TEST4", + "taxonomy": { + "recordType": "Taxonomy", + "taxId": TEST_TAXONOMY["tax_id"], + "organismName": TEST_TAXONOMY["organism_name"], + "commonName": TEST_TAXONOMY["common_name"], + "rank": TEST_TAXONOMY["rank"], + "hasDescribedSpeciesName": TEST_TAXONOMY["has_described_species_name"], + "articleReference": TEST_TAXONOMY["article_reference"], + "id": TEST_TAXONOMY["id"], + "url": TEST_TAXONOMY["url"], + }, + }, + }, + ], + "metaAnalyzesScoreSetUrns": [], + "metaAnalyzedByScoreSetUrns": [], + "contributors": [], + "doiIdentifiers": [], + "primaryPublicationIdentifiers": [], + "secondaryPublicationIdentifiers": [], + "datasetColumns": {}, + "externalLinks": {}, + "private": True, + "experiment": TEST_MINIMAL_EXPERIMENT_RESPONSE, + # keys to be set after receiving response + "urn": None, + "processingState": ProcessingState.incomplete.name, + "officialCollections": [], +} + TEST_NT_CDOT_TRANSCRIPT = { "start_codon": 0, "stop_codon": 18, @@ -880,6 +1021,43 @@ "mapped_date_utc": datetime.isoformat(datetime.now()), } +TEST_MULTI_TARGET_SCORESET_VARIANT_MAPPING_SCAFFOLD = { + "metadata": {}, + "reference_sequences": { + "TEST3": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test3", + "sequence": "ACGTTT", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + }, + "TEST4": { + "g": { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test4", + "sequence": "TAATGCC", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + } + }, + }, + "mapped_scores": [], + "vrs_version": "2.0", + "dcd_mapping_version": "pytest.0.0", + "mapped_date_utc": datetime.isoformat(datetime.now()), +} TEST_MINIMAL_MAPPED_VARIANT = { "pre_mapped": {}, diff --git a/tests/helpers/util/score_set.py b/tests/helpers/util/score_set.py index 69ff7ca5a..13c96f91d 100644 --- a/tests/helpers/util/score_set.py +++ b/tests/helpers/util/score_set.py @@ -16,6 +16,7 @@ from tests.helpers.constants import ( TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_SEQ_SCORESET, + TEST_MINIMAL_MULTI_TARGET_SCORESET, TEST_NT_CDOT_TRANSCRIPT, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK, @@ -67,6 +68,24 @@ def create_acc_score_set( return response_data +def create_multi_target_score_set( + client: TestClient, experiment_urn: Optional[str], update: Optional[Dict[str, Any]] = None +) -> Dict[str, Any]: + score_set_payload = deepcopy(TEST_MINIMAL_MULTI_TARGET_SCORESET) + if experiment_urn is not None: + score_set_payload["experimentUrn"] = experiment_urn + if update is not None: + score_set_payload.update(update) + jsonschema.validate(instance=score_set_payload, schema=ScoreSetCreate.schema()) + + response = client.post("/api/v1/score-sets/", json=score_set_payload) + assert response.status_code == 200, "Could not create sequence based score set" + + response_data = response.json() + jsonschema.validate(instance=response_data, schema=ScoreSet.schema()) + return response_data + + def create_seq_score_set_with_mapped_variants( client, db, data_provider, experiment_urn, scores_csv_path, update=None, counts_csv_path=None ): diff --git a/tests/worker/data/counts_multi_target.csv b/tests/worker/data/counts_multi_target.csv new file mode 100644 index 000000000..37a1f200d --- /dev/null +++ b/tests/worker/data/counts_multi_target.csv @@ -0,0 +1,4 @@ +hgvs_nt,c_0,c_1 +TEST3:n.1A>T,10,20 +TEST3:n.6T>A,90,2 +TEST4:n.2A>T,15,4 diff --git a/tests/worker/data/scores_multi_target.csv b/tests/worker/data/scores_multi_target.csv new file mode 100644 index 000000000..11dcc55fb --- /dev/null +++ b/tests/worker/data/scores_multi_target.csv @@ -0,0 +1,4 @@ +hgvs_nt,score +TEST3:n.1A>T,0.3 +TEST3:n.6T>A,-1.65 +TEST4:n.2A>T,0.1 diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index 05de6e339..cdd66c901 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -48,7 +48,9 @@ TEST_NT_CDOT_TRANSCRIPT, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_EXPERIMENT, + TEST_MINIMAL_MULTI_TARGET_SCORESET, TEST_MINIMAL_SEQ_SCORESET, + TEST_MULTI_TARGET_SCORESET_VARIANT_MAPPING_SCAFFOLD, TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD, VALID_NT_ACCESSION, TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS1_X, @@ -58,7 +60,7 @@ ) from tests.helpers.util.exceptions import awaitable_exception from tests.helpers.util.experiment import create_experiment -from tests.helpers.util.score_set import create_acc_score_set, create_seq_score_set +from tests.helpers.util.score_set import create_acc_score_set, create_multi_target_score_set, create_seq_score_set @pytest.fixture @@ -67,8 +69,9 @@ def populate_worker_db(data_files, client): experiment = create_experiment(client) seq_score_set = create_seq_score_set(client, experiment["urn"]) acc_score_set = create_acc_score_set(client, experiment["urn"]) + multi_target_score_set = create_multi_target_score_set(client, experiment["urn"]) - return [seq_score_set["urn"], acc_score_set["urn"]] + return [seq_score_set["urn"], acc_score_set["urn"], multi_target_score_set["urn"]] async def setup_records_and_files(async_client, data_files, input_score_set): @@ -87,8 +90,16 @@ async def setup_records_and_files(async_client, data_files, input_score_set): score_set = score_set_response.json() jsonschema.validate(instance=score_set, schema=ScoreSet.schema()) - scores_fp = "scores.csv" if "targetSequence" in score_set["targetGenes"][0] else "scores_acc.csv" - counts_fp = "counts.csv" if "targetSequence" in score_set["targetGenes"][0] else "counts_acc.csv" + scores_fp = ( + "scores_multi_target.csv" + if len(score_set["targetGenes"]) > 1 + else ("scores.csv" if "targetSequence" in score_set["targetGenes"][0] else "scores_acc.csv") + ) + counts_fp = ( + "counts_multi_target.csv" + if len(score_set["targetGenes"]) > 1 + else ("counts.csv" if "targetSequence" in score_set["targetGenes"][0] else "counts_acc.csv") + ) with ( open(data_files / scores_fp, "rb") as score_file, open(data_files / counts_fp, "rb") as count_file, @@ -152,12 +163,19 @@ async def sanitize_mapping_queue(standalone_worker_context, score_set): assert int(queued_job.decode("utf-8")) == score_set.id -async def setup_mapping_output(async_client, session, score_set, score_set_is_seq_based=True, empty=False): +async def setup_mapping_output( + async_client, session, score_set, score_set_is_seq_based=True, score_set_is_multi_target=False, empty=False +): score_set_response = await async_client.get(f"/api/v1/score-sets/{score_set.urn}") if score_set_is_seq_based: - mapping_output = deepcopy(TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD) + if score_set_is_multi_target: + # If this is a multi-target sequence based score set, use the scaffold for that. + mapping_output = deepcopy(TEST_MULTI_TARGET_SCORESET_VARIANT_MAPPING_SCAFFOLD) + else: + mapping_output = deepcopy(TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD) else: + # there is not currently a multi-target accession-based score set test mapping_output = deepcopy(TEST_ACC_SCORESET_VARIANT_MAPPING_SCAFFOLD) mapping_output["metadata"] = score_set_response.json() @@ -199,6 +217,13 @@ async def setup_mapping_output(async_client, session, score_set, score_set_is_se ], }, ), + ( + TEST_MINIMAL_MULTI_TARGET_SCORESET, + { + "exception": "encountered 1 invalid variant strings.", + "detail": ["target sequence mismatch for 'n.1T>A' at row 0 for sequence TEST3"], + }, + ), ], ) async def test_create_variants_for_score_set_with_validation_error( @@ -215,8 +240,10 @@ async def test_create_variants_for_score_set_with_validation_error( if input_score_set == TEST_MINIMAL_SEQ_SCORESET: scores.loc[:, HGVS_NT_COLUMN].iloc[0] = "c.1T>A" - else: + elif input_score_set == TEST_MINIMAL_ACC_SCORESET: scores.loc[:, HGVS_NT_COLUMN].iloc[0] = f"{VALID_NT_ACCESSION}:c.1T>A" + elif input_score_set == TEST_MINIMAL_MULTI_TARGET_SCORESET: + scores.loc[:, HGVS_NT_COLUMN].iloc[0] = "TEST3:n.1T>A" with ( patch.object( @@ -247,7 +274,9 @@ async def test_create_variants_for_score_set_with_validation_error( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_caught_exception( input_score_set, setup_worker_db, @@ -281,7 +310,9 @@ async def test_create_variants_for_score_set_with_caught_exception( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_caught_base_exception( input_score_set, setup_worker_db, @@ -314,7 +345,9 @@ async def test_create_variants_for_score_set_with_caught_base_exception( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_existing_variants( input_score_set, setup_worker_db, @@ -370,7 +403,9 @@ async def test_create_variants_for_score_set_with_existing_variants( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_with_existing_exceptions( input_score_set, setup_worker_db, @@ -434,7 +469,9 @@ async def test_create_variants_for_score_set_with_existing_exceptions( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set( input_score_set, setup_worker_db, @@ -472,7 +509,9 @@ async def test_create_variants_for_score_set( @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_enqueues_manager_and_successful_mapping( input_score_set, setup_worker_db, @@ -483,11 +522,12 @@ async def test_create_variants_for_score_set_enqueues_manager_and_successful_map arq_redis, ): score_set_is_seq = all(["targetSequence" in target for target in input_score_set["targetGenes"]]) + score_set_is_multi_target = len(input_score_set["targetGenes"]) > 1 score_set_urn, scores, counts = await setup_records_and_files(async_client, data_files, input_score_set) score_set = session.scalars(select(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set_urn)).one() async def dummy_mapping_job(): - return await setup_mapping_output(async_client, session, score_set, score_set_is_seq) + return await setup_mapping_output(async_client, session, score_set, score_set_is_seq, score_set_is_multi_target) async def dummy_submission_job(): return [TEST_CLINGEN_SUBMISSION_RESPONSE, None] @@ -539,7 +579,9 @@ async def dummy_linking_job(): @pytest.mark.asyncio -@pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) +@pytest.mark.parametrize( + "input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_MULTI_TARGET_SCORESET) +) async def test_create_variants_for_score_set_exception_skips_mapping( input_score_set, setup_worker_db,