From 6dc2781c6a9c5c4da354325ad38f609fa40e6c24 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 17 Jul 2025 11:28:06 -0700 Subject: [PATCH 1/4] Raise AlignmentError if no alignment result for target --- src/dcd_mapping/align.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index b64f67a..2d02d41 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -363,6 +363,11 @@ def align( msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}" target_gene = scoreset_metadata.target_genes[target_label] alignment_results[target_label] = _get_best_match(blat_result, target_gene) + # confirm that there is an alignment result for each target gene + for target_gene in scoreset_metadata.target_genes: + if target_gene not in alignment_results: + msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}" + raise AlignmentError(msg) return alignment_results From 377a8c441f8fb6f52eeb8ab64057f1b2c0e68041 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 17 Jul 2025 11:28:48 -0700 Subject: [PATCH 2/4] Translate NT target sequence for targets with protein-level variants If a target has only protein-level variants, but the provided target sequence is a nucleotide sequence, translate the nucleotide sequence to an amino acid sequence immediately after metadata ingestion. This change avoids alignment errors that can occur when a target sequence has been codon-optimized for a non-human organism. Since we do not have sufficient metadata to determine whether a target sequence has been codon-optimized, always perform translation when there are no nucleotide-level variants for a target. 
--- src/api/routers/map.py | 2 ++ src/dcd_mapping/main.py | 2 ++ src/dcd_mapping/mavedb_data.py | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 32b3856..8b27573 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -16,6 +16,7 @@ from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, + correct_target_sequence_type, get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, @@ -48,6 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) + metadata = correct_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 7ccf26c..4966ca5 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -22,6 +22,7 @@ ) from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, + correct_target_sequence_type, get_scoreset_metadata, get_scoreset_records, with_mavedb_score_set, @@ -332,6 +333,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) + metadata = correct_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index a137e30..582a873 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -30,8 +30,10 @@ ScoresetMapping, ScoresetMetadata, TargetGene, + TargetSequenceType, UniProtRef, ) +from dcd_mapping.transcripts import _get_protein_sequence __all__ = [ "get_scoreset_urns", @@ 
-324,6 +326,28 @@ def get_scoreset_records( return _load_scoreset_records(scores_csv, metadata) +def correct_target_sequence_type( + metadata: ScoresetMetadata, records: dict +) -> ScoresetMetadata: + """If target sequence type is DNA but all variants are protein-level, change to protein. + This avoids BLAT errors in cases where the target sequence was codon-optimized + for a non-human organism + """ + for target_label, target in metadata.target_genes.items(): + if target.target_sequence_type == TargetSequenceType.DNA: + all_protein = True + for record in records.get(target_label, []): + if record.hgvs_pro == "NA" or not record.hgvs_pro: + all_protein = False + break + if all_protein: + msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because all variants are protein-level" + _logger.info(msg) + target.target_sequence = _get_protein_sequence(target.target_sequence) + target.target_sequence_type = TargetSequenceType.PROTEIN + return metadata + + def with_mavedb_score_set(fn: Callable) -> Callable: @wraps(fn) async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002 From 01427c4bc0c1ca055bbeed19e0f2245e2c40f65b Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 18 Jul 2025 13:58:41 -0700 Subject: [PATCH 3/4] Change function name to patch_target_sequence_type --- src/api/routers/map.py | 4 ++-- src/dcd_mapping/main.py | 4 ++-- src/dcd_mapping/mavedb_data.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 8b27573..fca57d7 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -16,10 +16,10 @@ from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, - correct_target_sequence_type, get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import 
ResourceAcquisitionError @@ -49,7 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) - metadata = correct_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 4966ca5..107f885 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -22,9 +22,9 @@ ) from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, - correct_target_sequence_type, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError @@ -333,7 +333,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) - metadata = correct_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 582a873..2cab736 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -326,7 +326,7 @@ def get_scoreset_records( return _load_scoreset_records(scores_csv, metadata) -def correct_target_sequence_type( +def patch_target_sequence_type( metadata: ScoresetMetadata, records: dict ) -> ScoresetMetadata: """If target sequence type is DNA but all variants are protein-level, change to protein. 
From d60e81a0cb29477301ed3b3b8980d4b2005eae7b Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 18 Jul 2025 14:01:31 -0700 Subject: [PATCH 4/4] Bump version number This update changes how alignment is performed for some score sets, so bump major version. --- src/dcd_mapping/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py index 8c31b0b..848b44d 100644 --- a/src/dcd_mapping/version.py +++ b/src/dcd_mapping/version.py @@ -1,3 +1,3 @@ """Provide dcd mapping version""" -dcd_mapping_version = "2025.1.0" +dcd_mapping_version = "2025.2.0"