diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 32b3856..fca57d7 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -19,6 +19,7 @@ get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError @@ -48,6 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index b64f67a..2d02d41 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -363,6 +363,11 @@ def align( msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}" target_gene = scoreset_metadata.target_genes[target_label] alignment_results[target_label] = _get_best_match(blat_result, target_gene) + # confirm that there is an alignment result for each target gene + for target_gene in scoreset_metadata.target_genes: + if target_gene not in alignment_results: + msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}" + raise AlignmentError(msg) return alignment_results diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 7ccf26c..107f885 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -24,6 +24,7 @@ ScoresetNotSupportedError, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError @@ -332,6 +333,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) + metadata = 
def patch_target_sequence_type(
    metadata: ScoresetMetadata, records: dict
) -> ScoresetMetadata:
    """Treat a DNA target as a protein target when all of its variants are protein-level.

    This avoids BLAT errors in cases where the target sequence was codon-optimized
    for a non-human organism.

    A target is patched only when all of the following hold:

    * ``records`` holds at least one record for the target's label,
    * the target's sequence type is ``TargetSequenceType.DNA``, and
    * every one of those records has an ``hgvs_pro`` value that is neither empty
      nor the literal string ``"NA"``.

    Targets without any records are left untouched: with no variants there is no
    evidence that the score set is protein-level, so the DNA sequence is kept.

    :param metadata: scoreset metadata; qualifying entries of ``target_genes`` are
        modified in place
    :param records: mapping from target label to that target's variant records
    :return: the same ``metadata`` object that was passed in
    """
    for target_label, target in metadata.target_genes.items():
        target_records = records.get(target_label)
        if not target_records:
            # ``all()`` over an empty iterable is True, so without this guard a
            # target with zero variants would be converted on no evidence at all.
            continue
        if target.target_sequence_type != TargetSequenceType.DNA:
            continue

        all_protein = all(
            record.hgvs_pro and record.hgvs_pro != "NA" for record in target_records
        )
        if not all_protein:
            continue

        msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because all variants are protein-level"
        _logger.info(msg)
        # Replace the sequence before flipping the type so the two fields are
        # only ever updated together on the success path.
        target.target_sequence = _get_protein_sequence(target.target_sequence)
        target.target_sequence_type = TargetSequenceType.PROTEIN
    return metadata