From c8a31e7c758afe79f5a0e7481d476b85e7a7d1bf Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 6 Feb 2025 13:49:05 -0800 Subject: [PATCH 01/23] Multi-target mapping partial draft: alignment and transcript selection --- src/dcd_mapping/align.py | 82 +++++++++++++++++++------- src/dcd_mapping/lookup.py | 36 ++++++------ src/dcd_mapping/main.py | 14 ++--- src/dcd_mapping/mavedb_data.py | 69 +++++++++++++--------- src/dcd_mapping/schemas.py | 13 ++++- src/dcd_mapping/transcripts.py | 102 +++++++++++++++++++++++---------- 6 files changed, 210 insertions(+), 106 deletions(-) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index db2d3f8..903a530 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -4,11 +4,12 @@ import subprocess import tempfile from pathlib import Path +from typing import Any from urllib.parse import urlparse import requests from Bio.SearchIO import HSP -from Bio.SearchIO import read as read_blat +from Bio.SearchIO import parse as parse_blat from Bio.SearchIO._model import Hit, QueryResult from cool_seq_tool.schemas import Strand @@ -25,6 +26,7 @@ GeneLocation, ScoresetMetadata, SequenceRange, + TargetGene, TargetSequenceType, ) @@ -61,7 +63,10 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata, query_file: Path) -> :return: Yielded Path to constructed file. Deletes file once complete. 
""" _logger.debug("Writing BLAT query to %s", query_file) - lines = [">query", scoreset_metadata.target_sequence] + lines = [] + for target_gene in scoreset_metadata.target_genes: + lines.append(f">{target_gene}") + lines.append(scoreset_metadata.target_genes[target_gene].target_sequence) _write_query_file(query_file, lines) return query_file @@ -143,7 +148,28 @@ def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str: return tmp.name -def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: +def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType | str: + """Get overall target sequence type for a score set's target genes. + Protein if all target sequences are protein sequences, nucleotide if all target + sequences are nucleotide sequences, and mixed if there is a mix within the score set. + :param metadata: object containing score set attributes + :return: TargetSequenceType enum (protein or nucleotide) or string "mixed" + """ + target_sequence_types = set() + for target_gene in metadata.target_genes: + target_sequence_types.add( + metadata.target_genes[target_gene].target_sequence_type + ) + if len(target_sequence_types) > 1: + return "mixed" + elif len(target_sequence_types) == 1: # noqa: RET505 + return target_sequence_types.pop() + else: + msg = f"Target sequence types not available for score set {metadata.urn}" + raise ValueError(msg) + + +def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: ANN401 """Run a BLAT query and returns a path to the output object. 
If unable to produce a valid query the first time, then try a query using ``dnax`` @@ -151,26 +177,33 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: :param scoreset_metadata: object containing scoreset attributes :param silent: suppress BLAT command output - :return: BLAT query result + :return: dict where keys are target gene identifiers and values are BLAT query result objects :raise AlignmentError: if BLAT subprocess returns error code """ with tempfile.NamedTemporaryFile() as tmp_file: query_file = _build_query_file(metadata, Path(tmp_file.name)) - if metadata.target_sequence_type == TargetSequenceType.PROTEIN: + target_sequence_type = _get_target_sequence_type(metadata) + if target_sequence_type == TargetSequenceType.PROTEIN: target_args = "-q=prot -t=dnax" - else: + elif target_sequence_type == TargetSequenceType.DNA: target_args = "" + else: + # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. + msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." + raise NotImplementedError(msg) process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) out_file = _write_blat_output_tempfile(process_result) try: - output = read_blat(out_file, "blat-psl") + output = parse_blat(out_file, "blat-psl") + + # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? 
except ValueError: target_args = "-q=dnax -t=dnax" process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) out_file = _write_blat_output_tempfile(process_result) try: - output = read_blat(out_file, "blat-psl") + output = parse_blat(out_file, "blat-psl") except ValueError as e: msg = f"Unable to run successful BLAT on {metadata.urn}" raise AlignmentError(msg) from e @@ -178,7 +211,7 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: return output -def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: +def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit: """Get best hit from BLAT output. First, try to return hit corresponding to expected chromosome taken from scoreset @@ -186,7 +219,6 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: the hit with the single highest-scoring HSP. :param output: BLAT output - :param urn: scoreset URN to use in error messages :param chromosome: refseq chromosome ID, e.g. ``"NC_000001.11"`` :return: best Hit :raise AlignmentError: if unable to get hits from output @@ -207,8 +239,8 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: hit_chrs = [h.id for h in output] # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome _logger.warning( - "Failed to match hit chromosomes during alignment. URN: %s, expected chromosome: %s, hit chromosomes: %s", - urn, + "Failed to match hit chromosomes during alignment for target %s. Expected chromosome: %s, hit chromosomes: %s", + output.id, chromosome, hit_chrs, ) @@ -222,13 +254,13 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: best_score_hit = hit if best_score_hit is None: - msg = f"Couldn't get BLAT hits from {urn}" + msg = f"Couldn't get BLAT hits for target {output.id}." 
raise AlignmentError(msg) return best_score_hit -def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP: +def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP: """Retrieve preferred HSP from BLAT Hit object. If gene location data is available, prefer the HSP with the least distance @@ -236,7 +268,6 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP take the HSP with the highest score value. :param hit: hit object from BLAT result - :param urn: scoreset identifier for use in error messages :param gene_location: location data acquired by normalizing scoreset metadata :return: Preferred HSP object :raise AlignmentError: if hit object appears to be empty (should be impossible) @@ -252,17 +283,17 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP return best_hsp -def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> AlignmentResult: +def _get_best_match(output: QueryResult, target_gene: TargetGene) -> AlignmentResult: """Obtain best high-scoring pairs (HSP) object for query sequence. :param metadata: scoreset metadata :param output: BLAT result object :return: alignment result ?? 
""" - location = get_gene_location(metadata) + location = get_gene_location(target_gene) chromosome = location.chromosome if location else None - best_hit = _get_best_hit(output, metadata.urn, chromosome) - best_hsp = _get_best_hsp(best_hit, metadata.urn, location) + best_hit = _get_best_hit(output, chromosome) + best_hsp = _get_best_hsp(best_hit, location) strand = Strand.POSITIVE if best_hsp[0].query_strand == 1 else Strand.NEGATIVE coverage = 100 * (best_hsp.query_end - best_hsp.query_start) / output.seq_len @@ -291,12 +322,19 @@ def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> Alignmen ) -def align(scoreset_metadata: ScoresetMetadata, silent: bool = True) -> AlignmentResult: +def align( + scoreset_metadata: ScoresetMetadata, silent: bool = True +) -> dict[str, AlignmentResult]: """Align target sequence to a reference genome. :param scoreset_metadata: object containing scoreset metadata :param silent: suppress BLAT process output if true - :return: data wrapper containing alignment results + :return: dictionary where keys are target gene identifiers and values are alignment result objects """ blat_output = _get_blat_output(scoreset_metadata, silent) - return _get_best_match(blat_output, scoreset_metadata) + alignment_results = {} + for blat_result in blat_output: + target_label = blat_result.id + target_gene = scoreset_metadata.target_genes[target_label] + alignment_results[target_label] = _get_best_match(blat_result, target_gene) + return alignment_results diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index 871d885..98db726 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -46,7 +46,11 @@ from gene.query import QueryHandler from gene.schemas import MatchType, SourceName -from dcd_mapping.schemas import GeneLocation, ManeDescription, ScoresetMetadata +from dcd_mapping.schemas import ( + GeneLocation, + ManeDescription, + TargetGene, +) __all__ = [ "CoolSeqToolBuilder", @@ -287,25 +291,25 @@ def 
_get_hgnc_symbol(term: str) -> str | None: return None -def get_gene_symbol(metadata: ScoresetMetadata) -> str | None: - """Acquire HGNC gene symbol given provided metadata from scoreset. +def get_gene_symbol(target_gene: TargetGene) -> str | None: + """Acquire HGNC gene symbol given provided target gene metadata from MaveDB. Right now, we use two sources for normalizing: 1. UniProt ID, if available 2. Target name: specifically, we try the first word in the name (this could cause some problems and we should double-check it) - :param ScoresetMetadata: data given by MaveDB API + :param target_gene: target gene metadata given by MaveDB API :return: gene symbol if available """ - if metadata.target_uniprot_ref: - result = _get_hgnc_symbol(metadata.target_uniprot_ref.id) + if target_gene.target_uniprot_ref: + result = _get_hgnc_symbol(target_gene.target_uniprot_ref.id) if result: return result # try taking the first word in the target name - if metadata.target_gene_name: - parsed_name = metadata.target_gene_name.split(" ")[0] + if target_gene.target_gene_name: + parsed_name = target_gene.target_gene_name.split(" ")[0] return _get_hgnc_symbol(parsed_name) return None @@ -324,21 +328,21 @@ def _normalize_gene(term: str) -> Gene | None: def _get_normalized_gene_response( - metadata: ScoresetMetadata, + target_gene: TargetGene, ) -> Gene | None: """Fetch best normalized concept given available scoreset metadata. 
:param metadata: salient scoreset metadata items :return: Normalized gene if available """ - if metadata.target_uniprot_ref: - gene_descriptor = _normalize_gene(metadata.target_uniprot_ref.id) + if target_gene.target_uniprot_ref: + gene_descriptor = _normalize_gene(target_gene.target_uniprot_ref.id) if gene_descriptor: return gene_descriptor # try taking the first word in the target name - if metadata.target_gene_name: - parsed_name = metadata.target_gene_name.split(" ")[0] + if target_gene.target_gene_name: + parsed_name = target_gene.target_gene_name.split(" ")[0] gene_descriptor = _normalize_gene(parsed_name) if gene_descriptor: return gene_descriptor @@ -371,7 +375,7 @@ def _get_genomic_interval( return None -def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None: +def get_gene_location(target_gene: TargetGene) -> GeneLocation | None: """Acquire gene location data from gene normalizer using metadata provided by scoreset. @@ -380,10 +384,10 @@ def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None: 2. 
Target name: specifically, we try the first word in the name (this could cause some problems and we should double-check it) - :param metadata: data given by MaveDB API + :param target_gene: data given by MaveDB API :return: gene location data if available """ - gene_descriptor = _get_normalized_gene_response(metadata) + gene_descriptor = _get_normalized_gene_response(target_gene) if not gene_descriptor or not gene_descriptor.extensions: return None diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 6909ed7..ad711bc 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -33,7 +33,7 @@ ScoresetMetadata, VrsVersion, ) -from dcd_mapping.transcripts import TxSelectError, select_transcript +from dcd_mapping.transcripts import TxSelectError, select_transcripts from dcd_mapping.vrs_map import VrsMapError, vrs_map _logger = logging.getLogger(__name__) @@ -156,7 +156,7 @@ async def map_scoreset( _emit_info(f"Performing alignment for {metadata.urn}...", silent) try: - alignment_result = align(metadata, silent) + alignment_results = align(metadata, silent) except BlatNotFoundError as e: msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." 
_emit_info(msg, silent, logging.ERROR) @@ -179,7 +179,7 @@ async def map_scoreset( _emit_info("Selecting reference sequence...", silent) try: - transcript = await select_transcript(metadata, records, alignment_result) + transcripts = await select_transcripts(metadata, records, alignment_results) except (TxSelectError, KeyError, ValueError) as e: _emit_info( f"Transcript selection failed for scoreset {metadata.urn}", @@ -211,7 +211,7 @@ async def map_scoreset( _emit_info("Mapping to VRS...", silent) try: - vrs_results = vrs_map(metadata, alignment_result, records, transcript, silent) + vrs_results = vrs_map(metadata, alignment_results, records, transcripts, silent) except VrsMapError as e: _emit_info( f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR @@ -239,7 +239,7 @@ async def map_scoreset( _emit_info("Annotating metadata and saving to file...", silent) try: - vrs_results = annotate(vrs_results, transcript, metadata, vrs_version) + vrs_results = annotate(vrs_results, transcripts, metadata, vrs_version) except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors _emit_info( f"VRS annotation failed for scoreset {metadata.urn}", silent, logging.ERROR @@ -267,8 +267,8 @@ async def map_scoreset( final_output = save_mapped_output_json( metadata, vrs_results, - alignment_result, - transcript, + alignment_results, + transcripts, prefer_genomic, output_path, ) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 3654eb6..25b0e59 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -17,6 +17,7 @@ from fastapi import HTTPException from pydantic import ValidationError +from dcd_mapping.lookup import DataLookupError from dcd_mapping.resource_utils import ( LOCAL_STORE_PATH, MAVEDB_BASE_URL, @@ -24,7 +25,13 @@ authentication_header, http_download, ) -from dcd_mapping.schemas import ScoreRow, ScoresetMapping, ScoresetMetadata, 
UniProtRef +from dcd_mapping.schemas import ( + ScoreRow, + ScoresetMapping, + ScoresetMetadata, + TargetGene, + UniProtRef, +) __all__ = [ "get_scoreset_urns", @@ -174,33 +181,41 @@ def get_scoreset_metadata( :raise ResourceAcquisitionError: if unable to acquire metadata """ metadata = get_raw_scoreset_metadata(scoreset_urn, dcd_mapping_dir) + target_genes = {} + multi_target = len(metadata["targetGenes"]) > 1 + + for gene in metadata["targetGenes"]: + target_sequence_gene = gene.get("targetSequence") + if target_sequence_gene is None: + msg = f"No target sequence available for {scoreset_urn}. Accession-based score sets are not currently supported." + raise ScoresetNotSupportedError(msg) + if not _metadata_response_is_human(metadata): + msg = f"Experiment for {scoreset_urn} contains no human targets" + raise ScoresetNotSupportedError(msg) + try: + target_sequence_label = gene["targetSequence"].get("label") + if target_sequence_label is None: + # if there are not multiple targets, label is not required by mavedb, + # so use target gene name as the label. + if not multi_target: + target_sequence_label = gene["name"] + else: + msg = f"No target label provided for target in multi-target score set {scoreset_urn}." + raise DataLookupError(msg) + target_genes[target_sequence_label] = TargetGene( + target_gene_name=gene["name"], + target_gene_category=gene["category"], + target_sequence=gene["targetSequence"]["sequence"], + target_sequence_type=gene["targetSequence"]["sequenceType"], + target_sequence_label=target_sequence_label, + target_uniprot_ref=_get_uniprot_ref(metadata), + ) + except (KeyError, ValidationError) as e: + msg = f"Unable to extract metadata from API response for scoreset {scoreset_urn}: {e}" + _logger.error(msg) + raise ScoresetNotSupportedError(msg) from e - if len(metadata["targetGenes"]) > 1: - msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported." 
- raise ScoresetNotSupportedError(msg) - gene = metadata["targetGenes"][0] - target_sequence_gene = gene.get("targetSequence") - if target_sequence_gene is None: - msg = f"No target sequence available for {scoreset_urn}. Accession-based score sets are not currently supported." - raise ScoresetNotSupportedError(msg) - if not _metadata_response_is_human(metadata): - msg = f"Experiment for {scoreset_urn} contains no human targets" - raise ScoresetNotSupportedError(msg) - try: - structured_data = ScoresetMetadata( - urn=metadata["urn"], - target_gene_name=gene["name"], - target_gene_category=gene["category"], - target_sequence=gene["targetSequence"]["sequence"], - target_sequence_type=gene["targetSequence"]["sequenceType"], - target_uniprot_ref=_get_uniprot_ref(metadata), - ) - except (KeyError, ValidationError) as e: - msg = f"Unable to extract metadata from API response for scoreset {scoreset_urn}: {e}" - _logger.error(msg) - raise ScoresetNotSupportedError(msg) from e - - return structured_data + return ScoresetMetadata(urn=scoreset_urn, target_genes=target_genes) def _load_scoreset_records(path: Path) -> list[ScoreRow]: diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 366943c..6a52946 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -40,17 +40,24 @@ class UniProtRef(BaseModel): offset: int -class ScoresetMetadata(BaseModel): - """Store all relevant metadata from metadata reported for scoreset by MaveDB""" +class TargetGene(BaseModel): + """Store metadata for a target gene from a MaveDB score set""" - urn: str target_gene_name: str target_gene_category: TargetType target_sequence: str target_sequence_type: TargetSequenceType + target_sequence_label: str target_uniprot_ref: UniProtRef | None = None +class ScoresetMetadata(BaseModel): + """Store all relevant metadata from metadata reported for scoreset by MaveDB""" + + urn: str + target_genes: dict[str, TargetGene] + + class ScoreRow(BaseModel): """Row from a MAVE 
score result""" diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 56bea85..689fade 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -22,6 +22,7 @@ ManeDescription, ScoreRow, ScoresetMetadata, + TargetGene, TargetSequenceType, TargetType, TranscriptDescription, @@ -38,7 +39,7 @@ class TxSelectError(Exception): async def _get_compatible_transcripts( - metadata: ScoresetMetadata, align_result: AlignmentResult + target_gene: TargetGene, align_result: AlignmentResult ) -> list[list[str]]: """Acquire matching transcripts @@ -51,7 +52,7 @@ async def _get_compatible_transcripts( else: aligned_chrom = align_result.chrom chromosome = get_chromosome_identifier(aligned_chrom) - gene_symbol = get_gene_symbol(metadata) + gene_symbol = get_gene_symbol(target_gene) if not gene_symbol: raise TxSelectError transcript_matches = [] @@ -145,7 +146,7 @@ def _get_protein_sequence(target_sequence: str) -> str: async def _select_protein_reference( - metadata: ScoresetMetadata, align_result: AlignmentResult + target_gene: TargetGene, align_result: AlignmentResult ) -> TxSelectResult: """Select preferred transcript for protein reference sequence @@ -155,22 +156,20 @@ async def _select_protein_reference( :raise TxSelectError: if no matching MANE transcripts and unable to get UniProt ID/ reference sequence """ - matching_transcripts = await _get_compatible_transcripts(metadata, align_result) + matching_transcripts = await _get_compatible_transcripts(target_gene, align_result) if not matching_transcripts: common_transcripts = None else: common_transcripts = _reduce_compatible_transcripts(matching_transcripts) if not common_transcripts: - if not metadata.target_uniprot_ref: - msg = f"Unable to find matching transcripts for {metadata.urn}" + if not target_gene.target_uniprot_ref: + msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}" raise TxSelectError(msg) - protein_sequence = 
get_uniprot_sequence(metadata.target_uniprot_ref.id) - np_accession = metadata.target_uniprot_ref.id - ref_sequence = get_uniprot_sequence(metadata.target_uniprot_ref.id) + protein_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id) + np_accession = target_gene.target_uniprot_ref.id + ref_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id) if not ref_sequence: - msg = ( - f"Unable to grab reference sequence from uniprot.org for {metadata.urn}" - ) + msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}" raise ValueError(msg) nm_accession = None tx_mode = None @@ -186,8 +185,7 @@ async def _select_protein_reference( np_accession = best_tx.refseq_prot tx_mode = best_tx.transcript_priority - protein_sequence = _get_protein_sequence(metadata.target_sequence) - # TODO -- look at these two lines + protein_sequence = _get_protein_sequence(target_gene.target_sequence) is_full_match = ref_sequence.find(protein_sequence) != -1 start = ref_sequence.find(protein_sequence[:10]) @@ -201,10 +199,10 @@ async def _select_protein_reference( ) -def _offset_target_sequence(metadata: ScoresetMetadata, records: list[ScoreRow]) -> int: +def _offset_target_sequence(target_gene: TargetGene, records: list[ScoreRow]) -> int: """Find start location in target sequence - :param metadata: MaveDB metadata for score set + :param target_gene: MaveDB metadata for target gene :param records: individual score records (including MAVE-HGVS descriptions) :return: starting index position (may be 0) """ @@ -232,7 +230,7 @@ def _offset_target_sequence(metadata: ScoresetMetadata, records: list[ScoreRow]) amino_acids_by_position[loc] = seq1(aa) err_locs = [] - protein_sequence = Seq(metadata.target_sequence).translate(table="1") + protein_sequence = Seq(target_gene.target_sequence).translate(table="1") for i in range(len(protein_sequence)): if ( str(i) in amino_acids_by_position @@ -251,7 +249,7 @@ def 
_offset_target_sequence(metadata: ScoresetMetadata, records: list[ScoreRow]) for value in amino_acids_by_position.values(): seq += value - protein_sequence = _get_protein_sequence(metadata.target_sequence) + protein_sequence = _get_protein_sequence(target_gene.target_sequence) offset = 0 if protein_sequence in seq: @@ -308,22 +306,24 @@ def _handle_edge_cases( async def select_transcript( - metadata: ScoresetMetadata, + scoreset_urn: str, + target_gene: TargetGene, records: list[ScoreRow], align_result: AlignmentResult, ) -> TxSelectResult | None: - """Select appropriate human reference sequence for scoreset. + """Select appropriate human reference sequence for one target in a score set. - * Unnecessary for regulatory/other noncoding scoresets which report genomic + * Unnecessary for regulatory/other noncoding targets in score sets which report genomic variations. - * For protein scoresets, identify a matching RefSeq protein reference sequence. + * For protein-coding targets, identify a matching RefSeq protein reference sequence. 
- :param metadata: Scoreset metadata from MaveDB + :param scoreset_urn: MaveDB URN for score set, used for hardcoding for certain score sets + :param target_gene: Target gene metadata from MaveDB :param records: :param align_result: alignment results :return: Transcript description (accession ID, offset, selected sequence, etc) """ - if metadata.urn.startswith("urn:mavedb:00000097"): + if scoreset_urn.startswith("urn:mavedb:00000097"): _logger.info( "Score sets in urn:mavedb:00000097 are already expressed in full HGVS strings -- using predefined results because additional hard-coding is unnecessary" ) @@ -333,16 +333,56 @@ async def select_transcript( start=0, is_full_match=False, transcript_mode=TranscriptPriority.MANE_SELECT, - sequence=_get_protein_sequence(metadata.target_sequence), + sequence=_get_protein_sequence(target_gene.target_sequence), ) - if metadata.target_gene_category != TargetType.PROTEIN_CODING: - _logger.debug("%s is regulatory/noncoding -- skipping transcript selection") + if target_gene.target_gene_category != TargetType.PROTEIN_CODING: + _logger.debug( + "%s is regulatory/noncoding -- skipping transcript selection", + target_gene.target_gene_name, + ) return None - transcript_reference = await _select_protein_reference(metadata, align_result) - if transcript_reference and metadata.target_sequence_type == TargetSequenceType.DNA: - offset = _offset_target_sequence(metadata, records) + transcript_reference = await _select_protein_reference(target_gene, align_result) + if ( + transcript_reference + and target_gene.target_sequence_type == TargetSequenceType.DNA + ): + offset = _offset_target_sequence(target_gene, records) if offset: transcript_reference.start = offset - return _handle_edge_cases(metadata.urn, transcript_reference) + return _handle_edge_cases(scoreset_urn, transcript_reference) + + +async def select_transcripts( + scoreset_metadata: ScoresetMetadata, + records: list[ScoreRow], + align_results: dict[str, AlignmentResult], +) -> 
dict[str, TxSelectResult]: + """Select appropriate human reference sequence for each target in a score set. + :param scoreset_metadata: Metadata for score set from MaveDB API + :param records: Variant/score records from MaveDB API + :param align_results: Alignment results for all targets in score set. + * Dict where keys are target labels and values are alignment result objects + :return: dict where keys are target labels and values are objects describing selected transcript (accession ID, offset, selected sequence, etc) + """ + selected_transcripts = {} + for target_gene in scoreset_metadata.target_genes: + # TODO it isn't efficient to go through records each time. Maybe we should initialize records as a dictionary where variants/scores are organized by target label. + # select only records associated with this target + target_associated_records = [ + record + for record in records + if ( + record.hgvs_pro.startswith(target_gene) + or record.hgvs_nt.startswith(target_gene) + ) + ] + selected_transcripts[target_gene] = await select_transcript( + scoreset_urn=scoreset_metadata.urn, + target_gene=scoreset_metadata.target_genes[target_gene], + records=target_associated_records, + align_result=align_results[target_gene], + ) + + return selected_transcripts From 0b68f5aecbbf621955d595bcddf9c48a30895d1c Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 13 Feb 2025 13:43:22 -0800 Subject: [PATCH 02/23] Make it the default behavior to prefer genomic mappings when available --- src/dcd_mapping/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dcd_mapping/cli.py b/src/dcd_mapping/cli.py index 0db86b3..d5954e3 100644 --- a/src/dcd_mapping/cli.py +++ b/src/dcd_mapping/cli.py @@ -44,7 +44,7 @@ @click.option( "--prefer_genomic", is_flag=True, - default=False, + default=True, help="If mapped variants are available relative to a genomic sequence, only output the genomic mappings", ) def cli( From 4ed6cb9550394ee304a2ebd7e7984d6886354b24 Mon Sep 
17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 13 Feb 2025 13:48:46 -0800 Subject: [PATCH 03/23] Map score sets with multiple targets --- src/dcd_mapping/annotate.py | 178 +++++++++++++++++++-------------- src/dcd_mapping/main.py | 79 +++++++++------ src/dcd_mapping/mavedb_data.py | 53 ++++++++-- src/dcd_mapping/schemas.py | 11 +- src/dcd_mapping/transcripts.py | 29 ++---- src/dcd_mapping/vrs_map.py | 26 ++++- 6 files changed, 229 insertions(+), 147 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index b8f7342..7561f08 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -39,10 +39,12 @@ ScoreAnnotationWithLayer, ScoresetMapping, ScoresetMetadata, + TargetGene, TargetSequenceType, TxSelectResult, VrsVersion, ) +from dcd_mapping.transcripts import TxSelectError _logger = logging.getLogger(__name__) @@ -133,14 +135,15 @@ def _offset_allele_ref_seq(ss: str, start: int, end: int) -> tuple[int, int]: def _get_vrs_ref_allele_seq( - allele: Allele, metadata: ScoresetMetadata, tx_select_results: TxSelectResult | None + allele: Allele, + metadata: TargetGene, + urn: str, + tx_select_results: TxSelectResult | None, ) -> Extension: """Create `vrs_ref_allele_seq` property.""" - start, end = _offset_allele_ref_seq( - metadata.urn, allele.location.start, allele.location.end - ) + start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end) if ( - metadata.urn.startswith( + urn.startswith( ( "urn:mavedb:00000047", "urn:mavedb:00000048", @@ -149,6 +152,7 @@ def _get_vrs_ref_allele_seq( ) ) and tx_select_results is not None + and isinstance(tx_select_results, TxSelectResult) ): seq = tx_select_results.sequence ref = seq[start:end] @@ -241,8 +245,9 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]: def _annotate_allele_mapping( mapped_score: MappedScore, - tx_results: TxSelectResult | None, - metadata: ScoresetMetadata, + tx_results: TxSelectResult | TxSelectError | 
None, + metadata: TargetGene, + urn: str, vrs_version: VrsVersion = VrsVersion.V_2, ) -> ScoreAnnotationWithLayer: """Perform annotations and, if necessary, create VRS 1.3 equivalents for allele mappings.""" @@ -250,7 +255,9 @@ def _annotate_allele_mapping( post_mapped: Allele = mapped_score.post_mapped # get vrs_ref_allele_seq for pre-mapped variants - pre_mapped.extensions = [_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results)] + pre_mapped.extensions = [ + _get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results) + ] if post_mapped: # Determine reference sequence @@ -262,7 +269,7 @@ def _annotate_allele_mapping( if accession.startswith("refseq:"): accession = accession[7:] else: - if tx_results is None: + if tx_results is None or isinstance(tx_results, TxSelectError): raise ValueError # impossible by definition accession = tx_results.np @@ -295,8 +302,9 @@ def _annotate_allele_mapping( def _annotate_haplotype_mapping( mapped_score: MappedScore, - tx_results: TxSelectResult | None, - metadata: ScoresetMetadata, + tx_results: TxSelectResult | TxSelectError | None, + metadata: TargetGene, + urn: str, vrs_version: VrsVersion = VrsVersion.V_2, ) -> ScoreAnnotationWithLayer: """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings.""" @@ -304,7 +312,7 @@ def _annotate_haplotype_mapping( post_mapped: Haplotype = mapped_score.post_mapped # type: ignore # get vrs_ref_allele_seq for pre-mapped variants for allele in pre_mapped.members: - allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, tx_results)] + allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)] if post_mapped: # Determine reference sequence @@ -316,7 +324,7 @@ def _annotate_haplotype_mapping( if accession.startswith("refseq:"): accession = accession[7:] else: - if tx_results is None: + if tx_results is None or isinstance(tx_results, TxSelectError): raise ValueError # impossible by definition accession = tx_results.np @@ -350,8 
+358,9 @@ def _annotate_haplotype_mapping( def annotate( mapped_scores: list[MappedScore], - tx_results: TxSelectResult | None, - metadata: ScoresetMetadata, + tx_results: TxSelectResult | TxSelectError | None, + metadata: TargetGene, + urn: str, vrs_version: VrsVersion = VrsVersion.V_2, ) -> list[ScoreAnnotationWithLayer]: """Given a list of mappings, add additional contextual data: @@ -367,7 +376,8 @@ def annotate( :param vrs_results: in-progress variant mappings :param tx_select_results: transcript selection if available - :param metadata: MaveDB scoreset metadata + :param metadata: Target gene metadata from MaveDB API + :param urn: Score set URN :return: annotated mappings objects """ score_annotations = [] @@ -386,7 +396,7 @@ def annotate( ): score_annotations.append( _annotate_haplotype_mapping( - mapped_score, tx_results, metadata, vrs_version + mapped_score, tx_results, metadata, urn, vrs_version ) ) elif isinstance(mapped_score.pre_mapped, Allele) and ( @@ -395,7 +405,7 @@ def annotate( ): score_annotations.append( _annotate_allele_mapping( - mapped_score, tx_results, metadata, vrs_version + mapped_score, tx_results, metadata, urn, vrs_version ) ) else: @@ -406,20 +416,22 @@ def annotate( def _get_computed_reference_sequence( - metadata: ScoresetMetadata, + metadata: TargetGene, layer: AnnotationLayer, - tx_output: TxSelectResult | None = None, -) -> ComputedReferenceSequence: + tx_output: TxSelectResult | TxSelectError | None = None, +) -> ComputedReferenceSequence | None: """Report the computed reference sequence for a score set - :param ss: A score set string + :param metadata: Target gene metadata from MaveDB API :param layer: AnnotationLayer :param tx_output: Transcript data for a score set :return A ComputedReferenceSequence object """ if layer == AnnotationLayer.PROTEIN: - if tx_output is None: - raise ValueError + if tx_output is None or isinstance(tx_output, TxSelectError): + # TODO catch this error - don't stop whole job for one failed target + # 
raise ValueError + return None seq_id = f"ga4gh:SQ.{sha512t24u(tx_output.sequence.encode('ascii'))}" return ComputedReferenceSequence( sequence=tx_output.sequence, @@ -436,24 +448,31 @@ def _get_computed_reference_sequence( def _get_mapped_reference_sequence( layer: AnnotationLayer, - tx_output: TxSelectResult | None = None, + tx_output: TxSelectResult | TxSelectError | None = None, align_result: AlignmentResult | None = None, -) -> MappedReferenceSequence: +) -> MappedReferenceSequence | None: """Report the mapped reference sequence for a score set - :param ss: A score set string :param layer: AnnotationLayer :param tx_output: Transcript data for a score set :return A MappedReferenceSequence object """ - if layer == AnnotationLayer.PROTEIN and tx_output is not None: + if ( + layer == AnnotationLayer.PROTEIN + and tx_output is not None + and isinstance(tx_output, TxSelectResult) + ): if tx_output.np is None: - msg = "No NP accession associated with reference transcript" - raise ValueError(msg) + # TODO catch this error, don't fail whole job for one target + # msg = "No NP accession associated with reference transcript" + # raise ValueError(msg) + return None vrs_id = get_vrs_id_from_identifier(tx_output.np) if vrs_id is None: - msg = "ID could not be acquired from Seqrepo for transcript identifier" - raise ValueError(msg) + # TODO catch this error, don't fail whole job for one target + # msg = "ID could not be acquired from Seqrepo for transcript identifier" + # raise ValueError(msg) + return None return MappedReferenceSequence( sequence_type=TargetSequenceType.PROTEIN, sequence_id=vrs_id, @@ -462,8 +481,10 @@ def _get_mapped_reference_sequence( seq_id = get_chromosome_identifier(align_result.chrom) vrs_id = get_vrs_id_from_identifier(seq_id) if vrs_id is None: - msg = "ID could not be acquired from Seqrepo for chromosome identifier" - raise ValueError(msg) + # TODO catch this error, don't fail whole job for one target + # msg = "ID could not be acquired from 
Seqrepo for chromosome identifier" + # raise ValueError(msg) + return None return MappedReferenceSequence( sequence_type=TargetSequenceType.DNA, sequence_id=vrs_id, @@ -513,64 +534,69 @@ def write_scoreset_mapping_to_json( def save_mapped_output_json( metadata: ScoresetMetadata, - mappings: list[ScoreAnnotationWithLayer], - align_result: AlignmentResult, - tx_output: TxSelectResult | None, + mappings: dict[str, ScoreAnnotationWithLayer], + align_results: dict[str, AlignmentResult], + tx_output: dict[str, TxSelectResult | TxSelectError | None], preferred_layer_only: bool = False, output_path: Path | None = None, ) -> Path: """Save mapping output for a score set in a JSON file :param urn: Score set accession - :param mave_vrs_mappings: A dictionary of VrsObject1_x objects - :param align_result: Alignment information for a score set - :param tx_output: Transcript output for a score set + :param mappings: A dictionary of annotated VRS mappings for each target + :param align_result: A dictionary of alignment information for each target + :param tx_output: A dictionary of transcript output for each target :param output_path: specific location to save output to. Default to /urn:mavedb:00000XXX-X-X_mapping_.json :return: output location """ - if preferred_layer_only: - preferred_layers = { - _set_scoreset_layer(metadata.urn, mappings), + # set preferred layers for each target, to allow a mix of coding and noncoding targets + # TODO maybe this should be reevaluated and we should only allow one preferred layer per score set, + # since I can't imagine an experimental assay where some variants are assayed as nucleotide variants + # and others are assayed as amino acid variants. 
+ reference_sequences: dict[str, dict] = {} + mapped_scores: list[ScoreAnnotation] = [] + for target_gene in mappings: + if preferred_layer_only: + preferred_layers = { + _set_scoreset_layer(metadata.urn, mappings[target_gene]), + } + else: + preferred_layers = { + mapping.annotation_layer for mapping in mappings[target_gene] + } + + reference_sequences[target_gene] = { + layer: { + "computed_reference_sequence": None, + "mapped_reference_sequence": None, + } + for layer in AnnotationLayer } - else: - preferred_layers = {mapping.annotation_layer for mapping in mappings} - reference_sequences = { - layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None} - for layer in AnnotationLayer - } - - for layer in preferred_layers: - reference_sequences[layer][ - "computed_reference_sequence" - ] = _get_computed_reference_sequence(metadata, layer, tx_output) - reference_sequences[layer][ - "mapped_reference_sequence" - ] = _get_mapped_reference_sequence(layer, tx_output, align_result) + for layer in preferred_layers: + reference_sequences[target_gene][layer][ + "computed_reference_sequence" + ] = _get_computed_reference_sequence( + metadata.target_genes[target_gene], layer, tx_output[target_gene] + ) + reference_sequences[target_gene][layer][ + "mapped_reference_sequence" + ] = _get_mapped_reference_sequence( + layer, tx_output[target_gene], align_results[target_gene] + ) - mapped_scores: list[ScoreAnnotation] = [] - for m in mappings: - if m.pre_mapped is None: - mapped_scores.append(ScoreAnnotation(**m.model_dump())) - elif m.annotation_layer in preferred_layers: - # drop annotation layer from mapping object - mapped_scores.append(ScoreAnnotation(**m.model_dump())) + for m in mappings[target_gene]: + if m.pre_mapped is None: + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + elif m.annotation_layer in preferred_layers: + # drop annotation layer from mapping object + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + # TODO drop any 
Nonetype reference sequences output = ScoresetMapping( metadata=metadata.model_dump(), - computed_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["computed_reference_sequence"], - mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][ - "mapped_reference_sequence" - ], - computed_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["computed_reference_sequence"], - mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][ - "mapped_reference_sequence" - ], + reference_sequences=reference_sequences, mapped_scores=mapped_scores, ) diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index ad711bc..5b253f7 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -33,7 +33,7 @@ ScoresetMetadata, VrsVersion, ) -from dcd_mapping.transcripts import TxSelectError, select_transcripts +from dcd_mapping.transcripts import select_transcripts from dcd_mapping.vrs_map import VrsMapError, vrs_map _logger = logging.getLogger(__name__) @@ -138,7 +138,7 @@ async def _check_data_prereqs(silent: bool) -> None: async def map_scoreset( metadata: ScoresetMetadata, - records: list[ScoreRow], + records: dict[str, list[ScoreRow]], output_path: Path | None = None, vrs_version: VrsVersion = VrsVersion.V_2, prefer_genomic: bool = False, @@ -180,19 +180,11 @@ async def map_scoreset( _emit_info("Selecting reference sequence...", silent) try: transcripts = await select_transcripts(metadata, records, alignment_results) - except (TxSelectError, KeyError, ValueError) as e: - _emit_info( - f"Transcript selection failed for scoreset {metadata.urn}", - silent, - logging.ERROR, - ) - final_output = write_scoreset_mapping_to_json( - metadata.urn, - ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), - output_path, - ) - _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - return + # NOTE: transcript selection errors are handled in 
select_transcripts, + # and they do not cause the entire mapping process to exit; instead, an error will be reported + # on the target level and on the variant level for variants relative to that target + # HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate + # underlying issues with data providers. except HTTPError as e: _emit_info( f"HTTP error occurred during transcript selection: {e}", @@ -210,8 +202,16 @@ async def map_scoreset( _emit_info("Reference selection complete.", silent) _emit_info("Mapping to VRS...", silent) + vrs_results = {} try: - vrs_results = vrs_map(metadata, alignment_results, records, transcripts, silent) + for target_gene in metadata.target_genes: + vrs_results[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=alignment_results[target_gene], + records=records[target_gene], + transcript=transcripts[target_gene], + silent=silent, + ) except VrsMapError as e: _emit_info( f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR @@ -223,7 +223,8 @@ async def map_scoreset( ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return - if vrs_results is None: + # TODO this should be if all values in dict are none. or might not need this at all. 
+ if vrs_results is None or len(vrs_results) == 0: _emit_info(f"No mapping available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, @@ -238,20 +239,32 @@ async def map_scoreset( _emit_info("VRS mapping complete.", silent) _emit_info("Annotating metadata and saving to file...", silent) - try: - vrs_results = annotate(vrs_results, transcripts, metadata, vrs_version) - except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors - _emit_info( - f"VRS annotation failed for scoreset {metadata.urn}", silent, logging.ERROR - ) - final_output = write_scoreset_mapping_to_json( - metadata.urn, - ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), - output_path, - ) - _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - return - if vrs_results is None: + # annotate each target's variants separately, since annotation is target specific (e.g. obtaining reference sequence) + annotated_vrs_results = {} + for target_gene in vrs_results: + try: + annotated_vrs_results[target_gene] = annotate( + vrs_results[target_gene], + transcripts[target_gene], + metadata.target_genes[target_gene], + metadata.urn, + vrs_version, + ) + except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors + _emit_info( + f"VRS annotation failed for scoreset {metadata.urn}", + silent, + logging.ERROR, + ) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return + # TODO this should be if all values in dict are none. or might not need this at all. 
+ if annotated_vrs_results is None: _emit_info(f"No annotation available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, @@ -266,7 +279,7 @@ async def map_scoreset( try: final_output = save_mapped_output_json( metadata, - vrs_results, + annotated_vrs_results, alignment_results, transcripts, prefer_genomic, @@ -306,7 +319,7 @@ async def map_scoreset_urn( """ try: metadata = get_scoreset_metadata(urn, store_path) - records = get_scoreset_records(urn, silent, store_path) + records = get_scoreset_records(metadata, silent, store_path) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 25b0e59..a8dbb5d 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -218,13 +218,18 @@ def get_scoreset_metadata( return ScoresetMetadata(urn=scoreset_urn, target_genes=target_genes) -def _load_scoreset_records(path: Path) -> list[ScoreRow]: +def _load_scoreset_records( + path: Path, metadata: ScoresetMetadata +) -> dict[str, list[ScoreRow]]: """Load scoreset records from CSV file. + Organize scoreset records by reference sequence prefix / target gene label. + If no reference sequence prefix is provided, the score set should only have one + target, so use the one target's label. This method is intentionally identified as "private", but is refactored out for use during testing. 
""" - scores_data: list[ScoreRow] = [] + scores_data: dict[str, list[ScoreRow]] = {} with path.open() as csvfile: reader = csv.DictReader(csvfile) for row in reader: @@ -232,7 +237,33 @@ def _load_scoreset_records(path: Path) -> list[ScoreRow]: row["score"] = None else: row["score"] = row["score"] - scores_data.append(ScoreRow(**row)) + if row["hgvs_nt"] != "NA": + # TODO check assumption of no colon in hgvs unless reference sequence identifier present + prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None + elif row["hgvs_pro"] != "NA": + # TODO check assumption of no colon in hgvs unless reference sequence identifier present + prefix = ( + row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None + ) + else: + # Should we quit the whole mapping job if this comes up, or just skip this row and only quit if none contain hgvs_nt or hgvs_pro? + msg = f"Each score row in {metadata.urn} must contain hgvs_nt or hgvs_pro variant description " + raise ScoresetNotSupportedError(msg) + # If no reference sequence prefix is provided, the score set should only have one + # target, so use the one target's label. + if prefix is None: + if len(metadata.target_genes) == 1: + # don't know name of key, so loop through dictionary + # and use the key of the single-entry metadata.target_genes dict as the key for this new dict + for target_gene in metadata.target_genes: + prefix = target_gene + else: + msg = f"Multi-target score set {metadata.urn} does not contain target sequence labels in variant HGVS strings." 
+ raise ScoresetNotSupportedError(msg) + if prefix in scores_data: + scores_data[prefix].append(ScoreRow(**row)) + else: + scores_data[prefix] = [ScoreRow(**row)] return scores_data @@ -253,8 +284,8 @@ def _get_experiment_53_scores(outfile: Path, silent: bool) -> None: def get_scoreset_records( - urn: str, silent: bool = True, dcd_mapping_dir: Path | None = None -) -> list[ScoreRow]: + metadata: ScoresetMetadata, silent: bool = True, dcd_mapping_dir: Path | None = None +) -> dict[str, list[ScoreRow]]: """Get scoreset records. Only hit the MaveDB API if unavailable locally. That means data must be refreshed @@ -270,13 +301,13 @@ def get_scoreset_records( """ if not dcd_mapping_dir: dcd_mapping_dir = LOCAL_STORE_PATH - scores_csv = dcd_mapping_dir / f"{urn}_scores.csv" + scores_csv = dcd_mapping_dir / f"{metadata.urn}_scores.csv" # TODO use smarter/more flexible caching methods if not scores_csv.exists(): - if urn == "urn:mavedb:00000053-a-1": + if metadata.urn == "urn:mavedb:00000053-a-1": _get_experiment_53_scores(scores_csv, silent) else: - url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}/scores" + url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{metadata.urn}/scores" try: http_download(url, scores_csv, silent) except requests.HTTPError as e: @@ -284,7 +315,7 @@ def get_scoreset_records( _logger.error(msg) raise ResourceAcquisitionError(msg) from e - return _load_scoreset_records(scores_csv) + return _load_scoreset_records(scores_csv, metadata) def with_mavedb_score_set(fn: Callable) -> Callable: @@ -300,8 +331,8 @@ async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002 # without the need to download the data again. 
temp_dir_as_path = Path(temp_dir) try: - get_scoreset_metadata(urn, temp_dir_as_path) - get_scoreset_records(urn, silent, temp_dir_as_path) + metadata = get_scoreset_metadata(urn, temp_dir_as_path) + get_scoreset_records(metadata, silent, temp_dir_as_path) except ScoresetNotSupportedError as e: return ScoresetMapping( metadata=None, diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 6a52946..7c33740 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -203,9 +203,12 @@ class ScoresetMapping(BaseModel): mapped_date_utc: str = Field( default=datetime.datetime.now(tz=datetime.UTC).isoformat() ) - computed_protein_reference_sequence: ComputedReferenceSequence | None = None - mapped_protein_reference_sequence: MappedReferenceSequence | None = None - computed_genomic_reference_sequence: ComputedReferenceSequence | None = None - mapped_genomic_reference_sequence: MappedReferenceSequence | None = None + reference_sequences: dict[ + str, + dict[ + AnnotationLayer, + dict[str, ComputedReferenceSequence | MappedReferenceSequence | None], + ], + ] | None = None mapped_scores: list[ScoreAnnotation] | None = None error_message: str | None = None diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 689fade..5004368 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -170,7 +170,7 @@ async def _select_protein_reference( ref_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id) if not ref_sequence: msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}" - raise ValueError(msg) + raise TxSelectError(msg) nm_accession = None tx_mode = None else: @@ -356,9 +356,9 @@ async def select_transcript( async def select_transcripts( scoreset_metadata: ScoresetMetadata, - records: list[ScoreRow], + records: dict[str, list[ScoreRow]], align_results: dict[str, AlignmentResult], -) -> dict[str, TxSelectResult]: +) -> dict[str, 
TxSelectResult | Exception | None]: """Select appropriate human reference sequence for each target in a score set. :param scoreset_metadata: Metadata for score set from MaveDB API :param records: Variant/score records from MaveDB API @@ -368,21 +368,14 @@ async def select_transcripts( """ selected_transcripts = {} for target_gene in scoreset_metadata.target_genes: - # TODO it isn't efficient to go through records each time. Maybe we should initialize records as a dictionary where variants/scores are organized by target label. - # select only records associated with this target - target_associated_records = [ - record - for record in records - if ( - record.hgvs_pro.startswith(target_gene) - or record.hgvs_nt.startswith(target_gene) + try: + selected_transcripts[target_gene] = await select_transcript( + scoreset_urn=scoreset_metadata.urn, + target_gene=scoreset_metadata.target_genes[target_gene], + records=records[target_gene], + align_result=align_results[target_gene], ) - ] - selected_transcripts[target_gene] = await select_transcript( - scoreset_urn=scoreset_metadata.urn, - target_gene=scoreset_metadata.target_genes[target_gene], - records=target_associated_records, - align_result=align_results[target_gene], - ) + except (TxSelectError, KeyError) as e: + selected_transcripts[target_gene] = e return selected_transcripts diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 4f4a32f..e72b0df 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -33,6 +33,7 @@ TargetType, TxSelectResult, ) +from dcd_mapping.transcripts import TxSelectError __all__ = ["vrs_map", "VrsMapError"] @@ -280,6 +281,13 @@ def _parse_raw_variant_str(raw_description: str) -> list[str]: :param raw_description: A string that may contain a list of variant descriptions or a single variant description :return: A list of HGVS strings """ + # some variant strings follow mavehgvs format, meaning they don't have a reference sequence id and colon preceding the 
c./g./n./p. prefix + # the reference sequence information has previously been parsed for score sets with multiple targets, + # so can discard the reference sequence id and colon if they are present + # NOTE: this will need to be changed for accession-based mapping - cannot discard reference unless it has been previously parsed. + # TODO check assumption of no colon unless reference sequence identifier is supplied! + if ":" in raw_description: + raw_description = raw_description.split(":")[1] if "[" in raw_description: prefix = raw_description[0:2] return [prefix + var for var in set(raw_description[3:-1].split(";"))] @@ -529,14 +537,14 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool: def _map_protein_coding( metadata: ScoresetMetadata, records: list[ScoreRow], - transcript: TxSelectResult, + transcript: TxSelectResult | TxSelectError, align_result: AlignmentResult, ) -> list[MappedScore]: """Perform mapping on protein coding experiment results :param metadata: The metadata for a score set :param records: The list of MAVE variants in a given score set - :param transcript: The transcript data for a score set + :param transcript: The transcript data for a score set, or an error message describing why an expected transcript is missing :param align_results: The alignment data for a score set :return: A list of mappings """ @@ -552,7 +560,15 @@ def _map_protein_coding( variations: list[MappedScore] = [] for row in records: - hgvs_pro_mappings = _map_protein_coding_pro(row, psequence_id, transcript) + if isinstance(transcript, TxSelectError): + # TODO create pre mapped allele + hgvs_pro_mappings = MappedScore( + accession_id=row.accession, + score=row.score, + error_message=str(transcript).strip("'"), + ) + else: + hgvs_pro_mappings = _map_protein_coding_pro(row, psequence_id, transcript) if hgvs_pro_mappings: variations.append(hgvs_pro_mappings) @@ -664,13 +680,13 @@ def vrs_map( metadata: ScoresetMetadata, align_result: AlignmentResult, records: list[ScoreRow], - 
transcript: TxSelectResult | None = None, + transcript: TxSelectResult | TxSelectError | None = None, silent: bool = True, ) -> list[MappedScore] | None: """Given a description of a MAVE scoreset and an aligned transcript, generate the corresponding VRS objects. - :param metadata: salient MAVE scoreset metadata + :param metadata: target gene metadata from MaveDB API :param align_result: output from the sequence alignment process :param records: scoreset records :param transcript: output of transcript selection process From d94d55c31cd09bb3f01c59924901f2ffeebec30f Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Mar 2025 08:52:09 -0700 Subject: [PATCH 04/23] fix: single-target blat result named incorrectly --- src/dcd_mapping/align.py | 62 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 903a530..30a9234 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -180,35 +180,36 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: :return: dict where keys are target gene identifiers and values are BLAT query result objects :raise AlignmentError: if BLAT subprocess returns error code """ - with tempfile.NamedTemporaryFile() as tmp_file: - query_file = _build_query_file(metadata, Path(tmp_file.name)) - target_sequence_type = _get_target_sequence_type(metadata) - if target_sequence_type == TargetSequenceType.PROTEIN: - target_args = "-q=prot -t=dnax" - elif target_sequence_type == TargetSequenceType.DNA: - target_args = "" - else: - # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. - msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." 
- raise NotImplementedError(msg) - process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) - out_file = _write_blat_output_tempfile(process_result) - - try: - output = parse_blat(out_file, "blat-psl") - - # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? - except ValueError: - target_args = "-q=dnax -t=dnax" - process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) - out_file = _write_blat_output_tempfile(process_result) - try: - output = parse_blat(out_file, "blat-psl") - except ValueError as e: - msg = f"Unable to run successful BLAT on {metadata.urn}" - raise AlignmentError(msg) from e - - return output + return parse_blat(f"{metadata.urn}_blat.psl", "blat-psl") + # with tempfile.NamedTemporaryFile() as tmp_file: + # query_file = _build_query_file(metadata, Path(tmp_file.name)) + # target_sequence_type = _get_target_sequence_type(metadata) + # if target_sequence_type == TargetSequenceType.PROTEIN: + # target_args = "-q=prot -t=dnax" + # elif target_sequence_type == TargetSequenceType.DNA: + # target_args = "" + # else: + # # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. + # msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." + # raise NotImplementedError(msg) + # process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + # out_file = _write_blat_output_tempfile(process_result) + + # try: + # output = parse_blat(out_file, "blat-psl") + + # # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? 
+ # except ValueError: + # target_args = "-q=dnax -t=dnax" + # process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + # out_file = _write_blat_output_tempfile(process_result) + # try: + # output = parse_blat(out_file, "blat-psl") + # except ValueError as e: + # msg = f"Unable to run successful BLAT on {metadata.urn}" + # raise AlignmentError(msg) from e + + # return output def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit: @@ -335,6 +336,9 @@ def align( alignment_results = {} for blat_result in blat_output: target_label = blat_result.id + # blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets + if target_label == "query" and len(scoreset_metadata.target_genes) == 1: + target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 target_gene = scoreset_metadata.target_genes[target_label] alignment_results[target_label] = _get_best_match(blat_result, target_gene) return alignment_results From bfcbcbaf39068dc6f0da104e91887a5cfb176f7c Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Mar 2025 08:52:36 -0700 Subject: [PATCH 05/23] Remove blank reference sequence entries from output --- src/dcd_mapping/annotate.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 7561f08..eafa05a 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -571,7 +571,7 @@ def save_mapped_output_json( "computed_reference_sequence": None, "mapped_reference_sequence": None, } - for layer in AnnotationLayer + for layer in preferred_layers } for layer in preferred_layers: @@ -593,7 +593,19 @@ def save_mapped_output_json( # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) - # TODO drop any Nonetype reference sequences + # drop Nonetype reference sequences + for target_gene in 
reference_sequences: + for layer in list(reference_sequences[target_gene].keys()): + if ( + reference_sequences[target_gene][layer]["mapped_reference_sequence"] + is None + and reference_sequences[target_gene][layer][ + "computed_reference_sequence" + ] + is None + ) or layer is None: + del reference_sequences[target_gene][layer] + output = ScoresetMapping( metadata=metadata.model_dump(), reference_sequences=reference_sequences, From 20e3aa2390c2f0353d82d150668aab771740ba23 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Mar 2025 14:09:01 -0700 Subject: [PATCH 06/23] Fix type annotations for target gene metadata param --- src/dcd_mapping/vrs_map.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index e72b0df..67b44c0 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -28,7 +28,7 @@ AlignmentResult, MappedScore, ScoreRow, - ScoresetMetadata, + TargetGene, TargetSequenceType, TargetType, TxSelectResult, @@ -535,14 +535,14 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool: def _map_protein_coding( - metadata: ScoresetMetadata, + metadata: TargetGene, records: list[ScoreRow], transcript: TxSelectResult | TxSelectError, align_result: AlignmentResult, ) -> list[MappedScore]: """Perform mapping on protein coding experiment results - :param metadata: The metadata for a score set + :param metadata: Target gene metadata from MaveDB API :param records: The list of MAVE variants in a given score set :param transcript: The transcript data for a score set, or an error message describing why an expected transcript is missing :param align_results: The alignment data for a score set @@ -582,13 +582,13 @@ def _map_protein_coding( def _map_regulatory_noncoding( - metadata: ScoresetMetadata, + metadata: TargetGene, records: list[ScoreRow], align_result: AlignmentResult, ) -> list[MappedScore]: """Perform mapping on noncoding/regulatory experiment results - :param 
metadata: metadata for URN + :param metadata: Target gene metadata from MaveDB API :param records: list of MAVE experiment result rows :param align_result: An AlignmentResult object for a score set :return: A list of VRS mappings @@ -677,7 +677,7 @@ def _construct_vrs_allele( def vrs_map( - metadata: ScoresetMetadata, + metadata: TargetGene, align_result: AlignmentResult, records: list[ScoreRow], transcript: TxSelectResult | TxSelectError | None = None, From 8fb8e0bfcec67942d78162d4f4c91333a2e550a3 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 8 Apr 2025 11:23:31 -0700 Subject: [PATCH 07/23] Accession-based mapping and add cdot to environment --- Dockerfile | 6 + pyproject.toml | 1 + src/dcd_mapping/align.py | 166 +++++++++++++++---- src/dcd_mapping/annotate.py | 26 ++- src/dcd_mapping/lookup.py | 36 +++++ src/dcd_mapping/main.py | 14 +- src/dcd_mapping/mavedb_data.py | 74 +++++---- src/dcd_mapping/schemas.py | 12 +- src/dcd_mapping/transcripts.py | 38 +++-- src/dcd_mapping/vrs_map.py | 287 ++++++++++++++++++++++++++------- 10 files changed, 522 insertions(+), 138 deletions(-) diff --git a/Dockerfile b/Dockerfile index 363ee58..a38827f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,6 +37,12 @@ RUN curl -L https://github.com/samtools/htslib/releases/download/${htsversion}/h curl -L https://github.com/samtools/bcftools/releases/download/${htsversion}/bcftools-${htsversion}.tar.bz2 | tar xj && \ (cd bcftools-${htsversion} && ./configure --enable-libgsl --enable-perl-filters --with-htslib=system && make install) +# Fetch and index GRCh37 and GRCh38 assemblies for cdot +RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.25_GRCh37.p13_genomic.fna.gz +RUN wget -O - 
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.39_GRCh38.p13_genomic.fna.gz +RUN samtools faidx GCF_000001405.25_GRCh37.p13_genomic.fna.gz +RUN samtools faidx GCF_000001405.39_GRCh38.p13_genomic.fna.gz + RUN mkdir /usr/src/app WORKDIR /usr/src/app COPY . . diff --git a/pyproject.toml b/pyproject.toml index 331d3dc..e7568e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "requests", "biopython", "tqdm", + "cdot", "click", "cool-seq-tool==0.4.0.dev3", "ga4gh.vrs==2.0.0-a6", diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 30a9234..16b2cf5 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -14,9 +14,7 @@ from cool_seq_tool.schemas import Strand from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location -from dcd_mapping.mavedb_data import ( - LOCAL_STORE_PATH, -) +from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError from dcd_mapping.resource_utils import ( ResourceAcquisitionError, http_download, @@ -180,36 +178,35 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: :return: dict where keys are target gene identifiers and values are BLAT query result objects :raise AlignmentError: if BLAT subprocess returns error code """ - return parse_blat(f"{metadata.urn}_blat.psl", "blat-psl") - # with tempfile.NamedTemporaryFile() as tmp_file: - # query_file = _build_query_file(metadata, Path(tmp_file.name)) - # target_sequence_type = _get_target_sequence_type(metadata) - # if target_sequence_type == TargetSequenceType.PROTEIN: - # target_args = "-q=prot -t=dnax" - # elif target_sequence_type == TargetSequenceType.DNA: - # target_args = "" - # else: - # # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of 
arguments. - # msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." - # raise NotImplementedError(msg) - # process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) - # out_file = _write_blat_output_tempfile(process_result) - - # try: - # output = parse_blat(out_file, "blat-psl") - - # # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? - # except ValueError: - # target_args = "-q=dnax -t=dnax" - # process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) - # out_file = _write_blat_output_tempfile(process_result) - # try: - # output = parse_blat(out_file, "blat-psl") - # except ValueError as e: - # msg = f"Unable to run successful BLAT on {metadata.urn}" - # raise AlignmentError(msg) from e - - # return output + with tempfile.NamedTemporaryFile() as tmp_file: + query_file = _build_query_file(metadata, Path(tmp_file.name)) + target_sequence_type = _get_target_sequence_type(metadata) + if target_sequence_type == TargetSequenceType.PROTEIN: + target_args = "-q=prot -t=dnax" + elif target_sequence_type == TargetSequenceType.DNA: + target_args = "" + else: + # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. + msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." + raise NotImplementedError(msg) + process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + out_file = _write_blat_output_tempfile(process_result) + + try: + output = parse_blat(out_file, "blat-psl") + + # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? 
+ except ValueError: + target_args = "-q=dnax -t=dnax" + process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + out_file = _write_blat_output_tempfile(process_result) + try: + output = parse_blat(out_file, "blat-psl") + except ValueError as e: + msg = f"Unable to run successful BLAT on {metadata.urn}" + raise AlignmentError(msg) from e + + return output def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit: @@ -342,3 +339,106 @@ def align( target_gene = scoreset_metadata.target_genes[target_label] alignment_results[target_label] = _get_best_match(blat_result, target_gene) return alignment_results + + +def fetch_alignment( + metadata: ScoresetMetadata, silent: bool +) -> dict[str, AlignmentResult | None]: + alignment_results = {} + for target_gene in metadata.target_genes: + accession_id = metadata.target_genes[target_gene].target_accession_id + # protein and contig/chromosome accession ids do not need to be aligned to the genome + if accession_id.startswith(("NP", "ENSP", "NC_")): + alignment_results[accession_id] = None + else: + url = f"https://cdot.cc/transcript/{accession_id}" + r = requests.get(url, timeout=30) + + try: + r.raise_for_status() + except requests.HTTPError as e: + msg = f"Received HTTPError from {url} for scoreset {metadata.urn}" + _logger.error(msg) + raise ResourceAcquisitionError(msg) from e + + cdot_mapping = r.json() + alignment_results[accession_id] = parse_cdot_mapping(cdot_mapping, silent) + return alignment_results + + +def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult: + # blat psl & AlignmentResult: 0-based, start inclusive, stop exclusive + # cdot: 1-based, start inclusive, stop inclusive + # so, to "translate" cdot ranges to AlignmentResult-style ranges: + # subtract 1 from start and end to go from 1-based to 0-based coord, + # and then add 1 to the stop to go from inclusive to exclusive + # so just subtract 1 from start and do nothing to end + + grch38 = 
cdot_mapping.get("genome_builds", {}).get("GRCh38") + grch37 = cdot_mapping.get("genome_builds", {}).get("GRCh37") + mapping = grch38 if grch38 else grch37 + if mapping is None: + msg = f"Cdot transcript results for transcript {cdot_mapping.get('id')} do not include GRCh37 or GRCh38 mapping" + raise AlignmentError(msg) + + chrom = mapping["contig"] + strand = Strand.POSITIVE if mapping["strand"] == "+" else Strand.NEGATIVE + query_subranges = [] + hit_subranges = [] + for exon in mapping["exons"]: + query_subranges.append(SequenceRange(start=exon[3] - 1, end=exon[4])) + hit_subranges.append(SequenceRange(start=exon[0] - 1, end=exon[1])) + + if strand == Strand.POSITIVE: + query_range = SequenceRange( + start=query_subranges[0].start, end=query_subranges[-1].end + ) + hit_range = SequenceRange( + start=hit_subranges[0].start, end=hit_subranges[-1].end + ) + else: + query_range = SequenceRange( + start=query_subranges[-1].start, end=query_subranges[0].end + ) + hit_range = SequenceRange( + start=hit_subranges[-1].start, end=hit_subranges[0].end + ) + + return AlignmentResult( + chrom=chrom, + strand=strand, + query_range=query_range, + query_subranges=query_subranges, + hit_range=hit_range, + hit_subranges=hit_subranges, + ) + + +def build_alignment_result( + metadata: ScoresetMetadata, silent: bool +) -> dict[str, AlignmentResult | None]: + # NOTE: Score set must contain all accession-based target genes or all sequence-based target genes + # This decision was made because it is most efficient to run BLAT all together, so the alignment function + # works on an entire score set rather than per target gene. + # However, if the need arises, we can allow both types of target genes in a score set. 
+ + # determine whether score set is accession-based or sequence-based + score_set_type = None + for target_gene in metadata.target_genes: + if metadata.target_genes[target_gene].target_accession_id: + if score_set_type == "sequence": + msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported." + raise ScoresetNotSupportedError(msg) + score_set_type = "accession" + else: + if score_set_type == "accession": + msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported." + raise ScoresetNotSupportedError(msg) + score_set_type = "sequence" + + if score_set_type == "sequence": + alignment_result = align(metadata, silent) + else: + alignment_result = fetch_alignment(metadata, silent) + + return alignment_result diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index eafa05a..3c47a8b 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -419,14 +419,36 @@ def _get_computed_reference_sequence( metadata: TargetGene, layer: AnnotationLayer, tx_output: TxSelectResult | TxSelectError | None = None, -) -> ComputedReferenceSequence | None: +) -> ComputedReferenceSequence | MappedReferenceSequence | None: """Report the computed reference sequence for a score set :param metadata: Target gene metadata from MaveDB API :param layer: AnnotationLayer :param tx_output: Transcript data for a score set - :return A ComputedReferenceSequence object + :return A ComputedReferenceSequence object, + or if the target gene is accession-based, a mapped reference sequence describing the pre-mapped reference """ + # accession-based target genes always use accession id as pre-mapped reference sequence + if metadata.target_accession_id: + seq_id = get_vrs_id_from_identifier(metadata.target_accession_id) + # use MappedReferenceSequence type because there should be an accession id but no sequence. 
+ # for accession-based target genes, the object returned by this function describes the provided reference accession + # whereas the object returned by _get_mapped_reference_sequence describes the mapped reference accession, which could be a chromosome for ex. + seq_type: TargetSequenceType + # TODO full list of protein accession id prefixes + if metadata.target_accession_id.startswith(("NP", "ENSP")): + seq_type = TargetSequenceType.PROTEIN + # TODO full list of transcript and contig accession id prefixes + elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")): + seq_type = TargetSequenceType.DNA + else: + msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}" + raise ValueError(msg) + return MappedReferenceSequence( + sequence_type=seq_type, + sequence_id=seq_id, + sequence_accessions=[metadata.target_accession_id], + ) if layer == AnnotationLayer.PROTEIN: if tx_output is None or isinstance(tx_output, TxSelectError): # TODO catch this error - don't stop whole job for one failed target diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index 98db726..bf53f48 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -12,10 +12,12 @@ import os from pathlib import Path +import hgvs import polars as pl import requests from biocommons.seqrepo import SeqRepo from biocommons.seqrepo.seqaliasdb.seqaliasdb import sqlite3 +from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider from cool_seq_tool.app import ( LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, @@ -42,6 +44,7 @@ ) from ga4gh.vrs.dataproxy import SeqRepoDataProxy, coerce_namespace from ga4gh.vrs.extras.translator import AlleleTranslator +from ga4gh.vrs.utils.hgvs_tools import HgvsTools from gene.database import create_db from gene.query import QueryHandler from gene.schemas import MatchType, SourceName @@ -70,6 +73,23 @@ ] _logger = logging.getLogger(__name__) +# ---------------------------------- Cdot 
---------------------------------- # + + +GENOMIC_FASTA_FILES = [ + "/home/.local/share/dcd_mapping/GCF_000001405.39_GRCh38.p13_genomic.fna.gz", + "/home/.local/share/dcd_mapping/GCF_000001405.25_GRCh37.p13_genomic.fna.gz", +] + + +def seqfetcher() -> ChainedSeqFetcher: + return ChainedSeqFetcher(*[FastaSeqFetcher(file) for file in GENOMIC_FASTA_FILES]) + + +def cdot_rest() -> RESTDataProvider: + return RESTDataProvider(seqfetcher=seqfetcher()) + + # ---------------------------------- Global ---------------------------------- # @@ -180,6 +200,15 @@ def __new__(cls) -> QueryHandler: return cls.instance +def init_hgvs_tools(self, data_proxy=None): # noqa: ANN202, ANN001 + """Initialize HgvsTools with cdot as data provider""" + self.parser = hgvs.parser.Parser() + self.data_proxy = data_proxy + cdot_provider = cdot_rest() + self.normalizer = hgvs.normalizer.Normalizer(cdot_provider, validate=True) + self.variant_mapper = hgvs.variantmapper.VariantMapper(cdot_provider) + + class TranslatorBuilder: """Singleton constructor for VRS Translator instance.""" @@ -190,6 +219,8 @@ def __new__(cls, data_proxy: SeqRepoDataProxy) -> AlleleTranslator: :return: singleton instance of ``AlleleTranslator`` """ if not hasattr(cls, "instance"): + # monkey patch to use cdot instead of UTA as HgvsTools data provider + HgvsTools.__init__ = init_hgvs_tools tr = AlleleTranslator(data_proxy) cls.instance = tr else: @@ -430,6 +461,11 @@ def get_chromosome_identifier(chromosome: str) -> str: :return: latest ID if available :raise KeyError: if unable to retrieve identifier """ + # target sequence alignment references are chromosome names like ``"8"``, ``"X"`` + # but accession alignment information from cdot has reference accessions, beginning with "NC_" + # for "NC_" identifiers, just return the identifier + if chromosome.startswith("NC_"): + return chromosome if not chromosome.startswith("chr"): chromosome = f"chr{chromosome}" sr = get_seqrepo() diff --git a/src/dcd_mapping/main.py 
b/src/dcd_mapping/main.py index 5b253f7..6da9c8b 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -8,7 +8,7 @@ import click from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, align +from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result from dcd_mapping.annotate import ( annotate, save_mapped_output_json, @@ -156,7 +156,8 @@ async def map_scoreset( _emit_info(f"Performing alignment for {metadata.urn}...", silent) try: - alignment_results = align(metadata, silent) + # dictionary where keys are target gene labels or accession ids, and values are alignment result objects + alignment_results = build_alignment_result(metadata, silent) except BlatNotFoundError as e: msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." _emit_info(msg, silent, logging.ERROR) @@ -175,6 +176,15 @@ async def map_scoreset( ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return + except ScoresetNotSupportedError as e: + _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return _emit_info("Alignment complete.", silent) _emit_info("Selecting reference sequence...", silent) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index a8dbb5d..532f400 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -70,20 +70,24 @@ def get_scoreset_urns() -> set[str]: def _metadata_response_is_human(json_response: dict) -> bool: - """Check that response from scoreset metadata API refers to a human genome target. 
- + """Check that response from scoreset metadata API refers to a score set containing only human genome targets. :param json_response: response from scoreset metadata API :return: True if contains a target tagged as ``"Homo sapiens"`` """ for target_gene in json_response.get("targetGenes", []): + # for now, assume that genomic coordinate-based score sets are always human, + # since users are not allowed to upload non-human coordinate-based score sets + if target_gene.get("targetAccession"): + continue + organism = ( target_gene.get("targetSequence", {}) .get("taxonomy", {}) .get("organismName") ) - if organism == "Homo sapiens": - return True - return False + if organism != "Homo sapiens": + return False + return True def get_human_urns() -> list[str]: @@ -185,31 +189,40 @@ def get_scoreset_metadata( multi_target = len(metadata["targetGenes"]) > 1 for gene in metadata["targetGenes"]: - target_sequence_gene = gene.get("targetSequence") - if target_sequence_gene is None: - msg = f"No target sequence available for {scoreset_urn}. Accession-based score sets are not currently supported." - raise ScoresetNotSupportedError(msg) if not _metadata_response_is_human(metadata): - msg = f"Experiment for {scoreset_urn} contains no human targets" + # TODO allow score sets with mix of human and non-human targets? This may not come up, but is doable with a little restructuring. + msg = f"Experiment for {scoreset_urn} contains non-human targets" raise ScoresetNotSupportedError(msg) try: - target_sequence_label = gene["targetSequence"].get("label") - if target_sequence_label is None: - # if there are not multiple targets, label is not required by mavedb, - # so use target gene name as the label. - if not multi_target: - target_sequence_label = gene["name"] - else: - msg = f"No target label provided for target in multi-target score set {scoreset_urn}." 
- raise DataLookupError(msg) - target_genes[target_sequence_label] = TargetGene( - target_gene_name=gene["name"], - target_gene_category=gene["category"], - target_sequence=gene["targetSequence"]["sequence"], - target_sequence_type=gene["targetSequence"]["sequenceType"], - target_sequence_label=target_sequence_label, - target_uniprot_ref=_get_uniprot_ref(metadata), - ) + target_gene_sequence = gene.get("targetSequence") + target_gene_accession = gene.get("targetAccession") + + if target_gene_sequence: + target_sequence_label = target_gene_sequence.get("label") + if target_sequence_label is None: + # if there are not multiple targets, label is not required by mavedb, + # so use target gene name as the label. + if not multi_target: + target_sequence_label = gene["name"] + else: + msg = f"No target label provided for target in multi-target score set {scoreset_urn}." + raise DataLookupError(msg) + target_genes[target_sequence_label] = TargetGene( + target_gene_name=gene["name"], + target_gene_category=gene["category"], + target_sequence=target_gene_sequence["sequence"], + target_sequence_type=target_gene_sequence["sequenceType"], + target_sequence_label=target_sequence_label, + target_uniprot_ref=_get_uniprot_ref(metadata), + ) + elif target_gene_accession: + target_accession_id = target_gene_accession["accession"] + target_genes[target_accession_id] = TargetGene( + target_gene_name=gene["name"], + target_gene_category=gene["category"], + target_accession_id=target_accession_id, + target_accession_assembly=target_gene_accession["assembly"], + ) except (KeyError, ValidationError) as e: msg = f"Unable to extract metadata from API response for scoreset {scoreset_urn}: {e}" _logger.error(msg) @@ -253,12 +266,9 @@ def _load_scoreset_records( # target, so use the one target's label. 
if prefix is None: if len(metadata.target_genes) == 1: - # don't know name of key, so loop through dictionary - # and use the key of the single-entry metadata.target_genes dict as the key for this new dict - for target_gene in metadata.target_genes: - prefix = target_gene + prefix = list(metadata.target_genes.keys())[0] # noqa: RUF015 else: - msg = f"Multi-target score set {metadata.urn} does not contain target sequence labels in variant HGVS strings." + msg = f"Score set {metadata.urn} contains one or more variant HGVS strings without a reference sequence label. All variant HGVS strings must contain a reference sequence label or accession ID unless the score set contains a single target sequence." raise ScoresetNotSupportedError(msg) if prefix in scores_data: scores_data[prefix].append(ScoreRow(**row)) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 7c33740..44dcc3a 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -45,10 +45,12 @@ class TargetGene(BaseModel): target_gene_name: str target_gene_category: TargetType - target_sequence: str - target_sequence_type: TargetSequenceType - target_sequence_label: str + target_sequence: str | None = None + target_sequence_type: TargetSequenceType | None = None + target_sequence_label: str | None = None target_uniprot_ref: UniProtRef | None = None + target_accession_id: str | None = None + target_accession_assembly: str | None = None class ScoresetMetadata(BaseModel): @@ -106,8 +108,8 @@ class AlignmentResult(BaseModel): chrom: str strand: Strand - coverage: float - ident_pct: float + coverage: float | None = None + ident_pct: float | None = None query_range: SequenceRange query_subranges: list[SequenceRange] hit_range: SequenceRange diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 5004368..9034894 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -357,7 +357,7 @@ async def select_transcript( async def 
select_transcripts( scoreset_metadata: ScoresetMetadata, records: dict[str, list[ScoreRow]], - align_results: dict[str, AlignmentResult], + align_results: dict[str, AlignmentResult | None], ) -> dict[str, TxSelectResult | Exception | None]: """Select appropriate human reference sequence for each target in a score set. :param scoreset_metadata: Metadata for score set from MaveDB API @@ -368,14 +368,32 @@ async def select_transcripts( """ selected_transcripts = {} for target_gene in scoreset_metadata.target_genes: - try: - selected_transcripts[target_gene] = await select_transcript( - scoreset_urn=scoreset_metadata.urn, - target_gene=scoreset_metadata.target_genes[target_gene], - records=records[target_gene], - align_result=align_results[target_gene], - ) - except (TxSelectError, KeyError) as e: - selected_transcripts[target_gene] = e + if scoreset_metadata.target_genes[target_gene].target_accession_id: + # for accession-based targets, create tx select objects for protein sequence accessions only + accession_id = scoreset_metadata.target_genes[ + target_gene + ].target_accession_id + # TODO create full list of possible protein accession prefixes + if accession_id.startswith(("NP_", "ENSP_")): + # TODO make sequence field optional instead of leaving blank here? 
+ selected_transcripts[target_gene] = TxSelectResult( + np=accession_id, + start=0, + is_full_match=True, + sequence="", + transcript_mode=None, + ) + else: + selected_transcripts[target_gene] = None + else: + try: + selected_transcripts[target_gene] = await select_transcript( + scoreset_urn=scoreset_metadata.urn, + target_gene=scoreset_metadata.target_genes[target_gene], + records=records[target_gene], + align_result=align_results[target_gene], + ) + except (TxSelectError, KeyError) as e: + selected_transcripts[target_gene] = e return selected_transcripts diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 67b44c0..fa7d4e6 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -5,6 +5,7 @@ from itertools import cycle from Bio.Seq import Seq +from bioutils.accessions import infer_namespace from cool_seq_tool.schemas import AnnotationLayer, Strand from ga4gh.core import ga4gh_identify, sha512t24u from ga4gh.vrs._internal.models import ( @@ -20,6 +21,7 @@ from mavehgvs.variant import Variant from dcd_mapping.lookup import ( + cdot_rest, get_chromosome_identifier, get_seqrepo, translate_hgvs_to_vrs, @@ -68,11 +70,26 @@ def _process_any_aa_code(hgvs_pro_string: str) -> str: return hgvs_pro_string +def is_intronic_variant(variant: Variant) -> bool: + """Return True if given Variant is intronic, otherwise return False. + Supports single or multi-position variants. 
+ """ + if isinstance(variant.positions, Iterable): + if any(position.is_intronic() for position in variant.positions): + return True + else: + if variant.positions.is_intronic(): + return True + + return False + + def _create_pre_mapped_hgvs_strings( raw_description: str, layer: AnnotationLayer, tx: TxSelectResult | None = None, alignment: AlignmentResult | None = None, + accession_id: str | None = None, ) -> list[str]: """Generate a list of (pre-mapped) HGVS strings from one long string containing many valid HGVS substrings @@ -85,13 +102,14 @@ def _create_pre_mapped_hgvs_strings( :param layer: An enum denoting the targeted annotation layer of these HGVS strings :param tx: A TxSelectResult object defining the transcript we are mapping to (or None). :param alignment: An AlignmentResult object defining the alignment we are mapping to (or None). + :param accession_id: An accession id describing the reference sequence (for accession-based target gene variants) :return: A list of HGVS strings prior to being mapped to the `tx` or `alignment` """ if layer is AnnotationLayer.PROTEIN and tx is None: msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)." raise ValueError(msg) - if layer is AnnotationLayer.GENOMIC and alignment is None: - msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)." + if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None: + msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}`)." 
raise ValueError(msg)
 
     raw_variant_strings = _parse_raw_variant_str(raw_description)
@@ -103,10 +121,18 @@
             msg = f"Variant could not be parsed by mavehgvs: {error}"
             raise ValueError(msg)
 
+        # ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called
+        # therefore skip them here
+        if is_intronic_variant(variant):
+            msg = f"Variant is intronic and cannot be processed: {variant}"
+            raise ValueError(msg)
+
+        if accession_id:
+            hgvs_strings.append(accession_id + ":" + str(variant))
         # Ideally we would create an HGVS string namespaced to GA4GH. The line below
         # creates such a string, but it is not able to be parsed by the GA4GH VRS translator.
         # hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant))
-        if layer is AnnotationLayer.PROTEIN:
+        elif layer is AnnotationLayer.PROTEIN:
             assert tx  # noqa: S101. mypy help
             hgvs_strings.append(tx.np + ":" + str(variant))
         elif layer is AnnotationLayer.GENOMIC:
@@ -157,6 +183,12 @@
             msg = f"Variant could not be parsed by mavehgvs: {error}"
             raise ValueError(msg)
 
+        # ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called
+        # therefore skip them here
+        if is_intronic_variant(variant):
+            msg = f"Variant is intronic and cannot be processed: {variant}"
+            raise ValueError(msg)
+
         if layer is AnnotationLayer.PROTEIN:
             assert tx  # noqa: S101. mypy help
 
@@ -284,7 +316,6 @@
     # some variant strings follow mavehgvs format, meaning they don't have a reference sequence id and colon preceding the c./g./n./p.
prefix # the reference sequence information has previously been parsed for score sets with multiple targets, # so can discard the reference sequence id and colon if they are present - # NOTE: this will need to be changed for accession-based mapping - cannot discard reference unless it has been previously parsed. # TODO check assumption of no colon unless reference sequence identifier is supplied! if ":" in raw_description: raw_description = raw_description.split(":")[1] @@ -405,7 +436,7 @@ def _map_protein_coding_pro( def _map_genomic( row: ScoreRow, sequence_id: str, - align_result: AlignmentResult, + align_result: AlignmentResult | None, ) -> MappedScore: """Construct VRS object mapping for ``hgvs_nt`` variant column entry @@ -416,6 +447,15 @@ def _map_genomic( :param align_result: The transcript selection information for a score set :return: VRS mapping object if mapping succeeds """ + namespace = infer_namespace(sequence_id) + if namespace is None: + if sequence_id.startswith("SQ."): + # if the sequence id starts with SQ, it is a target sequence which is in the ga4gh namespace + namespace = "ga4gh" + else: + msg = f"Namespace could not be inferred from sequence: {sequence_id}" + raise ValueError(msg) + if ( row.hgvs_nt in {"_wt", "_sy", "="} or "fs" @@ -431,55 +471,146 @@ def _map_genomic( error_message=f"Can't process variant syntax {row.hgvs_nt}", ) - try: - pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) - pre_mapped_genomic = _construct_vrs_allele( - pre_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - sequence_id, - True, - ) - except Exception as e: - _logger.warning( - "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", - row.hgvs_nt, - row.accession, - str(e), - ) - return MappedScore( - accession_id=row.accession, score=row.score, error_message=str(e) - ) + if align_result is None: + # for contig accession based score sets, no 
mapping is performed, + # so pre- and post-mapped alleles are the same + try: + pre_mapped_hgvs_strings = ( + post_mapped_hgvs_strings + ) = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + accession_id=sequence_id, + ) + # accession-based pre-mapped alleles should be constructed like post-mapped alleles (sequence id is gathered from hgvs string rather than manually provided) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) - try: - post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) - post_mapped_genomic = _construct_vrs_allele( - post_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - None, - False, - ) - except Exception as e: - _logger.warning( - "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", - row.hgvs_nt, - row.accession, - str(e), - ) - return MappedScore( - accession_id=row.accession, - score=row.score, - annotation_layer=AnnotationLayer.GENOMIC, - pre_mapped=pre_mapped_genomic, - error_message=str(e), - ) + elif namespace.lower() in ("refseq", "ncbi", "ensembl"): + # nm/enst way + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + accession_id=sequence_id, + ) + # accession-based pre-mapped alleles should be constructed like post-mapped alleles (sequence id is gathered from hgvs string rather than manually provided) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + 
AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.GENOMIC, + pre_mapped=pre_mapped_genomic, + error_message=str(e), + ) + elif namespace.lower() == "ga4gh": + # target seq way + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + sequence_id, + True, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) + + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + 
return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.GENOMIC, + pre_mapped=pre_mapped_genomic, + error_message=str(e), + ) + else: + msg = f"Reference sequence namespace not supported: {namespace}" + raise ValueError(msg) return MappedScore( accession_id=row.accession, @@ -603,6 +734,52 @@ def _map_regulatory_noncoding( return variations +def store_accession( + accession_id: str, +) -> None: + namespace = infer_namespace(accession_id) + alias_dict_list = [{"namespace": namespace, "alias": accession_id}] + cd = cdot_rest() + sequence = cd.get_seq(accession_id) + sr = get_seqrepo() + sr.sr.store(sequence, alias_dict_list) + + +def _map_accession( + metadata: TargetGene, + records: list[ScoreRow], + align_result: AlignmentResult + | None, # NP and NC accessions won't have alignment results + transcript: TxSelectResult | None, +) -> list[MappedScore]: + variations: list[MappedScore] = [] + sequence_id = metadata.target_accession_id + if sequence_id is None: + raise ValueError + + store_accession(sequence_id) + + # TODO full list of protein accession id prefixes + if metadata.target_accession_id.startswith(("NP", "ENSP")): + for row in records: + hgvs_pro_mappings = _map_protein_coding_pro( + row, + sequence_id, + transcript, + ) + variations.append(hgvs_pro_mappings) + # TODO full list of transcript and contig accession id prefixes + elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")): + for row in records: + hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result) + variations.append(hgvs_nt_mappings) + else: + msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}" + raise ValueError(msg) + + return variations + + def _rle_to_lse( rle: ReferenceLengthExpression, location: SequenceLocation ) -> LiteralSequenceExpression: @@ -678,7 +855,7 @@ def _construct_vrs_allele( def vrs_map( metadata: TargetGene, - align_result: AlignmentResult, + align_result: AlignmentResult 
| None, records: list[ScoreRow], transcript: TxSelectResult | TxSelectError | None = None, silent: bool = True, @@ -693,6 +870,8 @@ def vrs_map( :param silent: If true, suppress console output :return: A list of mapping results """ + if metadata.target_accession_id: + return _map_accession(metadata, records, align_result, transcript) if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript: return _map_protein_coding( metadata, From 25f43327479650606fa6029dd3203df97bd74fec Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 9 Apr 2025 11:15:09 -0700 Subject: [PATCH 08/23] API support for accession-based mapping Note that this is mostly structured to handle multi-target mapping, but does not contain all changes required for multi-target mapping (specifically final output / reference sequence structure). Such changes would require corresponding changes in mavedb-api which we are not prepared to deploy yet. --- src/api/routers/map.py | 64 +++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 2f34def..c61fb77 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -6,7 +6,7 @@ from fastapi.responses import JSONResponse from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, align +from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result from dcd_mapping.annotate import ( _get_computed_reference_sequence, _get_mapped_reference_sequence, @@ -23,7 +23,7 @@ ) from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion -from dcd_mapping.transcripts import TxSelectError, select_transcript +from dcd_mapping.transcripts import select_transcripts from dcd_mapping.vrs_map import VrsMapError, vrs_map router = APIRouter( @@ -37,9 +37,7 @@ async def map_scoreset(urn: str, store_path: Path 
| None = None) -> ScoresetMapp """Perform end-to-end mapping for a scoreset. :param urn: identifier for a scoreset. - :param output_path: optional path to save output at - :param vrs_version: version of VRS objects to output (1.3 or 2) - :param silent: if True, suppress console information output + :param store_path: optional path to save output at """ try: metadata = get_scoreset_metadata(urn, store_path) @@ -62,7 +60,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp ) try: - alignment_result = align(metadata, True) + alignment_results = build_alignment_result(metadata, True) except BlatNotFoundError as e: msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." raise HTTPException(status_code=500, detail=msg) from e @@ -75,15 +73,20 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - - try: - transcript = await select_transcript(metadata, records, alignment_result) - except (TxSelectError, KeyError, ValueError) as e: + except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) + + try: + transcripts = await select_transcripts(metadata, records, alignment_results) + # NOTE: transcript selection errors are handled in select_transcripts, + # and they do not cause the entire mapping process to exit; instead, an error will be reported + # on the target level and on the variant level for variants relative to that target + # HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate + # underlying issues with data providers. 
except HTTPError as e: msg = f"HTTP error occurred during transcript selection: {e}" raise HTTPException(status_code=500, detail=msg) from e @@ -91,38 +94,61 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp msg = f"Data lookup error occurred during transcript selection: {e}" raise HTTPException(status_code=500, detail=msg) from e + vrs_results = {} try: - vrs_results = vrs_map(metadata, alignment_result, records, transcript, True) + for target_gene in metadata.target_genes: + vrs_results[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=alignment_results[target_gene], + records=records[target_gene], + transcript=transcripts[target_gene], + silent=True, + ) except VrsMapError as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if vrs_results is None: + # TODO this should instead check if all values in dict are none. or might not need this at all. + if vrs_results is None or len(vrs_results) == 0: return ScoresetMapping( metadata=metadata, error_message="No variant mappings available for this score set", ) + annotated_vrs_results = {} try: - vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2) + for target_gene in vrs_results: + annotated_vrs_results[target_gene] = annotate( + vrs_results[target_gene], + transcripts[target_gene], + metadata.target_genes[target_gene], + metadata.urn, + VrsVersion.V_2, + ) except Exception as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if vrs_results is None: + # TODO this should instead check if all values in dict are none. or might not need this at all. 
+ if vrs_results is None or len(vrs_results) == 0: return ScoresetMapping( metadata=metadata, error_message="No annotated variant mappings available for this score set", ) + # TODO this will need to be changed to support multi-target score sets. + # This version works for accession based score sets. + # Not implementing multi-target changes because this will require corresponding changes on mavedb-api and we want to get this on staging quickly right now. + # For now, only accept single-target score sets so that we don't need to change structure of JSON output. + target_gene = list(metadata["target_genes"].keys())[0] # noqa: RUF015 try: raw_metadata = get_raw_scoreset_metadata(urn, store_path) preferred_layers = { - _set_scoreset_layer(urn, vrs_results), + _set_scoreset_layer(urn, vrs_results[target_gene]), } reference_sequences = { @@ -136,10 +162,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp for layer in preferred_layers: reference_sequences[layer][ "computed_reference_sequence" - ] = _get_computed_reference_sequence(metadata, layer, transcript) + ] = _get_computed_reference_sequence( + metadata.target_genes[target_gene], layer, transcripts[target_gene] + ) reference_sequences[layer][ "mapped_reference_sequence" - ] = _get_mapped_reference_sequence(layer, transcript, alignment_result) + ] = _get_mapped_reference_sequence( + layer, transcripts[target_gene], alignment_results[target_gene] + ) mapped_scores: list[ScoreAnnotation] = [] for m in vrs_results: From f8586ce007f7966ad5e5050326c7b212e2559190 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 9 Apr 2025 11:17:15 -0700 Subject: [PATCH 09/23] Temporarily remove support for multi-target score sets --- src/dcd_mapping/mavedb_data.py | 3 +++ src/dcd_mapping/schemas.py | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 532f400..38f99c2 100644 --- 
a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -187,6 +187,9 @@ def get_scoreset_metadata( metadata = get_raw_scoreset_metadata(scoreset_urn, dcd_mapping_dir) target_genes = {} multi_target = len(metadata["targetGenes"]) > 1 + if multi_target: + msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported." + raise ScoresetNotSupportedError(msg) for gene in metadata["targetGenes"]: if not _metadata_response_is_human(metadata): diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 44dcc3a..e745707 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -205,12 +205,17 @@ class ScoresetMapping(BaseModel): mapped_date_utc: str = Field( default=datetime.datetime.now(tz=datetime.UTC).isoformat() ) - reference_sequences: dict[ - str, - dict[ - AnnotationLayer, - dict[str, ComputedReferenceSequence | MappedReferenceSequence | None], - ], - ] | None = None + # TODO re-implement metadata change later to support multi-target score sets. 
will require corresponding changes in mavedb-api + # reference_sequences: dict[ + # str, + # dict[ + # AnnotationLayer, + # dict[str, ComputedReferenceSequence | MappedReferenceSequence | None], + # ], + # ] | None = None + computed_protein_reference_sequence: ComputedReferenceSequence | None = None + mapped_protein_reference_sequence: MappedReferenceSequence | None = None + computed_genomic_reference_sequence: ComputedReferenceSequence | None = None + mapped_genomic_reference_sequence: MappedReferenceSequence | None = None mapped_scores: list[ScoreAnnotation] | None = None error_message: str | None = None From d38f36409e0348dd56a0b1bce21c33e59194cc8c Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 11 Apr 2025 08:40:53 -0700 Subject: [PATCH 10/23] Corrections for accession-based mapping without multi-target mapping --- src/api/routers/map.py | 10 +++++----- src/dcd_mapping/schemas.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index c61fb77..490af19 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -41,7 +41,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp """ try: metadata = get_scoreset_metadata(urn, store_path) - records = get_scoreset_records(urn, True, store_path) + records = get_scoreset_records(metadata, True, store_path) except ScoresetNotSupportedError as e: return ScoresetMapping( metadata=None, @@ -134,7 +134,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp ).model_dump(exclude_none=True) ) # TODO this should instead check if all values in dict are none. or might not need this at all. 
- if vrs_results is None or len(vrs_results) == 0: + if annotated_vrs_results is None or len(annotated_vrs_results) == 0: return ScoresetMapping( metadata=metadata, error_message="No annotated variant mappings available for this score set", @@ -144,11 +144,11 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp # This version works for accession based score sets. # Not implementing multi-target changes because this will require corresponding changes on mavedb-api and we want to get this on staging quickly right now. # For now, only accept single-target score sets so that we don't need to change structure of JSON output. - target_gene = list(metadata["target_genes"].keys())[0] # noqa: RUF015 + target_gene = list(metadata.target_genes.keys())[0] # noqa: RUF015 try: raw_metadata = get_raw_scoreset_metadata(urn, store_path) preferred_layers = { - _set_scoreset_layer(urn, vrs_results[target_gene]), + _set_scoreset_layer(urn, annotated_vrs_results[target_gene]), } reference_sequences = { @@ -172,7 +172,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp ) mapped_scores: list[ScoreAnnotation] = [] - for m in vrs_results: + for m in annotated_vrs_results[target_gene]: if m.annotation_layer in preferred_layers: # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index e745707..8274f39 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -213,9 +213,9 @@ class ScoresetMapping(BaseModel): # dict[str, ComputedReferenceSequence | MappedReferenceSequence | None], # ], # ] | None = None - computed_protein_reference_sequence: ComputedReferenceSequence | None = None + computed_protein_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None mapped_protein_reference_sequence: MappedReferenceSequence | None = None - computed_genomic_reference_sequence: 
ComputedReferenceSequence | None = None + computed_genomic_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None mapped_genomic_reference_sequence: MappedReferenceSequence | None = None mapped_scores: list[ScoreAnnotation] | None = None error_message: str | None = None From 586953619895e19a22c8dec571b545ed002bdb62 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 14 May 2025 09:13:03 -0700 Subject: [PATCH 11/23] Bug fixes for temporary non-multi-target staging deployment --- src/api/routers/map.py | 8 +++- src/dcd_mapping/align.py | 7 ++++ src/dcd_mapping/annotate.py | 77 +++++++++++++++++++++++++++---------- src/dcd_mapping/vrs_map.py | 46 +++++++++++++++++----- 4 files changed, 107 insertions(+), 31 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 490af19..9471ca9 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -158,7 +158,8 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp } for layer in AnnotationLayer } - + # sometimes Nonetype layers show up in preferred layers dict; remove these + preferred_layers.discard(None) for layer in preferred_layers: reference_sequences[layer][ "computed_reference_sequence" @@ -168,7 +169,10 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp reference_sequences[layer][ "mapped_reference_sequence" ] = _get_mapped_reference_sequence( - layer, transcripts[target_gene], alignment_results[target_gene] + metadata.target_genes[target_gene], + layer, + transcripts[target_gene], + alignment_results[target_gene], ) mapped_scores: list[ScoreAnnotation] = [] diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 16b2cf5..f99c4dc 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -336,6 +336,13 @@ def align( # blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets if 
target_label == "query" and len(scoreset_metadata.target_genes) == 1: target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 + # NOTE this is a temporary fix that will not work for multi-target score sets! + # blat automatically reformats query names. + if ( + target_label not in scoreset_metadata.target_genes + and len(scoreset_metadata.target_genes) == 1 + ): + target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 target_gene = scoreset_metadata.target_genes[target_label] alignment_results[target_label] = _get_best_match(blat_result, target_gene) return alignment_results diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 3c47a8b..2e41747 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -469,12 +469,14 @@ def _get_computed_reference_sequence( def _get_mapped_reference_sequence( + metadata: TargetGene, layer: AnnotationLayer, tx_output: TxSelectResult | TxSelectError | None = None, align_result: AlignmentResult | None = None, ) -> MappedReferenceSequence | None: """Report the mapped reference sequence for a score set + :param metadata: Target gene metadata from MaveDB API :param layer: AnnotationLayer :param tx_output: Transcript data for a score set :return A MappedReferenceSequence object @@ -500,13 +502,21 @@ def _get_mapped_reference_sequence( sequence_id=vrs_id, sequence_accessions=[tx_output.np], ) - seq_id = get_chromosome_identifier(align_result.chrom) + # accession-based score sets with genomic accession do not have alignment results + if ( + align_result is None + and metadata.target_accession_id + and metadata.target_accession_id.startswith("NC") + ): + seq_id = metadata.target_accession_id + else: + seq_id = get_chromosome_identifier(align_result.chrom) vrs_id = get_vrs_id_from_identifier(seq_id) if vrs_id is None: # TODO catch this error, don't fail whole job for one target - # msg = "ID could not be acquired from Seqrepo for chromosome identifier" - # 
raise ValueError(msg) - return None + msg = "ID could not be acquired from Seqrepo for chromosome identifier" + raise ValueError(msg) + # return None return MappedReferenceSequence( sequence_type=TargetSequenceType.DNA, sequence_id=vrs_id, @@ -593,9 +603,11 @@ def save_mapped_output_json( "computed_reference_sequence": None, "mapped_reference_sequence": None, } - for layer in preferred_layers + # TODO change this back after reimplementing multi-target mapping + for layer in AnnotationLayer } - + # sometimes Nonetype layers show up in preferred layers dict; remove these + preferred_layers.discard(None) for layer in preferred_layers: reference_sequences[target_gene][layer][ "computed_reference_sequence" @@ -605,7 +617,10 @@ def save_mapped_output_json( reference_sequences[target_gene][layer][ "mapped_reference_sequence" ] = _get_mapped_reference_sequence( - layer, tx_output[target_gene], align_results[target_gene] + metadata.target_genes[target_gene], + layer, + tx_output[target_gene], + align_results[target_gene], ) for m in mappings[target_gene]: @@ -615,21 +630,43 @@ def save_mapped_output_json( # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) - # drop Nonetype reference sequences - for target_gene in reference_sequences: - for layer in list(reference_sequences[target_gene].keys()): - if ( - reference_sequences[target_gene][layer]["mapped_reference_sequence"] - is None - and reference_sequences[target_gene][layer][ - "computed_reference_sequence" - ] - is None - ) or layer is None: - del reference_sequences[target_gene][layer] - + # TODO drop this "continue" after reimplementing multi-target mapping + continue + + # TODO add this back after reimplementing multi-target mapping + # drop Nonetype reference sequences + # for target_gene in reference_sequences: + # for layer in list(reference_sequences[target_gene].keys()): + # if ( + # reference_sequences[target_gene][layer]["mapped_reference_sequence"] + # is None + 
# and reference_sequences[target_gene][layer][ + # "computed_reference_sequence" + # ] + # is None + # ) or layer is None: + # del reference_sequences[target_gene][layer] + + # TODO drop this "continue" after reimplementing multi-target mapping + continue + # TODO drop this after reimplementing multi-target mapping + reference_sequences = reference_sequences.popitem()[1] # get only value in dict + # TODO change this back after reimplementing multi-target mapping + # this only works for --prefer_genomic right now, which is fine because we're going to change it back after reimplementing multi-target mapping output = ScoresetMapping( metadata=metadata.model_dump(), + computed_protein_reference_sequence=reference_sequences[ + AnnotationLayer.PROTEIN + ]["computed_reference_sequence"], + mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][ + "mapped_reference_sequence" + ], + computed_genomic_reference_sequence=reference_sequences[ + AnnotationLayer.GENOMIC + ]["computed_reference_sequence"], + mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][ + "mapped_reference_sequence" + ], reference_sequences=reference_sequences, mapped_scores=mapped_scores, ) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index fa7d4e6..b03742c 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -665,6 +665,19 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool: ) +def _hgvs_pro_is_valid(hgvs_pro: str) -> bool: + """Check for invalid or unavailable protein MAVE-HGVS variation + + :param hgvs_nt: MAVE_HGVS protein expression + :return: True if expression appears populated and valid + """ + return ( + (hgvs_pro not in {"_wt", "_sy", "NA"}) + and (len(hgvs_pro) != 3) + and ("fs" not in hgvs_pro) + ) + + def _map_protein_coding( metadata: TargetGene, records: list[ScoreRow], @@ -691,7 +704,14 @@ def _map_protein_coding( variations: list[MappedScore] = [] for row in records: - if isinstance(transcript, 
TxSelectError): + hgvs_nt_mappings = None + hgvs_pro_mappings = None + if _hgvs_nt_is_valid(row.hgvs_nt): + hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result) + + if ( + isinstance(transcript, TxSelectError) and not hgvs_nt_mappings + ): # only create error message if there is not an hgvs nt mapping # TODO create pre mapped allele hgvs_pro_mappings = MappedScore( accession_id=row.accession, @@ -699,16 +719,24 @@ def _map_protein_coding( error_message=str(transcript).strip("'"), ) else: - hgvs_pro_mappings = _map_protein_coding_pro(row, psequence_id, transcript) + if _hgvs_pro_is_valid(row.hgvs_pro): + hgvs_pro_mappings = _map_protein_coding_pro( + row, psequence_id, transcript + ) + elif ( + not hgvs_nt_mappings + ): # only create error message if there is not an hgvs nt mapping + hgvs_pro_mappings = MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Invalid protein variant syntax", + ) + + # append both pro and nt mappings if both available if hgvs_pro_mappings: variations.append(hgvs_pro_mappings) - - if _hgvs_nt_is_valid(row.hgvs_nt): - hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result) - - if hgvs_nt_mappings: - variations.append(hgvs_nt_mappings) - + if hgvs_nt_mappings: + variations.append(hgvs_nt_mappings) return variations From 4cdc7a3774be7aa5113857a7754cc08b9d0fc7b3 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 15 May 2025 14:25:12 -0700 Subject: [PATCH 12/23] Update UTA DB version --- settings/.env.dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings/.env.dev b/settings/.env.dev index 4e761c7..956bfae 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -17,7 +17,7 @@ POSTGRES_DB=gene_normalizer # Environment variables for UTA connection via CoolSeqTool #################################################################################################### -UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20180821 
+UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20241220 #################################################################################################### # Environment variables for MaveDB connection From ccec5a5245754d9833e68c273953b33bf356dce9 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 15 May 2025 14:26:34 -0700 Subject: [PATCH 13/23] Re-implement multi-target mapping --- src/api/routers/map.py | 100 +++++++++++++++++---------------- src/dcd_mapping/annotate.py | 49 +++++----------- src/dcd_mapping/mavedb_data.py | 3 - src/dcd_mapping/schemas.py | 23 ++++---- 4 files changed, 75 insertions(+), 100 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 9471ca9..f87c558 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -1,7 +1,6 @@ """"Provide mapping router""" from pathlib import Path -from cool_seq_tool.schemas import AnnotationLayer from fastapi import APIRouter, HTTPException from fastapi.responses import JSONResponse from requests import HTTPError @@ -140,46 +139,60 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp error_message="No annotated variant mappings available for this score set", ) - # TODO this will need to be changed to support multi-target score sets. - # This version works for accession based score sets. - # Not implementing multi-target changes because this will require corresponding changes on mavedb-api and we want to get this on staging quickly right now. - # For now, only accept single-target score sets so that we don't need to change structure of JSON output. 
- target_gene = list(metadata.target_genes.keys())[0] # noqa: RUF015 try: raw_metadata = get_raw_scoreset_metadata(urn, store_path) - preferred_layers = { - _set_scoreset_layer(urn, annotated_vrs_results[target_gene]), - } - - reference_sequences = { - layer: { - "computed_reference_sequence": None, - "mapped_reference_sequence": None, + reference_sequences: dict[str, dict] = {} + mapped_scores: list[ScoreAnnotation] = [] + for target_gene in annotated_vrs_results: + preferred_layers = { + _set_scoreset_layer(urn, annotated_vrs_results[target_gene]), } - for layer in AnnotationLayer - } - # sometimes Nonetype layers show up in preferred layers dict; remove these - preferred_layers.discard(None) - for layer in preferred_layers: - reference_sequences[layer][ - "computed_reference_sequence" - ] = _get_computed_reference_sequence( - metadata.target_genes[target_gene], layer, transcripts[target_gene] - ) - reference_sequences[layer][ - "mapped_reference_sequence" - ] = _get_mapped_reference_sequence( - metadata.target_genes[target_gene], - layer, - transcripts[target_gene], - alignment_results[target_gene], - ) + reference_sequences[target_gene] = { + layer: { + "computed_reference_sequence": None, + "mapped_reference_sequence": None, + } + for layer in preferred_layers + } + # sometimes Nonetype layers show up in preferred layers dict; remove these + preferred_layers.discard(None) + for layer in preferred_layers: + reference_sequences[target_gene][layer][ + "computed_reference_sequence" + ] = _get_computed_reference_sequence( + metadata.target_genes[target_gene], layer, transcripts[target_gene] + ) + reference_sequences[target_gene][layer][ + "mapped_reference_sequence" + ] = _get_mapped_reference_sequence( + metadata.target_genes[target_gene], + layer, + transcripts[target_gene], + alignment_results[target_gene], + ) + + for m in annotated_vrs_results[target_gene]: + if m.pre_mapped is None: + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + elif 
m.annotation_layer in preferred_layers: + # drop annotation layer from mapping object + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + + # drop Nonetype reference sequences + for target_gene in reference_sequences: + for layer in list(reference_sequences[target_gene].keys()): + if ( + reference_sequences[target_gene][layer][ + "mapped_reference_sequence" + ] + is None + and reference_sequences[target_gene][layer][ + "computed_reference_sequence" + ] + is None + ) or layer is None: + del reference_sequences[target_gene][layer] - mapped_scores: list[ScoreAnnotation] = [] - for m in annotated_vrs_results[target_gene]: - if m.annotation_layer in preferred_layers: - # drop annotation layer from mapping object - mapped_scores.append(ScoreAnnotation(**m.model_dump())) except Exception as e: return JSONResponse( content=ScoresetMapping( @@ -190,18 +203,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp return JSONResponse( content=ScoresetMapping( metadata=raw_metadata, - computed_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["computed_reference_sequence"], - mapped_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["mapped_reference_sequence"], - computed_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["computed_reference_sequence"], - mapped_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["mapped_reference_sequence"], + reference_sequences=reference_sequences, mapped_scores=mapped_scores, ).model_dump(exclude_none=True) ) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 2e41747..a1408df 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -603,8 +603,7 @@ def save_mapped_output_json( "computed_reference_sequence": None, "mapped_reference_sequence": None, } - # TODO change this back after reimplementing multi-target mapping - for layer in AnnotationLayer + 
for layer in preferred_layers } # sometimes Nonetype layers show up in preferred layers dict; remove these preferred_layers.discard(None) @@ -630,43 +629,21 @@ def save_mapped_output_json( # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) - # TODO drop this "continue" after reimplementing multi-target mapping - continue - - # TODO add this back after reimplementing multi-target mapping # drop Nonetype reference sequences - # for target_gene in reference_sequences: - # for layer in list(reference_sequences[target_gene].keys()): - # if ( - # reference_sequences[target_gene][layer]["mapped_reference_sequence"] - # is None - # and reference_sequences[target_gene][layer][ - # "computed_reference_sequence" - # ] - # is None - # ) or layer is None: - # del reference_sequences[target_gene][layer] - - # TODO drop this "continue" after reimplementing multi-target mapping - continue - # TODO drop this after reimplementing multi-target mapping - reference_sequences = reference_sequences.popitem()[1] # get only value in dict - # TODO change this back after reimplementing multi-target mapping - # this only works for --prefer_genomic right now, which is fine because we're going to change it back after reimplementing multi-target mapping + for target_gene in reference_sequences: + for layer in list(reference_sequences[target_gene].keys()): + if ( + reference_sequences[target_gene][layer]["mapped_reference_sequence"] + is None + and reference_sequences[target_gene][layer][ + "computed_reference_sequence" + ] + is None + ) or layer is None: + del reference_sequences[target_gene][layer] + output = ScoresetMapping( metadata=metadata.model_dump(), - computed_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["computed_reference_sequence"], - mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][ - "mapped_reference_sequence" - ], - 
computed_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["computed_reference_sequence"], - mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][ - "mapped_reference_sequence" - ], reference_sequences=reference_sequences, mapped_scores=mapped_scores, ) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 38f99c2..532f400 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -187,9 +187,6 @@ def get_scoreset_metadata( metadata = get_raw_scoreset_metadata(scoreset_urn, dcd_mapping_dir) target_genes = {} multi_target = len(metadata["targetGenes"]) > 1 - if multi_target: - msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported." - raise ScoresetNotSupportedError(msg) for gene in metadata["targetGenes"]: if not _metadata_response_is_human(metadata): diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 8274f39..9395cd1 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -205,17 +205,16 @@ class ScoresetMapping(BaseModel): mapped_date_utc: str = Field( default=datetime.datetime.now(tz=datetime.UTC).isoformat() ) - # TODO re-implement metadata change later to support multi-target score sets. 
will require corresponding changes in mavedb-api - # reference_sequences: dict[ - # str, - # dict[ - # AnnotationLayer, - # dict[str, ComputedReferenceSequence | MappedReferenceSequence | None], - # ], - # ] | None = None - computed_protein_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None - mapped_protein_reference_sequence: MappedReferenceSequence | None = None - computed_genomic_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None - mapped_genomic_reference_sequence: MappedReferenceSequence | None = None + reference_sequences: dict[ + str, + dict[ + AnnotationLayer, + dict[str, ComputedReferenceSequence | MappedReferenceSequence | None], + ], + ] | None = None + # computed_protein_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None + # mapped_protein_reference_sequence: MappedReferenceSequence | None = None + # computed_genomic_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None + # mapped_genomic_reference_sequence: MappedReferenceSequence | None = None mapped_scores: list[ScoreAnnotation] | None = None error_message: str | None = None From a44bd2e1eb3559a98a3a0875785c66907ac7a35c Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 22 May 2025 09:59:10 -0700 Subject: [PATCH 14/23] Add transcript information to mapped metadata for genomic score sets --- src/api/routers/map.py | 33 +++++++++++++++++++++++++++++---- src/dcd_mapping/annotate.py | 27 ++++++++++++++++++++++++--- src/dcd_mapping/schemas.py | 8 +++----- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index f87c558..4152cde 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -1,6 +1,7 @@ """"Provide mapping router""" from pathlib import Path +from cool_seq_tool.schemas import AnnotationLayer from fastapi import APIRouter, HTTPException from fastapi.responses import JSONResponse from 
requests import HTTPError @@ -21,7 +22,13 @@ with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError -from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion +from dcd_mapping.schemas import ( + ScoreAnnotation, + ScoresetMapping, + TargetType, + TxSelectResult, + VrsVersion, +) from dcd_mapping.transcripts import select_transcripts from dcd_mapping.vrs_map import VrsMapError, vrs_map @@ -147,7 +154,8 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp preferred_layers = { _set_scoreset_layer(urn, annotated_vrs_results[target_gene]), } - reference_sequences[target_gene] = { + target_gene_name = metadata.target_genes[target_gene].target_gene_name + reference_sequences[target_gene_name] = { layer: { "computed_reference_sequence": None, "mapped_reference_sequence": None, @@ -157,12 +165,12 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp # sometimes Nonetype layers show up in preferred layers dict; remove these preferred_layers.discard(None) for layer in preferred_layers: - reference_sequences[target_gene][layer][ + reference_sequences[target_gene_name][layer][ "computed_reference_sequence" ] = _get_computed_reference_sequence( metadata.target_genes[target_gene], layer, transcripts[target_gene] ) - reference_sequences[target_gene][layer][ + reference_sequences[target_gene_name][layer][ "mapped_reference_sequence" ] = _get_mapped_reference_sequence( metadata.target_genes[target_gene], @@ -193,6 +201,23 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp ) or layer is None: del reference_sequences[target_gene][layer] + # if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict + if ( + AnnotationLayer.GENOMIC in reference_sequences[target_gene_name] + and metadata.target_genes[target_gene].target_gene_category + == 
TargetType.PROTEIN_CODING + and metadata.target_genes[target_gene].target_accession_id is None + and transcripts[target_gene] is not None + and isinstance(transcripts[target_gene], TxSelectResult) + and transcripts[target_gene].nm is not None + ): + reference_sequences[target_gene_name][AnnotationLayer.CDNA] = { + "computed_reference_sequence": None, + "mapped_reference_sequence": { + "sequence_accessions": [transcripts[target_gene].nm] + }, + } + except Exception as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index a1408df..0bdc7ea 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -41,6 +41,7 @@ ScoresetMetadata, TargetGene, TargetSequenceType, + TargetType, TxSelectResult, VrsVersion, ) @@ -598,7 +599,10 @@ def save_mapped_output_json( mapping.annotation_layer for mapping in mappings[target_gene] } - reference_sequences[target_gene] = { + # use target gene name in reference sequence dictionary, rather than the label, which differs between score sets + target_gene_name = metadata.target_genes[target_gene].target_gene_name + + reference_sequences[target_gene_name] = { layer: { "computed_reference_sequence": None, "mapped_reference_sequence": None, @@ -608,12 +612,12 @@ def save_mapped_output_json( # sometimes Nonetype layers show up in preferred layers dict; remove these preferred_layers.discard(None) for layer in preferred_layers: - reference_sequences[target_gene][layer][ + reference_sequences[target_gene_name][layer][ "computed_reference_sequence" ] = _get_computed_reference_sequence( metadata.target_genes[target_gene], layer, tx_output[target_gene] ) - reference_sequences[target_gene][layer][ + reference_sequences[target_gene_name][layer][ "mapped_reference_sequence" ] = _get_mapped_reference_sequence( metadata.target_genes[target_gene], @@ -622,6 +626,23 @@ def save_mapped_output_json( align_results[target_gene], ) + # if genomic layer, not accession-based, 
and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict + if ( + AnnotationLayer.GENOMIC in reference_sequences[target_gene_name] + and metadata.target_genes[target_gene].target_gene_category + == TargetType.PROTEIN_CODING + and metadata.target_genes[target_gene].target_accession_id is None + and tx_output[target_gene] is not None + and isinstance(tx_output[target_gene], TxSelectResult) + and tx_output[target_gene].nm is not None + ): + reference_sequences[target_gene_name][AnnotationLayer.CDNA] = { + "computed_reference_sequence": None, + "mapped_reference_sequence": { + "sequence_accessions": [tx_output[target_gene].nm] + }, + } + for m in mappings[target_gene]: if m.pre_mapped is None: mapped_scores.append(ScoreAnnotation(**m.model_dump())) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 9395cd1..072e3b8 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -209,12 +209,10 @@ class ScoresetMapping(BaseModel): str, dict[ AnnotationLayer, - dict[str, ComputedReferenceSequence | MappedReferenceSequence | None], + dict[ + str, ComputedReferenceSequence | MappedReferenceSequence | dict | None + ], ], ] | None = None - # computed_protein_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None - # mapped_protein_reference_sequence: MappedReferenceSequence | None = None - # computed_genomic_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None - # mapped_genomic_reference_sequence: MappedReferenceSequence | None = None mapped_scores: list[ScoreAnnotation] | None = None error_message: str | None = None From 5a825e3df10095d1c0470bd05b1336b042b9d8d2 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 22 May 2025 11:15:11 -0700 Subject: [PATCH 15/23] Fix CLI prefer_genomic flag --- src/dcd_mapping/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dcd_mapping/cli.py 
b/src/dcd_mapping/cli.py index d5954e3..0db86b3 100644 --- a/src/dcd_mapping/cli.py +++ b/src/dcd_mapping/cli.py @@ -44,7 +44,7 @@ @click.option( "--prefer_genomic", is_flag=True, - default=True, + default=False, help="If mapped variants are available relative to a genomic sequence, only output the genomic mappings", ) def cli( From 1b1ab4826e756f4867a6b3b86364996c695a51d9 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 3 Jun 2025 16:37:19 -0700 Subject: [PATCH 16/23] Change tests and fixtures to reflect multi-target mapper changes --- tests/conftest.py | 25 +- tests/fixtures/align_result.json | 1104 ++++++++++++------------- tests/fixtures/scoreset_metadata.json | 208 +++-- tests/fixtures/transcript_result.json | 140 ++-- tests/test_mavedb_data.py | 39 +- tests/test_vrs_map.py | 90 +- 6 files changed, 854 insertions(+), 752 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a9ce550..c588794 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,7 +19,12 @@ import pytest -from dcd_mapping.schemas import AlignmentResult, ScoresetMetadata, TxSelectResult +from dcd_mapping.schemas import ( + AlignmentResult, + ScoresetMetadata, + TargetGene, + TxSelectResult, +) FIXTURE_DATA_DIR = Path(__file__).parents[0].resolve() / "fixtures" @@ -43,7 +48,11 @@ def scoreset_metadata_fixture(fixture_data_dir: Path): data = json.load(f) results = {} for d in data["scoreset_metadata"]: - formatted_data = ScoresetMetadata(**d) + target_genes = {} + for target_gene in d["target_genes"]: + target_gene_metadata = d["target_genes"][target_gene] + target_genes[target_gene] = TargetGene(**target_gene_metadata) + formatted_data = ScoresetMetadata(urn=d["urn"], target_genes=target_genes) results[formatted_data.urn] = formatted_data return results @@ -56,8 +65,10 @@ def align_result_fixture(fixture_data_dir: Path): data = json.load(f) results = {} for urn, result in data.items(): - formatted_result = AlignmentResult(**result) - results[urn] = 
formatted_result + formatted_results = {} + for target_gene in result: + formatted_results[target_gene] = AlignmentResult(**result[target_gene]) + results[urn] = formatted_results return results @@ -69,8 +80,10 @@ def transcript_results_fixture(fixture_data_dir: Path): data = json.load(f) results = {} for urn, result in data.items(): - formatted_result = TxSelectResult(**result) - results[urn] = formatted_result + formatted_results = {} + for target_gene in result: + formatted_results[target_gene] = TxSelectResult(**result[target_gene]) + results[urn] = formatted_results return results diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json index 306d755..6b2d086 100644 --- a/tests/fixtures/align_result.json +++ b/tests/fixtures/align_result.json @@ -1,626 +1,614 @@ { "urn:mavedb:00000002-a-2": { - "chrom": "chr11", - "strand": 1, - "coverage": 100.0, - "ident_pct": 77.45098039215686, - "query_range": { - "start": 0, - "end": 102 - }, - "query_subranges": [ - { + "hYAP65 WW domain": { + "chrom": "chr11", + "strand": 1, + "coverage": 100.0, + "ident_pct": 77.45098039215686, + "query_range": { "start": 0, - "end": 60 - }, - { - "start": 60, "end": 102 - } - ], - "hit_range": { - "start": 102114329, - "end": 102162492 - }, - "hit_subranges": [ - { - "start": 102114329, - "end": 102114389 }, - { - "start": 102162450, - "end": 102162492 - } - ] - }, - "urn:mavedb:00000007-a-2": { - "chrom": "chr11", - "strand": 1, - "coverage": 100.0, - "ident_pct": 77.45098039215686, - "query_range": { - "start": 0, - "end": 102 - }, - "query_subranges": [ - { - "start": 0, - "end": 60 - }, - { - "start": 60, - "end": 102 - } - ], - "hit_range": { - "start": 102114329, - "end": 102162492 - }, - "hit_subranges": [ - { + "query_subranges": [ + { + "start": 0, + "end": 60 + }, + { + "start": 60, + "end": 102 + } + ], + "hit_range": { "start": 102114329, - "end": 102114389 - }, - { - "start": 102162450, "end": 102162492 - } - ] + }, + "hit_subranges": [ + { + 
"start": 102114329, + "end": 102114389 + }, + { + "start": 102162450, + "end": 102162492 + } + ] + } }, "urn:mavedb:00000099-a-1": { - "chrom": "chr3", - "strand": 1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 1047 - }, - "query_subranges": [ - { + "RHO": { + "chrom": "chr3", + "strand": 1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, - "end": 361 - }, - { - "start": 361, - "end": 530 - }, - { - "start": 530, - "end": 696 - }, - { - "start": 696, - "end": 936 - }, - { - "start": 936, "end": 1047 - } - ], - "hit_range": { - "start": 129528733, - "end": 129533718 - }, - "hit_subranges": [ - { - "start": 129528733, - "end": 129529094 - }, - { - "start": 129530875, - "end": 129531044 - }, - { - "start": 129532250, - "end": 129532416 - }, - { - "start": 129532532, - "end": 129532772 }, - { - "start": 129533607, + "query_subranges": [ + { + "start": 0, + "end": 361 + }, + { + "start": 361, + "end": 530 + }, + { + "start": 530, + "end": 696 + }, + { + "start": 696, + "end": 936 + }, + { + "start": 936, + "end": 1047 + } + ], + "hit_range": { + "start": 129528733, "end": 129533718 - } - ] + }, + "hit_subranges": [ + { + "start": 129528733, + "end": 129529094 + }, + { + "start": 129530875, + "end": 129531044 + }, + { + "start": 129532250, + "end": 129532416 + }, + { + "start": 129532532, + "end": 129532772 + }, + { + "start": 129533607, + "end": 129533718 + } + ] + } }, "urn:mavedb:00000103-c-1": { - "chrom": "chr22", - "strand": -1, - "coverage": 100.0, - "ident_pct": 99.9071494893222, - "query_range": { - "start": 0, - "end": 360 - }, - "query_subranges": [ - { + "MAPK1": { + "chrom": "chr22", + "strand": -1, + "coverage": 100.0, + "ident_pct": 99.9071494893222, + "query_range": { "start": 0, - "end": 39 - }, - { - "start": 40, - "end": 101 - }, - { - "start": 101, - "end": 164 - }, - { - "start": 164, - "end": 203 - }, - { - "start": 203, - "end": 241 - }, - { - "start": 241, - "end": 285 - }, - { - 
"start": 285, - "end": 322 - }, - { - "start": 322, "end": 360 - } - ], - "hit_range": { - "start": 21769206, - "end": 21867440 - }, - "hit_subranges": [ - { - "start": 21867323, - "end": 21867440 - }, - { - "start": 21807662, - "end": 21807845 - }, - { - "start": 21805849, - "end": 21806038 - }, - { - "start": 21799011, - "end": 21799128 - }, - { - "start": 21788694, - "end": 21788808 - }, - { - "start": 21788257, - "end": 21788389 - }, - { - "start": 21772872, - "end": 21772983 }, - { + "query_subranges": [ + { + "start": 0, + "end": 39 + }, + { + "start": 40, + "end": 101 + }, + { + "start": 101, + "end": 164 + }, + { + "start": 164, + "end": 203 + }, + { + "start": 203, + "end": 241 + }, + { + "start": 241, + "end": 285 + }, + { + "start": 285, + "end": 322 + }, + { + "start": 322, + "end": 360 + } + ], + "hit_range": { "start": 21769206, - "end": 21769320 - } - ] + "end": 21867440 + }, + "hit_subranges": [ + { + "start": 21867323, + "end": 21867440 + }, + { + "start": 21807662, + "end": 21807845 + }, + { + "start": 21805849, + "end": 21806038 + }, + { + "start": 21799011, + "end": 21799128 + }, + { + "start": 21788694, + "end": 21788808 + }, + { + "start": 21788257, + "end": 21788389 + }, + { + "start": 21772872, + "end": 21772983 + }, + { + "start": 21769206, + "end": 21769320 + } + ] + } }, "urn:mavedb:00000041-a-1": { - "chrom": "chr20", - "strand": 1, - "coverage": 100.0, - "ident_pct": 99.86666666666666, - "query_range": { - "start": 0, - "end": 750 - }, - "query_subranges": [ - { + "Src catalytic domain": { + "chrom": "chr20", + "strand": 1, + "coverage": 100.0, + "ident_pct": 99.86666666666666, + "query_range": { "start": 0, - "end": 52 - }, - { - "start": 52, - "end": 232 - }, - { - "start": 232, - "end": 309 - }, - { - "start": 309, - "end": 463 - }, - { - "start": 463, - "end": 595 - }, - { - "start": 595, "end": 750 - } - ], - "hit_range": { - "start": 37397802, - "end": 37403325 - }, - "hit_subranges": [ - { - "start": 37397802, - "end": 37397854 - 
}, - { - "start": 37400114, - "end": 37400294 - }, - { - "start": 37401601, - "end": 37401678 }, - { - "start": 37402434, - "end": 37402588 - }, - { - "start": 37402748, - "end": 37402880 - }, - { - "start": 37403170, + "query_subranges": [ + { + "start": 0, + "end": 52 + }, + { + "start": 52, + "end": 232 + }, + { + "start": 232, + "end": 309 + }, + { + "start": 309, + "end": 463 + }, + { + "start": 463, + "end": 595 + }, + { + "start": 595, + "end": 750 + } + ], + "hit_range": { + "start": 37397802, "end": 37403325 - } - ] + }, + "hit_subranges": [ + { + "start": 37397802, + "end": 37397854 + }, + { + "start": 37400114, + "end": 37400294 + }, + { + "start": 37401601, + "end": 37401678 + }, + { + "start": 37402434, + "end": 37402588 + }, + { + "start": 37402748, + "end": 37402880 + }, + { + "start": 37403170, + "end": 37403325 + } + ] + } }, "urn:mavedb:00000018-a-1": { - "chrom": "chr11", - "strand": 1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 187 - }, - "query_subranges": [ - { + "HBB promoter": { + "chrom": "chr11", + "strand": 1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, "end": 187 - } - ], - "hit_range": { - "start": 5227021, - "end": 5227208 - }, - "hit_subranges": [ - { + }, + "query_subranges": [ + { + "start": 0, + "end": 187 + } + ], + "hit_range": { "start": 5227021, "end": 5227208 - } - ] + }, + "hit_subranges": [ + { + "start": 5227021, + "end": 5227208 + } + ] + } }, "urn:mavedb:00000001-a-4": { - "chrom": "chr16", - "strand": 1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 477 - }, - "query_subranges": [ - { + "UBE2I": { + "chrom": "chr16", + "strand": 1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, - "end": 66 - }, - { - "start": 66, - "end": 150 - }, - { - "start": 150, - "end": 223 - }, - { - "start": 223, - "end": 333 - }, - { - "start": 333, - "end": 413 - }, - { - "start": 413, "end": 477 - } - ], - 
"hit_range": { - "start": 1314030, - "end": 1324793 - }, - "hit_subranges": [ - { - "start": 1314030, - "end": 1314096 - }, - { - "start": 1314292, - "end": 1314376 - }, - { - "start": 1315653, - "end": 1315726 - }, - { - "start": 1320173, - "end": 1320283 - }, - { - "start": 1320437, - "end": 1320517 }, - { - "start": 1324729, + "query_subranges": [ + { + "start": 0, + "end": 66 + }, + { + "start": 66, + "end": 150 + }, + { + "start": 150, + "end": 223 + }, + { + "start": 223, + "end": 333 + }, + { + "start": 333, + "end": 413 + }, + { + "start": 413, + "end": 477 + } + ], + "hit_range": { + "start": 1314030, "end": 1324793 - } - ] + }, + "hit_subranges": [ + { + "start": 1314030, + "end": 1314096 + }, + { + "start": 1314292, + "end": 1314376 + }, + { + "start": 1315653, + "end": 1315726 + }, + { + "start": 1320173, + "end": 1320283 + }, + { + "start": 1320437, + "end": 1320517 + }, + { + "start": 1324729, + "end": 1324793 + } + ] + } }, "urn:mavedb:00000098-a-1": { - "chrom": "chr3", - "strand": -1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 12 - }, - "query_subranges": [ - { + "SCN5A": { + "chrom": "chr3", + "strand": -1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, "end": 12 - } - ], - "hit_range": { - "start": 38551475, - "end": 38551511 - }, - "hit_subranges": [ - { + }, + "query_subranges": [ + { + "start": 0, + "end": 12 + } + ], + "hit_range": { "start": 38551475, "end": 38551511 - } - ] + }, + "hit_subranges": [ + { + "start": 38551475, + "end": 38551511 + } + ] + } }, "urn:mavedb:00000061-h-1": { - "chrom": "chr3", - "strand": -1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 117 - }, - "query_subranges": [ - { - "start": 54, - "end": 117 - }, - { + "RAF": { + "chrom": "chr3", + "strand": -1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, - "end": 54 - } - ], - "hit_range": { - "start": 12611999, - "end": 12618568 - }, 
- "hit_subranges": [ - { - "start": 1261199, - "end": 12612062 + "end": 117 }, - { - "start": 12618514, + "query_subranges": [ + { + "start": 54, + "end": 117 + }, + { + "start": 0, + "end": 54 + } + ], + "hit_range": { + "start": 12611999, "end": 12618568 - } - ] + }, + "hit_subranges": [ + { + "start": 1261199, + "end": 12612062 + }, + { + "start": 12618514, + "end": 12618568 + } + ] + } }, "urn:mavedb:00000068-a-1": { - "chrom": "chr17", - "strand": -1, - "coverage": 99.83079526226734, - "ident_pct": 99.91525423728814, - "query_range": { - "start": 0, - "end": 1180 - }, - "query_subranges": [ - { - "start": 1100, + "TP53 (P72R)": { + "chrom": "chr17", + "strand": -1, + "coverage": 99.83079526226734, + "ident_pct": 99.91525423728814, + "query_range": { + "start": 0, "end": 1180 }, - { - "start": 993, - "end": 1100 - }, - { - "start": 919, - "end": 993 - }, - { - "start": 782, - "end": 919 - }, - { - "start": 672, - "end": 782 - }, - { - "start": 559, - "end": 672 - }, - { - "start": 375, - "end": 559 - }, - { - "start": 96, - "end": 375 - }, - { - "start": 74, - "end": 96 - }, - { - "start": 0, - "end": 74 - } - ], - "hit_range": { - "start": 7676520, - "end": 7669690 - }, - "hit_subranges": [ - { - "start": 7669610, + "query_subranges": [ + { + "start": 1100, + "end": 1180 + }, + { + "start": 993, + "end": 1100 + }, + { + "start": 919, + "end": 993 + }, + { + "start": 782, + "end": 919 + }, + { + "start": 672, + "end": 782 + }, + { + "start": 559, + "end": 672 + }, + { + "start": 375, + "end": 559 + }, + { + "start": 96, + "end": 375 + }, + { + "start": 74, + "end": 96 + }, + { + "start": 0, + "end": 74 + } + ], + "hit_range": { + "start": 7676520, "end": 7669690 }, - { - "start": 7670608, - "end": 7670715 - }, - { - "start": 7673534, - "end": 7673608 - }, - { - "start": 7673700, - "end": 7673837 - }, - { - "start": 7674180, - "end": 7674290 - }, - { - "start": 7674858, - "end": 7674971 - }, - { - "start": 7675052, - "end": 7675236 - }, - { - "start": 7675993, - 
"end": 7676272 - }, - { - "start": 7676381, - "end": 7676403 - }, - { - "start": 7676520, - "end": 7676594 - } - ] + "hit_subranges": [ + { + "start": 7669610, + "end": 7669690 + }, + { + "start": 7670608, + "end": 7670715 + }, + { + "start": 7673534, + "end": 7673608 + }, + { + "start": 7673700, + "end": 7673837 + }, + { + "start": 7674180, + "end": 7674290 + }, + { + "start": 7674858, + "end": 7674971 + }, + { + "start": 7675052, + "end": 7675236 + }, + { + "start": 7675993, + "end": 7676272 + }, + { + "start": 7676381, + "end": 7676403 + }, + { + "start": 7676520, + "end": 7676594 + } + ] + } }, "urn:mavedb:00000093-a-1": { - "chrom": "chr17", - "strand": -1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 300 - }, - "query_subranges": [ - { - "start": 212, + "BRCA1 translation start through RING domain": { + "chrom": "chr17", + "strand": -1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { + "start": 0, "end": 300 }, - { - "start": 134, - "end": 212 - }, - { - "start": 80, - "end": 134 - }, - { - "start": 0, - "end": 80 - } - ], - "hit_range": { - "start": 43104868, - "end": 43124096 - }, - "hit_subranges": [ - { + "query_subranges": [ + { + "start": 212, + "end": 300 + }, + { + "start": 134, + "end": 212 + }, + { + "start": 80, + "end": 134 + }, + { + "start": 0, + "end": 80 + } + ], + "hit_range": { "start": 43104868, - "end": 43104956 - }, - { - "start": 43106455, - "end": 43106533 - }, - { - "start": 43115725, - "end": 43115779 - }, - { - "start": 43124016, "end": 43124096 - } - ] + }, + "hit_subranges": [ + { + "start": 43104868, + "end": 43104956 + }, + { + "start": 43106455, + "end": 43106533 + }, + { + "start": 43115725, + "end": 43115779 + }, + { + "start": 43124016, + "end": 43124096 + } + ] + } }, "urn:mavedb:00000001-b-2": { - "chrom": "chr2", - "strand": -1, - "coverage": 97.05882352941177, - "ident_pct": 100.0, - "query_range": { - "start": 9, - "end": 306 - }, - "query_subranges": [ - { - "start": 
237, + "SUMO1": { + "chrom": "chr2", + "strand": -1, + "coverage": 97.05882352941177, + "ident_pct": 100.0, + "query_range": { + "start": 9, "end": 306 }, - { - "start": 165, - "end": 237 - }, - { - "start": 87, - "end": 165 - }, - { - "start": 9, - "end": 87 - } - ], - "hit_range": { - "start": 202207252, - "end": 202220109 - }, - "hit_subranges": [ - { + "query_subranges": [ + { + "start": 237, + "end": 306 + }, + { + "start": 165, + "end": 237 + }, + { + "start": 87, + "end": 165 + }, + { + "start": 9, + "end": 87 + } + ], + "hit_range": { "start": 202207252, - "end": 202207321 - }, - { - "start": 202210734, - "end": 202210806 - }, - { - "start": 202214356, - "end": 202214434 - }, - { - "start": 202220031, "end": 202220109 - } - ] + }, + "hit_subranges": [ + { + "start": 202207252, + "end": 202207321 + }, + { + "start": 202210734, + "end": 202210806 + }, + { + "start": 202214356, + "end": 202214434 + }, + { + "start": 202220031, + "end": 202220109 + } + ] + } } } diff --git a/tests/fixtures/scoreset_metadata.json b/tests/fixtures/scoreset_metadata.json index f0465db..c61098c 100644 --- a/tests/fixtures/scoreset_metadata.json +++ b/tests/fixtures/scoreset_metadata.json @@ -2,130 +2,162 @@ "scoreset_metadata": [ { "urn": "urn:mavedb:00000002-a-2", - "target_gene_name": "hYAP65 WW domain", - "target_gene_category": "protein_coding", - "target_sequence": "GACGTTCCACTGCCGGCTGGTTGGGAAATGGCTAAAACTAGTTCTGGTCAGCGTTACTTCCTGAACCACATCGACCAGACCACCACGTGGCAGGACCCGCGT", - "target_sequence_type": "dna", - "target_uniprot_ref": { - "id": "uniprot:P46937", - "offset": 169 + "target_genes": { + "hYAP65 WW domain": { + "target_gene_name": "hYAP65 WW domain", + "target_gene_category": "protein_coding", + "target_sequence": "GACGTTCCACTGCCGGCTGGTTGGGAAATGGCTAAAACTAGTTCTGGTCAGCGTTACTTCCTGAACCACATCGACCAGACCACCACGTGGCAGGACCCGCGT", + "target_sequence_type": "dna", + "target_uniprot_ref": { + "id": "uniprot:P46937", + "offset": 169 + } + } } }, { "urn": "urn:mavedb:00000099-a-1", - 
"target_gene_name": "RHO", - "target_gene_category": "protein_coding", - "target_sequence": "ATGAATGGCACAGAAGGCCCTAACTTCTACGTGCCCTTCTCCAATGCGACGGGTGTGGTACGCAGCCCCTTCGAGTACCCACAGTACTACCTGGCTGAGCCATGGCAGTTCTCCATGCTGGCCGCCTACATGTTTCTGCTGATCGTGCTGGGCTTCCCCATCAACTTCCTCACGCTCTACGTCACCGTCCAGCACAAGAAGCTGCGCACGCCTCTCAACTACATCCTGCTCAACCTAGCCGTGGCTGACCTCTTCATGGTCCTAGGTGGCTTCACCAGCACCCTCTACACCTCTCTGCATGGATACTTCGTCTTCGGGCCCACAGGATGCAATTTGGAGGGCTTCTTTGCCACCCTGGGCGGTGAAATTGCCCTGTGGTCCTTGGTGGTCCTGGCCATCGAGCGGTACGTGGTGGTGTGTAAGCCCATGAGCAACTTCCGCTTCGGGGAGAACCATGCCATCATGGGCGTTGCCTTCACCTGGGTCATGGCGCTGGCCTGCGCCGCACCCCCACTCGCCGGCTGGTCCAGGTACATCCCCGAGGGCCTGCAGTGCTCGTGTGGAATCGACTACTACACGCTCAAGCCGGAGGTCAACAACGAGTCTTTTGTCATCTACATGTTCGTGGTCCACTTCACCATCCCCATGATTATCATCTTTTTCTGCTATGGGCAGCTCGTCTTCACCGTCAAGGAGGCCGCTGCCCAGCAGCAGGAGTCAGCCACCACACAGAAGGCAGAGAAGGAGGTCACCCGCATGGTCATCATCATGGTCATCGCTTTCCTGATCTGCTGGGTGCCCTACGCCAGCGTGGCATTCTACATCTTCACCCACCAGGGCTCCAACTTCGGTCCCATCTTCATGACCATCCCAGCGTTCTTTGCCAAGAGCGCCGCCATCTACAACCCTGTCATCTATATCATGATGAACAAGCAGTTCCGGAACTGCATGCTCACCACCATCTGCTGCGGCAAGAACCCACTGGGTGACGATGAGGCCTCTGCTACCGTGTCCAAGACGGAGACGAGCCAGGTGGCCCCGGCCTAA", - "target_sequence_type": "dna", - "target_uniprot_ref": null + "target_genes": { + "RHO": { + "target_gene_name": "RHO", + "target_gene_category": "protein_coding", + "target_sequence": 
"ATGAATGGCACAGAAGGCCCTAACTTCTACGTGCCCTTCTCCAATGCGACGGGTGTGGTACGCAGCCCCTTCGAGTACCCACAGTACTACCTGGCTGAGCCATGGCAGTTCTCCATGCTGGCCGCCTACATGTTTCTGCTGATCGTGCTGGGCTTCCCCATCAACTTCCTCACGCTCTACGTCACCGTCCAGCACAAGAAGCTGCGCACGCCTCTCAACTACATCCTGCTCAACCTAGCCGTGGCTGACCTCTTCATGGTCCTAGGTGGCTTCACCAGCACCCTCTACACCTCTCTGCATGGATACTTCGTCTTCGGGCCCACAGGATGCAATTTGGAGGGCTTCTTTGCCACCCTGGGCGGTGAAATTGCCCTGTGGTCCTTGGTGGTCCTGGCCATCGAGCGGTACGTGGTGGTGTGTAAGCCCATGAGCAACTTCCGCTTCGGGGAGAACCATGCCATCATGGGCGTTGCCTTCACCTGGGTCATGGCGCTGGCCTGCGCCGCACCCCCACTCGCCGGCTGGTCCAGGTACATCCCCGAGGGCCTGCAGTGCTCGTGTGGAATCGACTACTACACGCTCAAGCCGGAGGTCAACAACGAGTCTTTTGTCATCTACATGTTCGTGGTCCACTTCACCATCCCCATGATTATCATCTTTTTCTGCTATGGGCAGCTCGTCTTCACCGTCAAGGAGGCCGCTGCCCAGCAGCAGGAGTCAGCCACCACACAGAAGGCAGAGAAGGAGGTCACCCGCATGGTCATCATCATGGTCATCGCTTTCCTGATCTGCTGGGTGCCCTACGCCAGCGTGGCATTCTACATCTTCACCCACCAGGGCTCCAACTTCGGTCCCATCTTCATGACCATCCCAGCGTTCTTTGCCAAGAGCGCCGCCATCTACAACCCTGTCATCTATATCATGATGAACAAGCAGTTCCGGAACTGCATGCTCACCACCATCTGCTGCGGCAAGAACCCACTGGGTGACGATGAGGCCTCTGCTACCGTGTCCAAGACGGAGACGAGCCAGGTGGCCCCGGCCTAA", + "target_sequence_type": "dna", + "target_uniprot_ref": null + } + } }, { "urn": "urn:mavedb:00000103-c-1", - "target_gene_name": "MAPK1", - "target_gene_category": "protein_coding", - "target_sequence": "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS", - "target_sequence_type": "protein", - "target_uniprot_ref": null + "target_genes": { + "MAPK1": { + "target_gene_name": "MAPK1", + "target_gene_category": "protein_coding", + "target_sequence": 
"MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS", + "target_sequence_type": "protein", + "target_uniprot_ref": null + } + } }, { "urn": "urn:mavedb:00000041-a-1", - "target_gene_name": "Src catalytic domain", - "target_gene_category": "protein_coding", - "target_sequence": "CTGCGGCTGGAGGTCAAGCTGGGCCAGGGCTGCTTTGGCGAGGTGTGGATGGGGACCTGGAACGGTACCACCAGGGTGGCCATCAAAACCCTGAAGCCTGGCACGATGTCTCCAGAGGCCTTCCTGCAGGAGGCCCAGGTCATGAAGAAGCTGAGGCATGAGAAGCTGGTGCAGTTGTATGCTGTGGTTTCAGAGGAGCCCATTTACATCGTCACGGAGTACATGAGCAAGGGGAGTTTGCTGGACTTTCTCAAGGGGGAGACAGGCAAGTACCTGCGGCTGCCTCAGCTGGTGGACATGGCTGCTCAGATCGCCTCAGGCATGGCGTACGTGGAGCGGATGAACTACGTCCACCGGGACCTTCGTGCAGCCAACATCCTGGTGGGAGAGAACCTGGTGTGCAAAGTGGCCGACTTTGGGCTGGCTCGGCTCATTGAAGACAATGAGTACACGGCGCGGCAAGGTGCCAAATTCCCCATCAAGTGGACGGCTCCAGAAGCTGCCCTCTATGGCCGCTTCACCATCAAGTCGGACGTGTGGTCCTTCGGGATCCTGCTGACTGAGCTCACCACAAAGGGACGGGTGCCCTACCCTGGGATGGTGAACCGCGAGGTGCTGGACCAGGTGGAGCGGGGCTACCGGATGCCCTGCCCGCCGGAGTGTCCCGAGTCCCTGCACGACCTCATGTGCCAGTGCTGGCGGAAGGAGCCTGAGGAGCGGCCCACCTTCGAGTACCTGCAGGCCTTCCTG", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P12931", - "offset": 269 + "target_genes": { + "Src catalytic domain": { + "target_gene_name": "Src catalytic domain", + "target_gene_category": "protein_coding", + "target_sequence": 
"CTGCGGCTGGAGGTCAAGCTGGGCCAGGGCTGCTTTGGCGAGGTGTGGATGGGGACCTGGAACGGTACCACCAGGGTGGCCATCAAAACCCTGAAGCCTGGCACGATGTCTCCAGAGGCCTTCCTGCAGGAGGCCCAGGTCATGAAGAAGCTGAGGCATGAGAAGCTGGTGCAGTTGTATGCTGTGGTTTCAGAGGAGCCCATTTACATCGTCACGGAGTACATGAGCAAGGGGAGTTTGCTGGACTTTCTCAAGGGGGAGACAGGCAAGTACCTGCGGCTGCCTCAGCTGGTGGACATGGCTGCTCAGATCGCCTCAGGCATGGCGTACGTGGAGCGGATGAACTACGTCCACCGGGACCTTCGTGCAGCCAACATCCTGGTGGGAGAGAACCTGGTGTGCAAAGTGGCCGACTTTGGGCTGGCTCGGCTCATTGAAGACAATGAGTACACGGCGCGGCAAGGTGCCAAATTCCCCATCAAGTGGACGGCTCCAGAAGCTGCCCTCTATGGCCGCTTCACCATCAAGTCGGACGTGTGGTCCTTCGGGATCCTGCTGACTGAGCTCACCACAAAGGGACGGGTGCCCTACCCTGGGATGGTGAACCGCGAGGTGCTGGACCAGGTGGAGCGGGGCTACCGGATGCCCTGCCCGCCGGAGTGTCCCGAGTCCCTGCACGACCTCATGTGCCAGTGCTGGCGGAAGGAGCCTGAGGAGCGGCCCACCTTCGAGTACCTGCAGGCCTTCCTG", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:P12931", + "offset": 269 + } + } } }, { "urn": "urn:mavedb:00000018-a-1", - "target_gene_name": "HBB promoter", - "target_gene_category": "regulatory", - "target_sequence": "GGTGTCTGTTTGAGGTTGCTAGTGAACACAGTTGTGTCAGAAGCAAATGTAAGCAATAGATGGCTCTGCCCTGACTTTTATGCCCAGCCCTGGCTCCTGCCCTCCCTGCTCCTGGGAGTAGATTGGCCAACCCTAGGGTGTGGCTCCACAGGGTGAGGTCTAAGTGATGACAGCCGTACCTGTCCTT", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": null - }, - { - "urn": "urn:mavedb:00000001-a-4", - "target_gene_name": "UBE2I", - "target_gene_category": "protein_coding", - "target_sequence": "ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCATGGAGGAAAGACCACCCATTTGGTTTCGTGGCTGTCCCAACAAAAAATCCCGATGGCACGATGAACCTCATGAACTGGGAGTGCGCCATTCCAGGAAAGAAAGGGACTCCGTGGGAAGGAGGCTTGTTTAAACTACGGATGCTTTTCAAAGATGATTATCCATCTTCGCCACCAAAATGTAAATTCGAACCACCATTATTTCACCCGAATGTGTACCCTTCGGGGACAGTGTGCCTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAAACAGATCCTATTAGGAATACAGGAACTTCTAAATGAACCAAATATCCAAGACCCAGCTCAAGCAGAGGCCTACACGATTTACTGCCAAAACAGAGTGGAGTACGAGAAAAGGGTCCGAGCACAAGCCAAGAAGTTTGCGCCCTCATAA", - "target_sequence_type": "dna", - 
"target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P63279", - "offset": 0 + "target_genes": { + "HBB promoter": { + "target_gene_name": "HBB promoter", + "target_gene_category": "regulatory", + "target_sequence": "GGTGTCTGTTTGAGGTTGCTAGTGAACACAGTTGTGTCAGAAGCAAATGTAAGCAATAGATGGCTCTGCCCTGACTTTTATGCCCAGCCCTGGCTCCTGCCCTCCCTGCTCCTGGGAGTAGATTGGCCAACCCTAGGGTGTGGCTCCACAGGGTGAGGTCTAAGTGATGACAGCCGTACCTGTCCTT", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": null + } } }, { - "urn": "urn:mavedb:00000113-a-2", - "target_gene_name": "APP", - "target_gene_category": "protein_coding", - "target_sequence": "DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA", - "target_sequence_type": "protein", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P05067", - "offset": 0 + "urn": "urn:mavedb:00000001-a-4", + "target_genes": { + "UBE2I": { + "target_gene_name": "UBE2I", + "target_gene_category": "protein_coding", + "target_sequence": "ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCATGGAGGAAAGACCACCCATTTGGTTTCGTGGCTGTCCCAACAAAAAATCCCGATGGCACGATGAACCTCATGAACTGGGAGTGCGCCATTCCAGGAAAGAAAGGGACTCCGTGGGAAGGAGGCTTGTTTAAACTACGGATGCTTTTCAAAGATGATTATCCATCTTCGCCACCAAAATGTAAATTCGAACCACCATTATTTCACCCGAATGTGTACCCTTCGGGGACAGTGTGCCTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAAACAGATCCTATTAGGAATACAGGAACTTCTAAATGAACCAAATATCCAAGACCCAGCTCAAGCAGAGGCCTACACGATTTACTGCCAAAACAGAGTGGAGTACGAGAAAAGGGTCCGAGCACAAGCCAAGAAGTTTGCGCCCTCATAA", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:P63279", + "offset": 0 + } + } } }, { "urn": "urn:mavedb:00000098-a-1", - "target_gene_name": "SCN5A", - "target_gene_category": "protein_coding", - "target_sequence": "LFRVIRLARIGR", - "target_sequence_type": "protein", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:Q14524", - "offset": 1620 + "target_genes": { + "SCN5A": { + "target_gene_name": 
"SCN5A", + "target_gene_category": "protein_coding", + "target_sequence": "LFRVIRLARIGR", + "target_sequence_type": "protein", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:Q14524", + "offset": 1620 + } + } } }, { "urn": "urn:mavedb:00000061-h-1", - "target_gene_name": "RAF", - "target_gene_category": "protein_coding", - "target_sequence": "TCTAAGACAAGCAACACTATCCGTGTTTTCTTGCCGAACAAGCAAAGAACAGTGGTCAATGTGCGAAATGGAATGAGCTTGCATGACTGCCTTATGAAAGCACTCAAGGTGAGGGGC", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P04049", - "offset": 51 + "target_genes": { + "RAF": { + "target_gene_name": "RAF", + "target_gene_category": "protein_coding", + "target_sequence": "TCTAAGACAAGCAACACTATCCGTGTTTTCTTGCCGAACAAGCAAAGAACAGTGGTCAATGTGCGAAATGGAATGAGCTTGCATGACTGCCTTATGAAAGCACTCAAGGTGAGGGGC", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:P04049", + "offset": 51 + } + } } }, { "urn": "urn:mavedb:00000068-a-1", - "target_gene_name": "TP53 (P72R)", - "target_gene_category": "protein_coding", - "target_sequence": 
"ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCGCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTAG", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": null + "target_genes": { + "TP53 (P72R)": { + "target_gene_name": "TP53 (P72R)", + "target_gene_category": "protein_coding", + "target_sequence": 
"ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCGCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTAG", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": null + } + } }, { "urn": "urn:mavedb:00000093-a-1", - "target_gene_name": "BRCA1 translation start through RING domain", - "target_gene_category": "protein_coding", - "target_sequence": "ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAG", - "target_sequence_type": "dna", - "target_reference_genome": "hg19", - "target_uniprot_ref": { - "id": "uniprot:P38398", - "offset": 0 + "target_genes": { + "BRCA1 translation start through RING domain": { + "target_gene_name": 
"BRCA1 translation start through RING domain", + "target_gene_category": "protein_coding", + "target_sequence": "ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAG", + "target_sequence_type": "dna", + "target_reference_genome": "hg19", + "target_uniprot_ref": { + "id": "uniprot:P38398", + "offset": 0 + } + } } }, { "urn": "urn:mavedb:00000001-b-2", - "target_gene_name": "SUMO1", - "target_gene_category": "protein_coding", - "target_sequence": "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", - "target_sequence_type": "dna", - "target_uniprot_ref": { - "id": "uniprot:P63165", - "offset": 0 + "target_genes": { + "SUMO1": { + "target_gene_name": "SUMO1", + "target_gene_category": "protein_coding", + "target_sequence": "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", + "target_sequence_type": "dna", + "target_uniprot_ref": { + "id": "uniprot:P63165", + "offset": 0 + } + } } } ] diff --git a/tests/fixtures/transcript_result.json b/tests/fixtures/transcript_result.json index f93d5a9..6d3564a 100644 --- a/tests/fixtures/transcript_result.json +++ b/tests/fixtures/transcript_result.json @@ -1,82 +1,102 @@ { "urn:mavedb:00000002-a-2": { - "nm": "NM_001130145.3", - "np": "NP_001123617.1", - "start": 169, - 
"is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPR" + "hYAP65 WW domain": { + "nm": "NM_001130145.3", + "np": "NP_001123617.1", + "start": 169, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPR" + } }, "urn:mavedb:00000099-a-1": { - "nm": "NM_000539.3", - "np": "NP_000530.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA" + "RHO": { + "nm": "NM_000539.3", + "np": "NP_000530.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA" + } }, "urn:mavedb:00000103-c-1": { - "nm": "NM_002745.5", - "np": "NP_002736.3", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS" + "MAPK1": { + "nm": "NM_002745.5", + "np": "NP_002736.3", + "start": 0, + "is_full_match": true, + "transcript_mode": 
"mane_select", + "sequence": "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS" + } }, "urn:mavedb:00000041-a-1": { - "nm": "NM_198291.3", - "np": "NP_938033.1", - "start": 269, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "LRLEVKLGQGCFGEVWMGTWNGTTRVAIKTLKPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKYLRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYTARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVERGYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFL" + "Src catalytic domain": { + "nm": "NM_198291.3", + "np": "NP_938033.1", + "start": 269, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "LRLEVKLGQGCFGEVWMGTWNGTTRVAIKTLKPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKYLRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYTARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVERGYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFL" + } }, "urn:mavedb:00000001-a-4": { - "nm": "NM_003345.5", - "np": "NP_003336.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKLRMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELLNEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS" + "UBE2I": { + "nm": "NM_003345.5", + "np": "NP_003336.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKLRMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELLNEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS" + } }, "urn:mavedb:00000098-a-1": { - "nm": "NM_000335.5", - "np": "NP_000326.2", - "start": 1619, - "is_full_match": 
true, - "transcript_mode": "mane_select", - "sequence": "LFRVIRLARIGR" + "SCN5A": { + "nm": "NM_000335.5", + "np": "NP_000326.2", + "start": 1619, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "LFRVIRLARIGR" + } }, "urn:mavedb:00000061-h-1": { - "nm": "NM_002880.4", - "np": "NP_002871.1", - "start": 51, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "SKTSNTIRVFLPNKQRTVVNVRNGMSLHDCLMKALKVRG" + "RAF": { + "nm": "NM_002880.4", + "np": "NP_002871.1", + "start": 51, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "SKTSNTIRVFLPNKQRTVVNVRNGMSLHDCLMKALKVRG" + } }, "urn:mavedb:00000068-a-1": { - "nm": "NM_000546.6", - "np": "NP_000537.3", - "start": 0, - "is_full_match": false, - "transcript_mode": "mane_select", - "sequence": "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD" + "TP53 (P72R)": { + "nm": "NM_000546.6", + "np": "NP_000537.3", + "start": 0, + "is_full_match": false, + "transcript_mode": "mane_select", + "sequence": "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD" + } }, "urn:mavedb:00000093-a-1": { - "nm": "NM_007294.4", - "np": "NP_009225.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": 
"MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLE" + "BRCA1 translation start through RING domain": { + "nm": "NM_007294.4", + "np": "NP_009225.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLE" + } }, "urn:mavedb:00000001-b-2": { - "nm": "NM_003352.8", - "np": "NP_003343.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV" + "SUMO1": { + "nm": "NM_003352.8", + "np": "NP_003343.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV" + } } } diff --git a/tests/test_mavedb_data.py b/tests/test_mavedb_data.py index 31258d6..94cab8b 100644 --- a/tests/test_mavedb_data.py +++ b/tests/test_mavedb_data.py @@ -34,26 +34,49 @@ def test_get_scoreset_metadata( urn = "urn:mavedb:00000093-a-1" with requests_mock.Mocker() as m: m.get( - f"https://api.mavedb.org/api/v1/score-sets/{urn}", + f"http://api.mavedb.org/api/v1/score-sets/{urn}", json=scoreset_metadata_response[urn], ) scoreset_metadata = get_scoreset_metadata( urn, dcd_mapping_dir=resources_data_dir ) assert scoreset_metadata.urn == urn - assert scoreset_metadata.target_uniprot_ref - assert scoreset_metadata.target_uniprot_ref.id == "uniprot:P38398" - assert scoreset_metadata.target_uniprot_ref.offset == 0 + assert scoreset_metadata.target_genes + assert ( + scoreset_metadata.target_genes[ + "BRCA1 translation start through RING domain" + ].target_uniprot_ref.id + == "uniprot:P38398" + ) + assert ( + scoreset_metadata.target_genes[ + "BRCA1 translation start through RING domain" + ].target_uniprot_ref.offset + == 0 + ) -def 
test_get_scoreset_records(resources_data_dir: Path, fixture_data_dir: Path): +def test_get_scoreset_records( + resources_data_dir: Path, fixture_data_dir: Path, scoreset_metadata_response: dict +): urn = "urn:mavedb:00000093-a-1" with (fixture_data_dir / f"{urn}_scores.csv").open() as f: scores_csv_text = f.read() with requests_mock.Mocker() as m: m.get( - f"https://api.mavedb.org/api/v1/score-sets/{urn}/scores", + f"http://api.mavedb.org/api/v1/score-sets/{urn}", + json=scoreset_metadata_response[urn], + ) + scoreset_metadata = get_scoreset_metadata( + urn, dcd_mapping_dir=resources_data_dir + ) + m.get( + f"http://api.mavedb.org/api/v1/score-sets/{urn}/scores", text=scores_csv_text, ) - scoreset_records = get_scoreset_records(urn, dcd_mapping_dir=resources_data_dir) - assert len(scoreset_records) == 853 + scoreset_records = get_scoreset_records( + scoreset_metadata, dcd_mapping_dir=resources_data_dir + ) + assert ( + len(scoreset_records["BRCA1 translation start through RING domain"]) == 853 + ) diff --git a/tests/test_vrs_map.py b/tests/test_vrs_map.py index 37133c2..425011c 100644 --- a/tests/test_vrs_map.py +++ b/tests/test_vrs_map.py @@ -74,7 +74,9 @@ def get_fixtures( ): def _get_fixtures(urn: str): return ( - _load_scoreset_records(fixture_data_dir / f"{urn}_scores.csv"), + _load_scoreset_records( + fixture_data_dir / f"{urn}_scores.csv", scoreset_metadata_fixture[urn] + ), scoreset_metadata_fixture[urn], align_result_fixture[urn], transcript_results_fixture[urn], @@ -102,11 +104,18 @@ def test_2_a_2( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 1 + mappings = {} + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["hYAP65 WW domain"] is not None + assert 
len(mappings["hYAP65 WW domain"]) == 1 - for m in mappings: + for m in mappings["hYAP65 WW domain"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -154,11 +163,18 @@ def test_41_a_1( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 5 + mappings = {} + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["Src catalytic domain"] is not None + assert len(mappings["Src catalytic domain"]) == 5 - for m in mappings: + for m in mappings["Src catalytic domain"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -217,11 +233,18 @@ def test_99_a_1( "post_mapped": "ga4gh:VA.HSfipwsg28LbqwITCawzumz_OWZYu_jM", }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 8 # includes protein and genomic for all 4 rows + mappings = {} + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["RHO"] is not None + assert len(mappings["RHO"]) == 8 # includes protein and genomic for all 4 rows - for m in mappings: + for m in mappings["RHO"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -265,10 +288,17 @@ def test_103_c_1( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 4 - for m in mappings: + mappings = {} + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + 
align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["MAPK1"] is not None + assert len(mappings["MAPK1"]) == 4 + for m in mappings["MAPK1"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -289,9 +319,6 @@ def test_1_b_2( urn = "urn:mavedb:00000001-b-2" records, metadata, align_result, tx_result = get_fixtures(urn) - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - expected_mappings_data = { ("urn:mavedb:00000001-b-2#444", AnnotationLayer.PROTEIN): { "pre_mapped": "ga4gh:VA.ojIs4GEPbxiMt5E6InnF6k6m9ix_z3SH", @@ -327,10 +354,17 @@ def test_1_b_2( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 8 - for m in mappings: + mappings = {} + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["SUMO1"] is not None + assert len(mappings["SUMO1"]) == 8 + for m in mappings["SUMO1"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -342,14 +376,6 @@ def test_1_b_2( "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", [{"namespace": "ga4gh", "alias": "SQ.i1KiGldkfULl1XcEI-XBwhiM7x3PK5Xk"}], ), - ( - "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV", - [{"namespace": "ga4gh", "alias": "SQ.VkCzFNsbifqfq61Mud6oGmz0ID6CLIip"}], - ), - ( - 
"ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", - [{"namespace": "ga4gh", "alias": "SQ.i1KiGldkfULl1XcEI-XBwhiM7x3PK5Xk"}], - ), ] for call in store_calls: mock_seqrepo_access.sr.store.assert_any_call(*call) From c3ade2edfe776718982e6c93e6ab98b637b2b410 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 3 Jun 2025 16:41:28 -0700 Subject: [PATCH 17/23] Always return a custom JSON response type from mapping api This change also checks that the number of mappings for a score set is greater than 0 and returns an error if not. --- src/api/routers/map.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 4152cde..2e65d2a 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -39,7 +39,7 @@ @router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping) @with_mavedb_score_set -async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapping: +async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse: """Perform end-to-end mapping for a scoreset. :param urn: identifier for a scoreset. 
@@ -49,9 +49,11 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) except ScoresetNotSupportedError as e: - return ScoresetMapping( - metadata=None, - error_message=str(e).strip("'"), + return JSONResponse( + content=ScoresetMapping( + metadata=None, + error_message=str(e).strip("'"), + ).model_dump(exclude_none=True) ) except ResourceAcquisitionError as e: msg = f"Unable to acquire resource from MaveDB: {e}" @@ -116,11 +118,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - # TODO this should instead check if all values in dict are none. or might not need this at all. - if vrs_results is None or len(vrs_results) == 0: - return ScoresetMapping( - metadata=metadata, - error_message="No variant mappings available for this score set", + if not vrs_results or all( + mapping_result is None for mapping_result in vrs_results.values() + ): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="No variant mappings available for this score set", + ).model_dump(exclude_none=True) ) annotated_vrs_results = {} @@ -139,11 +144,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - # TODO this should instead check if all values in dict are none. or might not need this at all. 
- if annotated_vrs_results is None or len(annotated_vrs_results) == 0: - return ScoresetMapping( - metadata=metadata, - error_message="No annotated variant mappings available for this score set", + if not annotated_vrs_results or all( + mapping_result is None for mapping_result in annotated_vrs_results.values() + ): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="No annotated variant mappings available for this score set", + ).model_dump(exclude_none=True) ) try: From e4b41b2169a4d3146081298a40d9084d000ca4b2 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 3 Jun 2025 16:45:32 -0700 Subject: [PATCH 18/23] CLI: Check that number of mappings for a score set is greater than 0, and return an error if not. This change also removes a TODO for which there is now a backlog entry. --- src/dcd_mapping/main.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 6da9c8b..7ccf26c 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -233,8 +233,9 @@ async def map_scoreset( ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return - # TODO this should be if all values in dict are none. or might not need this at all. 
- if vrs_results is None or len(vrs_results) == 0: + if not vrs_results or all( + mapping_result is None for mapping_result in vrs_results.values() + ): _emit_info(f"No mapping available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, @@ -260,7 +261,7 @@ async def map_scoreset( metadata.urn, vrs_version, ) - except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors + except Exception as e: _emit_info( f"VRS annotation failed for scoreset {metadata.urn}", silent, @@ -273,8 +274,9 @@ async def map_scoreset( ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return - # TODO this should be if all values in dict are none. or might not need this at all. - if annotated_vrs_results is None: + if not annotated_vrs_results or all( + mapping_result is None for mapping_result in annotated_vrs_results.values() + ): _emit_info(f"No annotation available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, From 20e604dcddf4862b7775d1fe93abb1f764864323 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 3 Jun 2025 16:48:00 -0700 Subject: [PATCH 19/23] Remove TODO comments for which backlog entries have been created --- src/dcd_mapping/annotate.py | 18 +----------------- src/dcd_mapping/mavedb_data.py | 4 ---- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 0bdc7ea..9a9532e 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -436,10 +436,8 @@ def _get_computed_reference_sequence( # for accession-based target genes, the object returned by this function describes the provided reference accession # whereas the object returned by _get_mapped_reference_sequence describes the mapped reference accession, which could be a chromosome for ex. 
seq_type: TargetSequenceType - # TODO full list of protein accession id prefixes if metadata.target_accession_id.startswith(("NP", "ENSP")): seq_type = TargetSequenceType.PROTEIN - # TODO full list of transcript and contig accession id prefixes elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")): seq_type = TargetSequenceType.DNA else: @@ -452,8 +450,6 @@ def _get_computed_reference_sequence( ) if layer == AnnotationLayer.PROTEIN: if tx_output is None or isinstance(tx_output, TxSelectError): - # TODO catch this error - don't stop whole job for one failed target - # raise ValueError return None seq_id = f"ga4gh:SQ.{sha512t24u(tx_output.sequence.encode('ascii'))}" return ComputedReferenceSequence( @@ -488,15 +484,9 @@ def _get_mapped_reference_sequence( and isinstance(tx_output, TxSelectResult) ): if tx_output.np is None: - # TODO catch this error, don't fail whole job for one target - # msg = "No NP accession associated with reference transcript" - # raise ValueError(msg) return None vrs_id = get_vrs_id_from_identifier(tx_output.np) if vrs_id is None: - # TODO catch this error, don't fail whole job for one target - # msg = "ID could not be acquired from Seqrepo for transcript identifier" - # raise ValueError(msg) return None return MappedReferenceSequence( sequence_type=TargetSequenceType.PROTEIN, @@ -514,10 +504,7 @@ def _get_mapped_reference_sequence( seq_id = get_chromosome_identifier(align_result.chrom) vrs_id = get_vrs_id_from_identifier(seq_id) if vrs_id is None: - # TODO catch this error, don't fail whole job for one target - msg = "ID could not be acquired from Seqrepo for chromosome identifier" - raise ValueError(msg) - # return None + return None return MappedReferenceSequence( sequence_type=TargetSequenceType.DNA, sequence_id=vrs_id, @@ -584,9 +571,6 @@ def save_mapped_output_json( :return: output location """ # set preferred layers for each target, to allow a mix of coding and noncoding targets - # TODO maybe this should be reevaluated and 
we should only allow one preferred layer per score set, - # since I can't imagine an experimental assay where some variants are assayed as nucleotide variants - # and others are assayed as amino acid variants. reference_sequences: dict[str, dict] = {} mapped_scores: list[ScoreAnnotation] = [] for target_gene in mappings: diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 532f400..eadd421 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -190,7 +190,6 @@ def get_scoreset_metadata( for gene in metadata["targetGenes"]: if not _metadata_response_is_human(metadata): - # TODO allow score sets with mix of human and non-human targets? This may not come up, but is doable with a little restructuring. msg = f"Experiment for {scoreset_urn} contains non-human targets" raise ScoresetNotSupportedError(msg) try: @@ -251,15 +250,12 @@ def _load_scoreset_records( else: row["score"] = row["score"] if row["hgvs_nt"] != "NA": - # TODO check assumption of no colon in hgvs unless reference sequence identifier present prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None elif row["hgvs_pro"] != "NA": - # TODO check assumption of no colon in hgvs unless reference sequence identifier present prefix = ( row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None ) else: - # Should we quit the whole mapping job if this comes up, or just skip this row and only quit if none contain hgvs_nt or hgvs_pro? 
msg = f"Each score row in {metadata.urn} must contain hgvs_nt or hgvs_pro variant description " raise ScoresetNotSupportedError(msg) # If no reference sequence prefix is provided, the score set should only have one From dc7c48e11a0c7053ec599b213d5bc0dcea6876cc Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 3 Jun 2025 16:52:45 -0700 Subject: [PATCH 20/23] Fix BLAT-incompatible target names in multi-target score sets BLAT automatically removes certain characters from query names, including removing all characters after a space. If the BLAT result name does not match any target genes in the score set, attempt to match based on BLAT's query name shortening patterns. If multiple matches (could happen if labels are something like "Gene 1" and "Gene 2", in which case both would be shortened to "Gene"), raise an error. --- src/dcd_mapping/align.py | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index f99c4dc..b64f67a 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -4,7 +4,6 @@ import subprocess import tempfile from pathlib import Path -from typing import Any from urllib.parse import urlparse import requests @@ -167,7 +166,9 @@ def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType raise ValueError(msg) -def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: ANN401 +def _get_blat_output( + metadata: ScoresetMetadata, silent: bool +) -> dict[str, QueryResult]: """Run a BLAT query and returns a path to the output object. If unable to produce a valid query the first time, then try a query using ``dnax`` @@ -195,7 +196,6 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: try: output = parse_blat(out_file, "blat-psl") - # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? 
except ValueError: target_args = "-q=dnax -t=dnax" process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) @@ -336,13 +336,31 @@ def align( # blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets if target_label == "query" and len(scoreset_metadata.target_genes) == 1: target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 - # NOTE this is a temporary fix that will not work for multi-target score sets! - # blat automatically reformats query names. - if ( - target_label not in scoreset_metadata.target_genes - and len(scoreset_metadata.target_genes) == 1 - ): - target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 + # blat automatically reformats query names, so sometimes they don't match our metadata + if target_label not in scoreset_metadata.target_genes: + # if single-target score set, don't need to match by name + if len(scoreset_metadata.target_genes) == 1: + target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 + else: + # try to match query name to a target gene in the metadata + matches = 0 + for target_gene_name in scoreset_metadata.target_genes: + blat_target_gene_name = ( + target_gene_name.split(" ")[0] + .replace("(", "") + .replace(")", "") + .replace(",", "") + ) + if blat_target_gene_name == target_label: + target_label = target_gene_name + matches += 1 + # we may be missing some blat reformatting rules here - if so, this error will be thrown + if matches == 0: + msg = f"BLAT result {target_label} does not match any target gene names in scoreset {scoreset_metadata.urn}." 
+                raise AlignmentError(msg)
+            if matches > 1:  # could happen if multiple target genes have the same first word in their label (unlikely)
+                msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}"
+                raise AlignmentError(msg)
     target_gene = scoreset_metadata.target_genes[target_label]
     alignment_results[target_label] = _get_best_match(blat_result, target_gene)
     return alignment_results

From a59c5609c29bed1a5e2694914a574fc18b0f225c Mon Sep 17 00:00:00 2001
From: Sally Grindstaff
Date: Tue, 3 Jun 2025 16:55:38 -0700
Subject: [PATCH 21/23] Bump mapper version

---
 src/dcd_mapping/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py
index e095212..8c31b0b 100644
--- a/src/dcd_mapping/version.py
+++ b/src/dcd_mapping/version.py
@@ -1,3 +1,3 @@
 """Provide dcd mapping version"""
 
-dcd_mapping_version = "2024.1.3"
+dcd_mapping_version = "2025.1.0"

From bb04c51e030794dd91f00ef2b95e026b9128e24e Mon Sep 17 00:00:00 2001
From: Sally Grindstaff
Date: Tue, 3 Jun 2025 17:06:01 -0700
Subject: [PATCH 22/23] Use https for mock requests

---
 tests/test_mavedb_data.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_mavedb_data.py b/tests/test_mavedb_data.py
index 94cab8b..751d0c7 100644
--- a/tests/test_mavedb_data.py
+++ b/tests/test_mavedb_data.py
@@ -34,7 +34,7 @@ def test_get_scoreset_metadata(
     urn = "urn:mavedb:00000093-a-1"
     with requests_mock.Mocker() as m:
         m.get(
-            f"http://api.mavedb.org/api/v1/score-sets/{urn}",
+            f"https://api.mavedb.org/api/v1/score-sets/{urn}",
             json=scoreset_metadata_response[urn],
         )
         scoreset_metadata = get_scoreset_metadata(
@@ -64,14 +64,14 @@ def test_get_scoreset_records(
         scores_csv_text = f.read()
     with requests_mock.Mocker() as m:
         m.get(
-            f"http://api.mavedb.org/api/v1/score-sets/{urn}",
+            f"https://api.mavedb.org/api/v1/score-sets/{urn}",
             json=scoreset_metadata_response[urn],
         )
         scoreset_metadata = get_scoreset_metadata(
urn, dcd_mapping_dir=resources_data_dir ) m.get( - f"http://api.mavedb.org/api/v1/score-sets/{urn}/scores", + f"https://api.mavedb.org/api/v1/score-sets/{urn}/scores", text=scores_csv_text, ) scoreset_records = get_scoreset_records( From 37c504e8a475dfb45c01ab1e84cc97fd1dcd2f2d Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Jun 2025 10:05:21 -0700 Subject: [PATCH 23/23] Mock cdot data provider for tests Although cdot fetching is not currently directly used in any tests, the cdot REST data provider is imported into dcd_mapping.vrs_map. The cdot data provider uses a ChainedSeqFetcher, to which specific fasta file paths are passed in dcd_mapping.lookup. This results in a FileNotFoundError during tests, since these fasta files are not available outside of the mapper Docker container. This fix uses a fake fasta file to generate a ChainedSeqFetcher, so this change does not support actual cdot transcript fetches for future tests. --- tests/conftest.py | 16 +++++++ tests/fixtures/test.fasta | 2 + tests/fixtures/test.fasta.fai | 1 + tests/test_vrs_map.py | 89 +++++++++++++++++++++-------------- 4 files changed, 72 insertions(+), 36 deletions(-) create mode 100644 tests/fixtures/test.fasta create mode 100644 tests/fixtures/test.fasta.fai diff --git a/tests/conftest.py b/tests/conftest.py index c588794..e18a0e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,7 @@ from unittest.mock import MagicMock import pytest +from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher from dcd_mapping.schemas import ( AlignmentResult, @@ -216,3 +217,18 @@ def _translate_identifier( mocker.patch("dcd_mapping.vrs_map.get_seqrepo", return_value=mock_seqrepo_access) mocker.patch("dcd_mapping.lookup.get_seqrepo", return_value=mock_seqrepo_access) return mock_seqrepo_access + + +@pytest.fixture() +def data_provider(fixture_data_dir: Path): + """Provide a ChainedSeqFetcher with mocked fasta files for testing. 
+ Currently, no tests actually use cdot, but a FileNotFoundError would be raised + without this fixture when cdot is imported into dcd_mapping.vrs_map. + """ + # NOTE: This fasta file would not work for test cases that actually use cdot fetching, + # it is only meant to avoid a FileNotFoundError. + test_fasta_file = fixture_data_dir / "test.fasta" + + seqfetcher = ChainedSeqFetcher(FastaSeqFetcher(test_fasta_file)) + + yield seqfetcher # noqa: PT022 diff --git a/tests/fixtures/test.fasta b/tests/fixtures/test.fasta new file mode 100644 index 0000000..b310257 --- /dev/null +++ b/tests/fixtures/test.fasta @@ -0,0 +1,2 @@ +>test_contig_random_sequence +ATTCCATTGCTATATTGCCGGGATGCAATTTTAGCCCGAATATTTATAGCGCGCGTATGATCCTACATC diff --git a/tests/fixtures/test.fasta.fai b/tests/fixtures/test.fasta.fai new file mode 100644 index 0000000..2ebabf3 --- /dev/null +++ b/tests/fixtures/test.fasta.fai @@ -0,0 +1 @@ +test_contig_random_sequence 69 29 69 70 diff --git a/tests/test_vrs_map.py b/tests/test_vrs_map.py index 425011c..20b0244 100644 --- a/tests/test_vrs_map.py +++ b/tests/test_vrs_map.py @@ -4,10 +4,12 @@ we're focused on remaining consistent w/ previous results. * Move expected data into a separate JSON file or something? 
""" +from collections.abc import Generator from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest +from cdot.hgvs.dataproviders import ChainedSeqFetcher from cool_seq_tool.schemas import AnnotationLayer from ga4gh.vrs._internal.models import Allele, Haplotype @@ -88,6 +90,7 @@ def _get_fixtures(urn: str): def test_2_a_2( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000002-a-2" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -105,13 +108,15 @@ def test_2_a_2( } mappings = {} - for target_gene in metadata.target_genes: - mappings[target_gene] = vrs_map( - metadata=metadata.target_genes[target_gene], - align_result=align_result[target_gene], - records=records[target_gene], - transcript=tx_result[target_gene], - ) + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) assert mappings["hYAP65 WW domain"] is not None assert len(mappings["hYAP65 WW domain"]) == 1 @@ -136,6 +141,7 @@ def test_2_a_2( def test_41_a_1( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000041-a-1" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -164,13 +170,15 @@ def test_41_a_1( } mappings = {} - for target_gene in metadata.target_genes: - mappings[target_gene] = vrs_map( - metadata=metadata.target_genes[target_gene], - align_result=align_result[target_gene], - records=records[target_gene], - transcript=tx_result[target_gene], - ) + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + 
mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) assert mappings["Src catalytic domain"] is not None assert len(mappings["Src catalytic domain"]) == 5 @@ -195,6 +203,7 @@ def test_41_a_1( def test_99_a_1( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000099-a-1" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -234,13 +243,15 @@ def test_99_a_1( }, } mappings = {} - for target_gene in metadata.target_genes: - mappings[target_gene] = vrs_map( - metadata=metadata.target_genes[target_gene], - align_result=align_result[target_gene], - records=records[target_gene], - transcript=tx_result[target_gene], - ) + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) assert mappings["RHO"] is not None assert len(mappings["RHO"]) == 8 # includes protein and genomic for all 4 rows @@ -265,6 +276,7 @@ def test_99_a_1( def test_103_c_1( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000103-c-1" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -289,13 +301,15 @@ def test_103_c_1( } mappings = {} - for target_gene in metadata.target_genes: - mappings[target_gene] = vrs_map( - metadata=metadata.target_genes[target_gene], - align_result=align_result[target_gene], - records=records[target_gene], - transcript=tx_result[target_gene], - ) + with 
patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) assert mappings["MAPK1"] is not None assert len(mappings["MAPK1"]) == 4 for m in mappings["MAPK1"]: @@ -315,6 +329,7 @@ def test_103_c_1( def test_1_b_2( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000001-b-2" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -355,13 +370,15 @@ def test_1_b_2( } mappings = {} - for target_gene in metadata.target_genes: - mappings[target_gene] = vrs_map( - metadata=metadata.target_genes[target_gene], - align_result=align_result[target_gene], - records=records[target_gene], - transcript=tx_result[target_gene], - ) + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) assert mappings["SUMO1"] is not None assert len(mappings["SUMO1"]) == 8 for m in mappings["SUMO1"]: