diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml index 3656ac6..1c4587e 100644 --- a/.github/workflows/checks.yaml +++ b/.github/workflows/checks.yaml @@ -4,6 +4,8 @@ jobs: test: name: test py${{ matrix.python-version }} runs-on: ubuntu-latest + env: + MAVEDB_BASE_URL: https://api.mavedb.org strategy: matrix: python-version: ["3.11", "3.12"] diff --git a/Dockerfile b/Dockerfile index 22a982b..363ee58 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,6 +46,17 @@ RUN pip install -e '.[dev,tests]' RUN pip install -U polars-lts-cpu # install gene normalizer with pg dependencies. TODO: can the pg dependencies be specified in pyproject.toml? #RUN pip install 'gene-normalizer[pg]' + +# not working, needs to happen after db volume is mounted +# ENV GENE_NORM_DB_URL=postgres://postgres:postgres@db:5432/gene_normalizer +# RUN echo "y" | gene_norm_update_remote + ENV PYTHONUNBUFFERED 1 ENV PYTHONPATH "${PYTHONPATH}:/usr/src/app/src" + +# Tell Docker that we will listen on port 8000. +EXPOSE 8000 + +# At container startup, run the application using uvicorn. +CMD ["uvicorn", "api.server_main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index e2edbe4..f4266f8 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -34,6 +34,21 @@ services: volumes: - vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo + api: + build: + context: . + command: bash -c "uvicorn api.server_main:app --host 0.0.0.0 --port 8000 --reload" + depends_on: + - db + - seqrepo + env_file: + - settings/.env.dev + ports: + - "8004:8000" + volumes: + - .:/usr/src/app + - vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo + volumes: vrs-mapping-data-dev: vrs-mapping-seqrepo-dev: diff --git a/pyproject.toml b/pyproject.toml index e7ba7a2..331d3dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,10 @@ dependencies = [ "pydantic>=2", "python-dotenv", "setuptools>=68.0", # tmp -- ensure 3.12 compatibility - "mavehgvs==0.6.1" + "mavehgvs==0.6.1", + "fastapi", + "starlette", + "uvicorn" ] dynamic = ["version"] diff --git a/settings/.env.dev b/settings/.env.dev index f7980e6..4e761c7 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -19,6 +19,13 @@ POSTGRES_DB=gene_normalizer UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20180821 +#################################################################################################### +# Environment variables for MaveDB connection +#################################################################################################### + +MAVEDB_BASE_URL=http://localhost:8000 +MAVEDB_API_KEY= + #################################################################################################### # Environment variables for seqrepo #################################################################################################### diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..2e93371 --- /dev/null +++ b/src/api/__init__.py @@ -0,0 +1 @@ +"""Provide VRS mapping utilities API""" diff --git a/src/api/routers/__init__.py b/src/api/routers/__init__.py new file mode 100644 index 0000000..feb4a14 --- /dev/null +++ b/src/api/routers/__init__.py @@ -0,0 +1 @@ +"""Provide routers for dcd mapping API""" diff --git a/src/api/routers/map.py b/src/api/routers/map.py new file mode 100644 index 0000000..8577263 --- /dev/null +++ b/src/api/routers/map.py @@ -0,0 +1,161 @@ +""""Provide mapping router""" +from cool_seq_tool.schemas import AnnotationLayer +from fastapi import APIRouter, HTTPException +from fastapi.responses import JSONResponse +from requests import HTTPError + +from dcd_mapping.align import AlignmentError, BlatNotFoundError, align +from dcd_mapping.annotate import ( + _get_computed_reference_sequence, + _get_mapped_reference_sequence, + _set_scoreset_layer, + annotate, +) +from dcd_mapping.lookup import DataLookupError +from dcd_mapping.mavedb_data import ( + ScoresetNotSupportedError, + get_raw_scoreset_metadata, + get_scoreset_metadata, + get_scoreset_records, +) +from dcd_mapping.resource_utils import ResourceAcquisitionError +from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion +from dcd_mapping.transcripts import TxSelectError, select_transcript +from dcd_mapping.vrs_map import VrsMapError, vrs_map + +router = APIRouter( + prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}} +) + + +@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping) +async def map_scoreset(urn: str) -> ScoresetMapping: + """Perform end-to-end mapping for a scoreset. + + :param urn: identifier for a scoreset. + :param output_path: optional path to save output at + :param vrs_version: version of VRS objects to output (1.3 or 2) + :param silent: if True, suppress console information output + """ + try: + metadata = get_scoreset_metadata(urn) + records = get_scoreset_records(urn, True) + except ScoresetNotSupportedError as e: + return ScoresetMapping( + metadata=None, + error_message=str(e).strip("'"), + ) + except ResourceAcquisitionError as e: + msg = f"Unable to acquire resource from MaveDB: {e}" + raise HTTPException(status_code=500, detail=msg) from e + + try: + alignment_result = align(metadata, True) + except BlatNotFoundError as e: + msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." + raise HTTPException(status_code=500, detail=msg) from e + except ResourceAcquisitionError as e: + msg = f"BLAT resource could not be acquired: {e}" + raise HTTPException(status_code=500, detail=msg) from e + except AlignmentError as e: + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) + + try: + transcript = await select_transcript(metadata, records, alignment_result) + except (TxSelectError, KeyError, ValueError) as e: + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) + except HTTPError as e: + msg = f"HTTP error occurred during transcript selection: {e}" + raise HTTPException(status_code=500, detail=msg) from e + except DataLookupError as e: + msg = f"Data lookup error occurred during transcript selection: {e}" + raise HTTPException(status_code=500, detail=msg) from e + + try: + vrs_results = vrs_map(metadata, alignment_result, records, transcript, True) + except VrsMapError as e: + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) + if vrs_results is None: + return ScoresetMapping( + metadata=metadata, + error_message="No variant mappings available for this score set", + ) + + try: + vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2) + except Exception as e: + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) + if vrs_results is None: + return ScoresetMapping( + metadata=metadata, + error_message="No annotated variant mappings available for this score set", + ) + + try: + raw_metadata = get_raw_scoreset_metadata(urn) + preferred_layers = { + _set_scoreset_layer(urn, vrs_results), + } + + reference_sequences = { + layer: { + "computed_reference_sequence": None, + "mapped_reference_sequence": None, + } + for layer in AnnotationLayer + } + + for layer in preferred_layers: + reference_sequences[layer][ + "computed_reference_sequence" + ] = _get_computed_reference_sequence(urn, layer, transcript) + reference_sequences[layer][ + "mapped_reference_sequence" + ] = _get_mapped_reference_sequence(layer, transcript, alignment_result) + + mapped_scores: list[ScoreAnnotation] = [] + for m in vrs_results: + if m.annotation_layer in preferred_layers: + # drop annotation layer from mapping object + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + except Exception as e: + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) + + return JSONResponse( + content=ScoresetMapping( + metadata=raw_metadata, + computed_protein_reference_sequence=reference_sequences[ + AnnotationLayer.PROTEIN + ]["computed_reference_sequence"], + mapped_protein_reference_sequence=reference_sequences[ + AnnotationLayer.PROTEIN + ]["mapped_reference_sequence"], + computed_genomic_reference_sequence=reference_sequences[ + AnnotationLayer.GENOMIC + ]["computed_reference_sequence"], + mapped_genomic_reference_sequence=reference_sequences[ + AnnotationLayer.GENOMIC + ]["mapped_reference_sequence"], + mapped_scores=mapped_scores, + ).model_dump(exclude_none=True) + ) diff --git a/src/api/server_main.py b/src/api/server_main.py new file mode 100644 index 0000000..0df2a50 --- /dev/null +++ b/src/api/server_main.py @@ -0,0 +1,14 @@ +"""FastAPI server file""" +import uvicorn +from fastapi import FastAPI + +from api.routers import map + +app = FastAPI() + +app.include_router(map.router) + + +# If the application is not already being run within a uvicorn server, start uvicorn here. +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) # noqa: S104 diff --git a/src/dcd_mapping/__init__.py b/src/dcd_mapping/__init__.py index 5525c98..b571764 100644 --- a/src/dcd_mapping/__init__.py +++ b/src/dcd_mapping/__init__.py @@ -8,7 +8,9 @@ from dotenv import load_dotenv from .main import map_scoreset, map_scoreset_urn +from .version import dcd_mapping_version __all__ = ["map_scoreset", "map_scoreset_urn"] +__version__ = dcd_mapping_version load_dotenv() diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 8d9f878..db2d3f8 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -205,6 +205,7 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: else: if list(output): hit_chrs = [h.id for h in output] + # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome _logger.warning( "Failed to match hit chromosomes during alignment. URN: %s, expected chromosome: %s, hit chromosomes: %s", urn, @@ -221,8 +222,8 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: best_score_hit = hit if best_score_hit is None: - _logger.error("Couldn't get hits from %s -- check BLAT output.", urn) - raise AlignmentError + msg = f"Couldn't get BLAT hits from {urn}" + raise AlignmentError(msg) return best_score_hit @@ -246,12 +247,8 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP else: best_hsp = max(hit, key=lambda hsp: hsp.score) if best_hsp is None: - _logger.error( - "Unable to get best HSP from hit -- this should be impossible? urn: %s, hit: %s", - urn, - hit, - ) - raise AlignmentError + msg = f"Unable to get best HSP from BLAT hit: {hit}" + raise AlignmentError(msg) return best_hsp diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index fc58742..88fb6b9 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -251,34 +251,35 @@ def _annotate_allele_mapping( post_mapped: Allele = mapped_score.post_mapped # get vrs_ref_allele_seq for pre-mapped variants - pre_mapped.extensions = [_get_vrs_ref_allele_seq(post_mapped, metadata, tx_results)] - - # Determine reference sequence - if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: - sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}" - accession = get_chromosome_identifier_from_vrs_id(sequence_id) - if accession is None: - raise ValueError - if accession.startswith("refseq:"): - accession = accession[7:] - else: - if tx_results is None: - raise ValueError # impossible by definition - accession = tx_results.np + pre_mapped.extensions = [_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results)] + + if post_mapped: + # Determine reference sequence + if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: + sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}" + accession = get_chromosome_identifier_from_vrs_id(sequence_id) + if accession is None: + raise ValueError + if accession.startswith("refseq:"): + accession = accession[7:] + else: + if tx_results is None: + raise ValueError # impossible by definition + accession = tx_results.np - sr = get_seqrepo() - loc = mapped_score.post_mapped.location - sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) - post_mapped.extensions = [ - Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) - ] - hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) - post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] + sr = get_seqrepo() + loc = mapped_score.post_mapped.location + sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" + ref = sr.get_sequence(sequence_id, loc.start, loc.end) + post_mapped.extensions = [ + Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) + ] + hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) + post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _allele_to_vod(pre_mapped) - post_mapped = _allele_to_vod(post_mapped) + post_mapped = _allele_to_vod(post_mapped) if post_mapped else None return ScoreAnnotationWithLayer( pre_mapped=pre_mapped, @@ -287,24 +288,27 @@ def _annotate_allele_mapping( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score else None, annotation_layer=mapped_score.annotation_layer, + error_message=mapped_score.error_message + if mapped_score.error_message + else None, # TODO might not need if statement here ) def _annotate_haplotype_mapping( - mapping: MappedScore, + mapped_score: MappedScore, tx_results: TxSelectResult | None, metadata: ScoresetMetadata, vrs_version: VrsVersion = VrsVersion.V_2, ) -> ScoreAnnotationWithLayer: """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings.""" - pre_mapped: Haplotype = mapping.pre_mapped # type: ignore - post_mapped: Haplotype = mapping.post_mapped # type: ignore + pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore + post_mapped: Haplotype = mapped_score.post_mapped # type: ignore # get vrs_ref_allele_seq for pre-mapped variants for allele in pre_mapped.members: allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, tx_results)] # Determine reference sequence - if mapping.annotation_layer == AnnotationLayer.GENOMIC: + if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: sequence_id = ( f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}" ) @@ -318,28 +322,32 @@ def _annotate_haplotype_mapping( raise ValueError # impossible by definition accession = tx_results.np - sr = get_seqrepo() - for allele in post_mapped.members: - loc = allele.location - sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) # TODO type issues?? - allele.extensions = [ - Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) - ] - hgvs, syntax = _get_hgvs_string(allele, accession) - allele.expressions = [Expression(syntax=syntax, value=hgvs)] + if post_mapped: + sr = get_seqrepo() + for allele in post_mapped.members: + loc = allele.location + sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" + ref = sr.get_sequence(sequence_id, loc.start, loc.end) # TODO type issues?? + allele.extensions = [ + Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) + ] + hgvs, syntax = _get_hgvs_string(allele, accession) + allele.expressions = [Expression(syntax=syntax, value=hgvs)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped) - post_mapped = _haplotype_to_haplotype_1_3(post_mapped) + post_mapped = _haplotype_to_haplotype_1_3(post_mapped) if post_mapped else None return ScoreAnnotationWithLayer( pre_mapped=pre_mapped, post_mapped=post_mapped, vrs_version=vrs_version, - mavedb_id=mapping.accession_id, - score=float(mapping.score) if mapping.score is not None else None, - annotation_layer=mapping.annotation_layer, + mavedb_id=mapped_score.accession_id, + score=float(mapped_score.score) if mapped_score.score is not None else None, + annotation_layer=mapped_score.annotation_layer, + error_message=mapped_score.error_message + if mapped_score.error_message + else None, # TODO might not need if statement here ) @@ -367,16 +375,26 @@ def annotate( """ score_annotations = [] for mapped_score in mapped_scores: - if isinstance(mapped_score.pre_mapped, Haplotype) and isinstance( - mapped_score.post_mapped, Haplotype + if mapped_score.pre_mapped is None: + score_annotations.append( + ScoreAnnotationWithLayer( + mavedb_id=mapped_score.accession_id, + score=float(mapped_score.score) if mapped_score.score else None, + error_message=mapped_score.error_message, + ) + ) + elif isinstance(mapped_score.pre_mapped, Haplotype) and ( + isinstance(mapped_score.post_mapped, Haplotype) + or mapped_score.post_mapped is None ): score_annotations.append( _annotate_haplotype_mapping( mapped_score, tx_results, metadata, vrs_version ) ) - elif isinstance(mapped_score.pre_mapped, Allele) and isinstance( - mapped_score.post_mapped, Allele + elif isinstance(mapped_score.pre_mapped, Allele) and ( + isinstance(mapped_score.post_mapped, Allele) + or mapped_score.post_mapped is None ): score_annotations.append( _annotate_allele_mapping( @@ -384,6 +402,7 @@ def annotate( ) ) else: + # TODO how to combine this error message with other potential error messages? ValueError("inconsistent variant structure") return score_annotations @@ -432,9 +451,13 @@ def _get_mapped_reference_sequence( :return A MappedReferenceSequence object """ if layer == AnnotationLayer.PROTEIN and tx_output is not None: + if tx_output.np is None: + msg = "No NP accession associated with reference transcript" + raise ValueError(msg) vrs_id = get_vrs_id_from_identifier(tx_output.np) if vrs_id is None: - raise ValueError + msg = "ID could not be acquired from Seqrepo for transcript identifier" + raise ValueError(msg) return MappedReferenceSequence( sequence_type=TargetSequenceType.PROTEIN, sequence_id=vrs_id, @@ -443,7 +466,8 @@ def _get_mapped_reference_sequence( seq_id = get_chromosome_identifier(align_result.chrom) vrs_id = get_vrs_id_from_identifier(seq_id) if vrs_id is None: - raise ValueError + msg = "ID could not be acquired from Seqrepo for chromosome identifier" + raise ValueError(msg) return MappedReferenceSequence( sequence_type=TargetSequenceType.DNA, sequence_id=vrs_id, @@ -469,6 +493,28 @@ def _set_scoreset_layer( return AnnotationLayer.PROTEIN +def write_scoreset_mapping_to_json( + urn: str, + scoreset_mapping: ScoresetMapping, + output_path: Path | None = None, +) -> Path: + """Write the given ScoresetMapping as a JSON at the specified + or default ouput path. + """ + if not output_path: + now = datetime.datetime.now(tz=datetime.UTC).isoformat() + output_path = LOCAL_STORE_PATH / f"{urn}_mapping_{now}.json" + + _logger.info("Saving mapping output to %s", output_path) + with output_path.open("w") as file: + json.dump( + scoreset_mapping.model_dump(exclude_unset=False, exclude_none=True), + file, + indent=4, + ) + return output_path + + def save_mapped_output_json( urn: str, mappings: list[ScoreAnnotationWithLayer], @@ -507,10 +553,22 @@ def save_mapped_output_json( reference_sequences[layer][ "mapped_reference_sequence" ] = _get_mapped_reference_sequence(layer, tx_output, align_result) + # except Exception as e: + # _logger.warning( + # str(e) + # ) + # output = ScoresetMapping( + # metadata=metadata, + # error_message = str(e).strip("'") + # ) + + # return write_scoreset_mapping_to_json mapped_scores: list[ScoreAnnotation] = [] for m in mappings: - if m.annotation_layer in preferred_layers: + if m.pre_mapped is None: + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + elif m.annotation_layer in preferred_layers: # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) @@ -531,15 +589,4 @@ def save_mapped_output_json( mapped_scores=mapped_scores, ) - if not output_path: - now = datetime.datetime.now(tz=datetime.UTC).isoformat() - output_path = LOCAL_STORE_PATH / f"{urn}_mapping_{now}.json" - - _logger.info("Saving mapping output to %s", output_path) - with output_path.open("w") as file: - json.dump( - output.model_dump(exclude_unset=True, exclude_none=True), - file, - indent=4, - ) - return output_path + return write_scoreset_mapping_to_json(urn, output, output_path) diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index 42f80fb..a037af8 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -212,12 +212,15 @@ async def get_protein_accession(transcript: str) -> str | None: :param transcript: transcript accession, e.g. ``"NM_002529.3"`` :return: protein accession if successful """ - uta = CoolSeqToolBuilder().uta_db - query = f""" - SELECT pro_ac FROM {uta.schema}.associated_accessions - WHERE tx_ac = '{transcript}' - """ # noqa: S608 - result = await uta.execute_query(query) + try: + uta = CoolSeqToolBuilder().uta_db + query = f""" + SELECT pro_ac FROM {uta.schema}.associated_accessions + WHERE tx_ac = '{transcript}' + """ # noqa: S608 + result = await uta.execute_query(query) + except Exception as e: + raise DataLookupError from e if result: return result[0]["pro_ac"] return None @@ -239,16 +242,19 @@ async def get_transcripts( :param end: ending position :return: candidate transcript accessions """ - uta = CoolSeqToolBuilder().uta_db - query = f""" - SELECT tx_ac - FROM {uta.schema}.tx_exon_aln_v - WHERE hgnc = '{gene_symbol}' - AND ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i) - AND alt_ac = '{chromosome_ac}' - AND tx_ac NOT LIKE 'NR_%'; - """ # noqa: S608 - result = await uta.execute_query(query) + try: + uta = CoolSeqToolBuilder().uta_db + query = f""" + SELECT tx_ac + FROM {uta.schema}.tx_exon_aln_v + WHERE hgnc = '{gene_symbol}' + AND ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i) + AND alt_ac = '{chromosome_ac}' + AND tx_ac NOT LIKE 'NR_%'; + """ # noqa: S608 + result = await uta.execute_query(query) + except Exception as e: + raise DataLookupError from e return [row["tx_ac"] for row in result] @@ -428,7 +434,8 @@ def get_chromosome_identifier(chromosome: str) -> str: for ac in tmp_acs: acs.append(ac.split("refseq:")[-1]) if not acs: - raise KeyError + msg = f"Cannot retrieve NC identifier for {chromosome} from Seqrepo" + raise KeyError(msg) # make sure e.g. version .10 > version .9 sorted_results = sorted(acs, key=lambda i: int(i.split(".")[-1])) @@ -446,7 +453,10 @@ def get_ucsc_chromosome_name(chromosome: str) -> str: sr = CoolSeqToolBuilder().seqrepo_access result, _ = sr.translate_identifier(chromosome, "GRCh38") if not result: - raise KeyError + msg = ( + f"Cannot retrieve USCS-style chromosome name for {chromosome} from Seqrepo" + ) + raise KeyError(msg) sorted_results = sorted([r for r in result if "chr" in r]) try: @@ -465,7 +475,8 @@ def get_chromosome_identifier_from_vrs_id(sequence_id: str) -> str | None: sr = CoolSeqToolBuilder().seqrepo_access result, _ = sr.translate_identifier(sequence_id, "refseq") if not result: - raise KeyError + msg = f"Cannot retrieve NC identifier for {sequence_id} from Seqrepo" + raise KeyError(msg) sorted_results = sorted(result) return sorted_results[-1] @@ -479,8 +490,8 @@ def get_vrs_id_from_identifier(sequence_id: str) -> str | None: sr = CoolSeqToolBuilder().seqrepo_access result, _ = sr.translate_identifier(sequence_id, "ga4gh") if not result: - raise KeyError - + msg = f"Cannot retrieve GA4GH SQ identifier for {sequence_id} from Seqrepo" + raise KeyError(msg) sorted_results = sorted(result) return sorted_results[-1] diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 201f1b1..a51fb82 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -6,14 +6,29 @@ from pathlib import Path import click +from requests import HTTPError from dcd_mapping.align import AlignmentError, BlatNotFoundError, align -from dcd_mapping.annotate import annotate, save_mapped_output_json -from dcd_mapping.lookup import check_gene_normalizer, check_seqrepo, check_uta -from dcd_mapping.mavedb_data import get_scoreset_metadata, get_scoreset_records +from dcd_mapping.annotate import ( + annotate, + save_mapped_output_json, + write_scoreset_mapping_to_json, +) +from dcd_mapping.lookup import ( + DataLookupError, + check_gene_normalizer, + check_seqrepo, + check_uta, +) +from dcd_mapping.mavedb_data import ( + ScoresetNotSupportedError, + get_scoreset_metadata, + get_scoreset_records, +) from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ( ScoreRow, + ScoresetMapping, ScoresetMetadata, VrsVersion, ) @@ -145,22 +160,51 @@ async def map_scoreset( msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." _emit_info(msg, silent, logging.ERROR) raise e + except ResourceAcquisitionError as e: + _emit_info(f"BLAT resource could not be acquired: {e}", silent, logging.ERROR) + raise e except AlignmentError as e: _emit_info( f"Alignment failed for scoreset {metadata.urn} {e}", silent, logging.ERROR ) - raise e + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return _emit_info("Alignment complete.", silent) _emit_info("Selecting reference sequence...", silent) try: transcript = await select_transcript(metadata, records, alignment_result) - except TxSelectError as e: + except (TxSelectError, KeyError, ValueError) as e: _emit_info( f"Transcript selection failed for scoreset {metadata.urn}", silent, logging.ERROR, ) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return + except HTTPError as e: + _emit_info( + f"HTTP error occurred during transcript selection: {e}", + silent, + logging.ERROR, + ) + raise e + except DataLookupError as e: + _emit_info( + f"Data lookup error occurred during transcript selection: {e}", + silent, + logging.ERROR, + ) raise e _emit_info("Reference selection complete.", silent) @@ -171,22 +215,75 @@ async def map_scoreset( _emit_info( f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR ) - raise e + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return if vrs_results is None: - _emit_info(f"No mapping available for {metadata.urn}", silent) + _emit_info(f"No mapping available for {metadata.urn}", silent, logging.ERROR) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping( + metadata=metadata, + error_message="No variant mappings available for this score set", + ), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return _emit_info("VRS mapping complete.", silent) _emit_info("Annotating metadata and saving to file...", silent) - vrs_results = annotate(vrs_results, transcript, metadata, vrs_version) - final_output = save_mapped_output_json( - metadata.urn, - vrs_results, - alignment_result, - transcript, - prefer_genomic, - output_path, - ) + try: + vrs_results = annotate(vrs_results, transcript, metadata, vrs_version) + except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors + _emit_info( + f"VRS annotation failed for scoreset {metadata.urn}", silent, logging.ERROR + ) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return + if vrs_results is None: + _emit_info(f"No annotation available for {metadata.urn}", silent, logging.ERROR) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping( + metadata=metadata, + error_message="No annotated variant mappings available for this score set", + ), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return + try: + final_output = save_mapped_output_json( + metadata.urn, + vrs_results, + alignment_result, + transcript, + prefer_genomic, + output_path, + ) + except Exception as e: + _emit_info( + f"Error in creating or saving final score set mapping for {metadata.urn} {e}", + silent, + logging.ERROR, + ) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return _emit_info(f"Annotated scores saved to: {final_output}.", silent) @@ -207,6 +304,18 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn) records = get_scoreset_records(urn, silent) + except ScoresetNotSupportedError as e: + _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) + final_output = write_scoreset_mapping_to_json( + urn, + ScoresetMapping( + metadata=None, + error_message=str(e).strip("'"), + ), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return except ResourceAcquisitionError as e: msg = f"Unable to acquire resource from MaveDB: {e}" _logger.critical(msg) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index e858dc4..5b94e5d 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -15,7 +15,9 @@ from dcd_mapping.resource_utils import ( LOCAL_STORE_PATH, + MAVEDB_BASE_URL, ResourceAcquisitionError, + authentication_header, http_download, ) from dcd_mapping.schemas import ScoreRow, ScoresetMetadata, UniProtRef @@ -32,13 +34,21 @@ _logger = logging.getLogger(__name__) +class ScoresetNotSupportedError(Exception): + """Raise when a score set cannot be mapped because it has characteristics that are not currently supported.""" + + def get_scoreset_urns() -> set[str]: """Fetch all scoreset URNs. Since species is annotated at the scoreset target level, we can't yet filter on anything like `homo sapien` -- meaning this is fairly slow. :return: set of URN strings """ - r = requests.get("https://api.mavedb.org/api/v1/experiments/", timeout=30) + r = requests.get( + f"{MAVEDB_BASE_URL}/api/v1/experiments/", + timeout=30, + headers=authentication_header(), + ) r.raise_for_status() scoreset_urn_lists = [ experiment["scoreSetUrns"] @@ -74,7 +84,11 @@ def get_human_urns() -> list[str]: scoreset_urns = get_scoreset_urns() human_scoresets: list[str] = [] for urn in scoreset_urns: - r = requests.get(f"https://api.mavedb.org/api/v1/score-sets/{urn}", timeout=30) + r = requests.get( + f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}", + timeout=30, + headers=authentication_header(), + ) try: r.raise_for_status() except requests.exceptions.HTTPError: @@ -123,8 +137,8 @@ def get_raw_scoreset_metadata( dcd_mapping_dir = LOCAL_STORE_PATH metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json" if not metadata_file.exists(): - url = f"https://api.mavedb.org/api/v1/score-sets/{scoreset_urn}" - r = requests.get(url, timeout=30) + url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}" + r = requests.get(url, timeout=30, headers=authentication_header()) try: r.raise_for_status() except requests.HTTPError as e: @@ -156,13 +170,17 @@ def get_scoreset_metadata( """ metadata = get_raw_scoreset_metadata(scoreset_urn, dcd_mapping_dir) - if not _metadata_response_is_human(metadata): - msg = f"Experiment for {scoreset_urn} contains no human targets" - raise ResourceAcquisitionError(msg) if len(metadata["targetGenes"]) > 1: - msg = f"Multiple target genes for {scoreset_urn} -- look into this." - raise ResourceAcquisitionError(msg) + msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported." + raise ScoresetNotSupportedError(msg) gene = metadata["targetGenes"][0] + target_sequence_gene = gene.get("targetSequence") + if target_sequence_gene is None: + msg = f"No target sequence available for {scoreset_urn}. Accession-based score sets are not currently supported." + raise ScoresetNotSupportedError(msg) + if not _metadata_response_is_human(metadata): + msg = f"Experiment for {scoreset_urn} contains no human targets" + raise ScoresetNotSupportedError(msg) try: structured_data = ScoresetMetadata( urn=metadata["urn"], @@ -175,7 +193,7 @@ def get_scoreset_metadata( except (KeyError, ValidationError) as e: msg = f"Unable to extract metadata from API response for scoreset {scoreset_urn}: {e}" _logger.error(msg) - raise ResourceAcquisitionError(msg) from e + raise ScoresetNotSupportedError(msg) from e return structured_data @@ -238,7 +256,7 @@ def get_scoreset_records( if urn == "urn:mavedb:00000053-a-1": _get_experiment_53_scores(scores_csv, silent) else: - url = f"https://api.mavedb.org/api/v1/score-sets/{urn}/scores" + url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}/scores" try: http_download(url, scores_csv, silent) except requests.HTTPError as e: diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index c6269f2..153a99c 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -6,6 +6,9 @@ import requests from tqdm import tqdm +MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY") +MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL") + LOCAL_STORE_PATH = Path( os.environ.get( "DCD_MAPPING_RESOURCES_DIR", Path.home() / ".local" / "share" / "dcd_mapping" @@ -19,6 +22,11 @@ class ResourceAcquisitionError(Exception): """Raise when resource acquisition fails.""" +def authentication_header() -> dict | None: + """Fetch with api key envvar, if available.""" + return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None + + def http_download(url: str, out_path: Path, silent: bool = True) -> Path: """Download a file via HTTP. @@ -30,7 +38,9 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path: """ if not silent: click.echo(f"Downloading {out_path.name} to {out_path.parents[0].absolute()}") - with requests.get(url, stream=True, timeout=30) as r: + with requests.get( + url, stream=True, timeout=30, headers=authentication_header() + ) as r: r.raise_for_status() total_size = int(r.headers.get("content-length", 0)) with out_path.open("wb") as h: diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index d470cb9..45f70e2 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -1,12 +1,14 @@ """Provide class definitions for commonly-used information objects.""" +import datetime from enum import Enum from typing import Any, Literal from cool_seq_tool.schemas import AnnotationLayer, Strand, TranscriptPriority from ga4gh.vrs._internal.models import Allele, Haplotype -from pydantic import BaseModel, ConfigDict, StrictBool, StrictInt, StrictStr +from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictInt, StrictStr from dcd_mapping import vrs_v1_schemas +from dcd_mapping.version import dcd_mapping_version class TargetSequenceType(str, Enum): @@ -152,10 +154,11 @@ class MappedScore(BaseModel): model_config = ConfigDict(use_enum_values=True) accession_id: StrictStr - annotation_layer: AnnotationLayer score: str | None - pre_mapped: Allele | Haplotype - post_mapped: Allele | Haplotype + annotation_layer: AnnotationLayer | None = None + pre_mapped: Allele | Haplotype | None = None + post_mapped: Allele | Haplotype | None = None + error_message: str | None = None class ScoreAnnotation(BaseModel): @@ -164,12 +167,15 @@ class ScoreAnnotation(BaseModel): This model defines what an individual mapping instance looks like in the final JSON. """ - pre_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype - post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype - vrs_version: VrsVersion mavedb_id: StrictStr - relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to" - score: float | None + relation: Literal[ + "SO:is_homologous_to" + ] = "SO:is_homologous_to" # TODO this should probably be None if pre_mapped is false? + pre_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None + post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None + vrs_version: VrsVersion | None = None + score: float | None = None + error_message: str | None = None class ScoreAnnotationWithLayer(ScoreAnnotation): @@ -179,15 +185,20 @@ class ScoreAnnotationWithLayer(ScoreAnnotation): Used for filtering individual annotations just before saving the final JSON product. """ - annotation_layer: AnnotationLayer + annotation_layer: AnnotationLayer | None = None class ScoresetMapping(BaseModel): """Provide all mapped scores for a scoreset.""" metadata: Any # TODO get exact MaveDB metadata structure? - computed_protein_reference_sequence: ComputedReferenceSequence | None - mapped_protein_reference_sequence: MappedReferenceSequence | None - computed_genomic_reference_sequence: ComputedReferenceSequence | None - mapped_genomic_reference_sequence: MappedReferenceSequence | None - mapped_scores: list[ScoreAnnotation] + dcd_mapping_version: str = Field(default=dcd_mapping_version) + mapped_date_utc: str = Field( + default=datetime.datetime.now(tz=datetime.UTC).isoformat() + ) + computed_protein_reference_sequence: ComputedReferenceSequence | None = None + mapped_protein_reference_sequence: MappedReferenceSequence | None = None + computed_genomic_reference_sequence: ComputedReferenceSequence | None = None + mapped_genomic_reference_sequence: MappedReferenceSequence | None = None + mapped_scores: list[ScoreAnnotation] | None = None + error_message: str | None = None diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py new file mode 100644 index 0000000..8806049 --- /dev/null +++ b/src/dcd_mapping/version.py @@ -0,0 +1,3 @@ +"""Provide dcd mapping version""" + +dcd_mapping_version = "2024.1.0" diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index bf31775..28d5ab7 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -4,7 +4,6 @@ from collections.abc import Iterable from itertools import cycle -import click from Bio.Seq import Seq from cool_seq_tool.schemas import AnnotationLayer, Strand from ga4gh.core import ga4gh_identify, sha512t24u @@ -226,7 +225,7 @@ def _adjust_genomic_variant_to_ref( break if query_subrange_containing_hit is None or target_subrange_containing_hit is None: - msg = "Variant was not contained, or multi-position variant was not fully contained, within the aligned portion of the query sequence." + msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence" raise ValueError(msg) for idx, start in enumerate(starts): @@ -292,7 +291,7 @@ def _map_protein_coding_pro( row: ScoreRow, sequence_id: str, transcript: TxSelectResult, -) -> MappedScore | None: +) -> MappedScore: """Construct VRS object mapping for ``hgvs_pro`` variant column entry These arguments are a little lazy and could be pruned down later @@ -302,15 +301,27 @@ def _map_protein_coding_pro( :param transcript: The transcript selection information for a score set :return: VRS mapping object if mapping succeeds """ - if ( - row.hgvs_pro in {"_wt", "_sy", "NA"} - or "fs" in row.hgvs_pro - or len(row.hgvs_pro) == 3 - ): + if row.hgvs_pro in {"_wt", "_sy", "NA"} or len(row.hgvs_pro) == 3: _logger.warning( "Can't process variant syntax %s for %s", row.hgvs_pro, row.accession ) - return None + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message=f"Can't process variant syntax {row.hgvs_pro}", + ) + + if "fs" in row.hgvs_pro: + _logger.warning( + "Can't process variant syntax %s for %s because protein frameshift variants are not supported", + row.hgvs_pro, + row.accession, + ) + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Protein frameshift variants are not supported", + ) # TODO: Handle edge cases without hardcoding URNs. # Special case for experiment set urn:mavedb:0000097 @@ -324,47 +335,70 @@ def _map_protein_coding_pro( post_mapped=vrs_variation, ) - pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( - row.hgvs_pro, - AnnotationLayer.PROTEIN, - tx=transcript, - ) - post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( - row.hgvs_pro, - AnnotationLayer.PROTEIN, - tx=transcript, - ) - - pre_mapped_protein = _construct_vrs_allele( - pre_mapped_hgvs_strings, - AnnotationLayer.PROTEIN, - sequence_id, - True, - ) - post_mapped_protein = _construct_vrs_allele( - post_mapped_hgvs_strings, - AnnotationLayer.PROTEIN, - None, - False, - ) + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_pro, + AnnotationLayer.PROTEIN, + tx=transcript, + ) + pre_mapped_protein = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.PROTEIN, + sequence_id, + True, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating pre-mapped protein variant for %s, accession %s: %s", + row.hgvs_pro, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) - if pre_mapped_protein and post_mapped_protein: + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_pro, + AnnotationLayer.PROTEIN, + tx=transcript, + ) + post_mapped_protein = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.PROTEIN, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped protein variant for %s, accession %s: %s", + row.hgvs_pro, + row.accession, + str(e), + ) return MappedScore( accession_id=row.accession, score=row.score, annotation_layer=AnnotationLayer.PROTEIN, pre_mapped=pre_mapped_protein, - post_mapped=post_mapped_protein, + error_message=str(e).strip("'"), ) - return None + return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.PROTEIN, + pre_mapped=pre_mapped_protein, + post_mapped=post_mapped_protein, + ) def _map_genomic( row: ScoreRow, sequence_id: str, align_result: AlignmentResult, -) -> MappedScore | None: +) -> MappedScore: """Construct VRS object mapping for ``hgvs_nt`` variant column entry These arguments are a little lazy and could be pruned down later @@ -374,40 +408,78 @@ def _map_genomic( :param align_result: The transcript selection information for a score set :return: VRS mapping object if mapping succeeds """ - pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) - post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) + if ( + row.hgvs_nt in {"_wt", "_sy", "="} + or "fs" + in row.hgvs_nt # TODO I think this line can be taken out, I don't think fs nomenclature can be used for nt anyway + or len(row.hgvs_nt) == 3 + ): + _logger.warning( + "Can't process variant syntax %s for %s", row.hgvs_nt, row.accession + ) + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message=f"Can't process variant syntax {row.hgvs_nt}", + ) - pre_mapped_genomic = _construct_vrs_allele( - pre_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - sequence_id, - True, - ) - post_mapped_genomic = _construct_vrs_allele( - post_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - None, - False, - ) + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + sequence_id, + True, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) - if pre_mapped_genomic and post_mapped_genomic: + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) return MappedScore( accession_id=row.accession, score=row.score, annotation_layer=AnnotationLayer.GENOMIC, pre_mapped=pre_mapped_genomic, - post_mapped=post_mapped_genomic, + error_message=str(e), ) - return None + return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.GENOMIC, + pre_mapped=pre_mapped_genomic, + post_mapped=post_mapped_genomic, + ) def _get_allele_sequence(allele: Allele) -> str: @@ -483,24 +555,12 @@ def _map_protein_coding( hgvs_pro_mappings = _map_protein_coding_pro(row, psequence_id, transcript) if hgvs_pro_mappings: variations.append(hgvs_pro_mappings) - else: - _logger.warning( - "Encountered apparently invalid protein variants in %s: %s", - row.accession, - row.hgvs_pro, - ) if _hgvs_nt_is_valid(row.hgvs_nt): hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result) if hgvs_nt_mappings: variations.append(hgvs_nt_mappings) - else: - _logger.warning( - "Encountered apparently invalid genomic variants in %s: %s", - row.accession, - row.hgvs_nt, - ) return variations @@ -521,26 +581,8 @@ def _map_regulatory_noncoding( sequence_id = store_sequence(metadata.target_sequence) for row in records: - if ( - row.hgvs_nt in {"_wt", "_sy", "="} - or "fs" in row.hgvs_nt - or len(row.hgvs_nt) == 3 - ): - _logger.warning( - "Can't process variant syntax %s for %s", row.hgvs_nt, metadata.urn - ) - continue - hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result) - - if hgvs_nt_mappings: - variations.append(hgvs_nt_mappings) - else: - _logger.warning( - "Encountered apparently invalid genomic variants in %s: %s", - row.accession, - row.hgvs_nt, - ) + variations.append(hgvs_nt_mappings) return variations @@ -633,12 +675,13 @@ def vrs_map( :param silent: If true, suppress console output :return: A list of mapping results """ - if metadata.urn == "urn:mavedb:00000072-a-1": - msg = f"No RefSeq accession is available for {metadata.urn}." - if not silent: - click.echo(msg) - _logger.warning(msg) - return None + # TODO address this hardcoding, and if we keep it, this should be a score set mapping error message + # if metadata.urn == "urn:mavedb:00000072-a-1": + # msg = f"No RefSeq accession is available for {metadata.urn}." + # if not silent: + # click.echo(msg) + # _logger.warning(msg) + # return None if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript: return _map_protein_coding(