Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 3 additions & 27 deletions src/mavedb/data_providers/services.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os
from datetime import date
from typing import Optional, TypedDict
from typing import Optional

import requests
from cdot.hgvs.dataproviders import SeqFetcher, ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider

from mavedb.lib.mapping import VRSMap

GENOMIC_FASTA_FILES = [
"/data/GCF_000001405.39_GRCh38.p13_genomic.fna.gz",
"/data/GCF_000001405.25_GRCh37.p13_genomic.fna.gz",
Expand All @@ -21,29 +21,5 @@ def cdot_rest() -> RESTDataProvider:
return RESTDataProvider(seqfetcher=seqfetcher())


class VRSMap:
url: str

class ScoreSetMappingResults(TypedDict):
metadata: Optional[dict[str, str]]
dcd_mapping_version: str
mapped_date_utc: date
computed_genomic_reference_sequence: Optional[dict[str, str]]
mapped_genomic_reference_sequence: Optional[dict[str, str]]
computed_protein_reference_sequence: Optional[dict[str, str]]
mapped_protein_reference_sequence: Optional[dict[str, str]]
mapped_scores: Optional[list[dict]]
error_message: Optional[str]

def __init__(self, url: str) -> None:
self.url = url

def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults:
uri = f"{self.url}/api/v1/map/{score_set_urn}"
response = requests.post(uri)
response.raise_for_status()
return response.json()


def vrs_mapper(url: Optional[str] = None) -> VRSMap:
return VRSMap(DCD_MAP_URL) if not url else VRSMap(url)
31 changes: 31 additions & 0 deletions src/mavedb/lib/mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from datetime import date
from typing import Optional, TypedDict, Union

import requests

ANNOTATION_LAYERS = {
"g": "genomic",
"p": "protein",
"c": "cdna",
}


class VRSMap:
url: str

class ScoreSetMappingResults(TypedDict):
metadata: Optional[dict[str, str]]
dcd_mapping_version: str
mapped_date_utc: date
reference_sequences: Optional[dict[str, dict[str, dict[str, dict[str, Union[str, list[str]]]]]]]
mapped_scores: Optional[list[dict]]
error_message: Optional[str]

def __init__(self, url: str) -> None:
self.url = url

def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults:
uri = f"{self.url}/api/v1/map/{score_set_urn}"
response = requests.post(uri)
response.raise_for_status()
return response.json()
77 changes: 37 additions & 40 deletions src/mavedb/scripts/populate_mapped_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
from sqlalchemy.orm import Session

from mavedb.data_providers.services import vrs_mapper
from mavedb.lib.exceptions import NonexistentMappingReferenceError
from mavedb.lib.logging.context import format_raised_exception_info_as_dict
from mavedb.lib.mapping import ANNOTATION_LAYERS
from mavedb.models.enums.mapping_state import MappingState
from mavedb.models.score_set import ScoreSet
from mavedb.models.mapped_variant import MappedVariant
from mavedb.models.target_gene import TargetGene
from mavedb.models.variant import Variant

from mavedb.scripts.environment import script_environment, with_database_session
Expand Down Expand Up @@ -91,47 +92,43 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all
db.commit()
logger.info(f"No mapped variants available for {score_set.urn}.")
else:
computed_genomic_ref = mapped_scoreset.get("computed_genomic_reference_sequence")
mapped_genomic_ref = mapped_scoreset.get("mapped_genomic_reference_sequence")
computed_protein_ref = mapped_scoreset.get("computed_protein_reference_sequence")
mapped_protein_ref = mapped_scoreset.get("mapped_protein_reference_sequence")

# assumes one target gene per score set, which is currently true in mavedb as of sept. 2024.
target_gene = db.scalars(
select(TargetGene)
.join(ScoreSet)
.where(
ScoreSet.urn == str(score_set.urn),
reference_metadata = mapped_scoreset.get("reference_sequences")
if not reference_metadata:
raise NonexistentMappingReferenceError()

for target_gene_identifier in reference_metadata:
target_gene = next(
(
target_gene
for target_gene in score_set.target_genes
if target_gene.name == target_gene_identifier
),
None,
)
).one()

excluded_pre_mapped_keys = {"sequence"}
if computed_genomic_ref and mapped_genomic_ref:
pre_mapped_metadata = computed_genomic_ref
target_gene.pre_mapped_metadata = cast(
{
"genomic": {
k: pre_mapped_metadata[k]
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
if not target_gene:
raise ValueError(
f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}."
)
# allow for multiple annotation layers
pre_mapped_metadata = {}
post_mapped_metadata = {}
excluded_pre_mapped_keys = {"sequence"}
for annotation_layer in reference_metadata[target_gene_identifier]:
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(
"computed_reference_sequence"
)
if layer_premapped:
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
k: layer_premapped[k]
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
}
},
JSONB,
)
target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB)
elif computed_protein_ref and mapped_protein_ref:
pre_mapped_metadata = computed_protein_ref
target_gene.pre_mapped_metadata = cast(
{
"protein": {
k: pre_mapped_metadata[k]
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
}
},
JSONB,
)
target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB)
else:
raise ValueError(f"incomplete or inconsistent metadata for score set {score_set.urn}")
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
"mapped_reference_sequence"
)
if layer_postmapped:
post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped
target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB)
target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB)

mapped_variants = [
variant_from_mapping(db=db, mapping=mapped_score, dcd_mapping_version=dcd_mapping_version)
Expand Down
79 changes: 34 additions & 45 deletions src/mavedb/worker/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
NonexistentMappingResultsError,
)
from mavedb.lib.logging.context import format_raised_exception_info_as_dict
from mavedb.lib.mapping import ANNOTATION_LAYERS
from mavedb.lib.score_sets import (
columns_for_dataset,
create_variants,
Expand Down Expand Up @@ -390,55 +391,43 @@ async def map_variants_for_score_set(
score_set.mapping_state = MappingState.failed
score_set.mapping_errors = {"error_message": mapping_results.get("error_message")}
else:
# TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support:
# this assumes single-target mapping, will need to be changed to support multi-target mapping
# just in case there are multiple target genes in the db for a score set (this point shouldn't be reached
# while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence.
# TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support:
# this also assumes that the score set is based on a target sequence, not a target accession

computed_genomic_ref = mapping_results.get("computed_genomic_reference_sequence")
mapped_genomic_ref = mapping_results.get("mapped_genomic_reference_sequence")
computed_protein_ref = mapping_results.get("computed_protein_reference_sequence")
mapped_protein_ref = mapping_results.get("mapped_protein_reference_sequence")

if computed_genomic_ref:
target_sequence = computed_genomic_ref["sequence"] # noqa: F841
elif computed_protein_ref:
target_sequence = computed_protein_ref["sequence"] # noqa: F841
else:
reference_metadata = mapping_results.get("reference_sequences")
if not reference_metadata:
raise NonexistentMappingReferenceError()

# TODO(VariantEffect/dcd_mapping2#2): Handle variant mappings for score sets with more than 1 target.
target_gene = score_set.target_genes[0]

excluded_pre_mapped_keys = {"sequence"}
if computed_genomic_ref and mapped_genomic_ref:
pre_mapped_metadata = computed_genomic_ref
target_gene.pre_mapped_metadata = cast(
{
"genomic": {
k: pre_mapped_metadata[k]
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
}
},
JSONB,
for target_gene_identifier in reference_metadata:
target_gene = next(
(
target_gene
for target_gene in score_set.target_genes
if target_gene.name == target_gene_identifier
),
None,
)
target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB)
elif computed_protein_ref and mapped_protein_ref:
pre_mapped_metadata = computed_protein_ref
target_gene.pre_mapped_metadata = cast(
{
"protein": {
k: pre_mapped_metadata[k]
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
if not target_gene:
raise ValueError(
f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}."
)
# allow for multiple annotation layers
pre_mapped_metadata = {}
post_mapped_metadata = {}
excluded_pre_mapped_keys = {"sequence"}
for annotation_layer in reference_metadata[target_gene_identifier]:
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(
"computed_reference_sequence"
)
if layer_premapped:
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
k: layer_premapped[k]
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
}
},
JSONB,
)
target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB)
else:
raise NonexistentMappingReferenceError()
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
"mapped_reference_sequence"
)
if layer_postmapped:
post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped
target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB)
target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB)

total_variants = 0
successful_mapped_variants = 0
Expand Down
Loading