From f72eff9bf767b2ec684c4a4114353780cb26650a Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 12:59:15 -0600 Subject: [PATCH 01/62] feat: add boundary column --- .../boundary_rxns/naiveB_boundary_rxns.csv | 140 +++++++++--------- main/data/boundary_rxns/smB_boundary_rxns.csv | 140 +++++++++--------- 2 files changed, 140 insertions(+), 140 deletions(-) diff --git a/main/data/boundary_rxns/naiveB_boundary_rxns.csv b/main/data/boundary_rxns/naiveB_boundary_rxns.csv index 7130c9be..f9297391 100644 --- a/main/data/boundary_rxns/naiveB_boundary_rxns.csv +++ b/main/data/boundary_rxns/naiveB_boundary_rxns.csv @@ -1,70 +1,70 @@ -Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate -Exchange,glc_D,Extracellular,-100,1000 -Exchange,fe2,Extracellular,-1000,1000 -Exchange,gal,Extracellular,0,1000 -Exchange,zn2,Extracellular,-1000,1000 -Exchange,ca2,Extracellular,-1000,1000 -Exchange,na1,Extracellular,-1000,1000 -Exchange,cl,Extracellular,-1000,1000 -Exchange,k,Extracellular,-1000,1000 -Exchange,h,Extracellular,-1000,1000 -Exchange,h2o,Extracellular,-1000,1000 -Exchange,o2,Extracellular,-1000,1000 -Exchange,pi,Extracellular,-1000,1000 -Exchange,ppi,Extracellular,-1000,1000 -Exchange,co2,Extracellular,-1000,1000 -Exchange,nh4,Extracellular,-1000,1000 -Exchange,no2,Extracellular,-1000,1000 -Exchange,co,Extracellular,-1000,1000 -Exchange,asn_L,Extracellular,-1,1000 -Exchange,asp_L,Extracellular,-1,1000 -Exchange,glu_L,Extracellular,-1,1000 -Exchange,ile_L,Extracellular,-1,1000 -Exchange,leu_L,Extracellular,-1,1000 -Exchange,lys_L,Extracellular,-1,1000 -Exchange,met_L,Extracellular,-1,1000 -Exchange,pro_L,Extracellular,-1,1000 -Exchange,ser_L,Extracellular,-1,1000 -Exchange,val_L,Extracellular,-1,1000 -Exchange,gly,Extracellular,-1,1000 -Exchange,cys_L,Extracellular,-1,1000 -Exchange,ala_L,Extracellular,-1,1000 -Exchange,his_L,Extracellular,-1,1000 -Exchange,thr_L,Extracellular,-1,1000 -Exchange,gln_L,Extracellular,-1,1000 -Exchange,phe_L,Extracellular,-1,1000 -Exchange,tyr_L,Extracellular,-1,1000 -Exchange,arg_L,Extracellular,-1,1000 -Exchange,trp_L,Extracellular,-1,1000 -Exchange,eicostet,Extracellular,-1,1000 -Exchange,hdca,Extracellular,-1,1000 -Exchange,hdcea,Extracellular,-1,1000 -Exchange,lnlc,Extracellular,-1,1000 -Exchange,lnlnca,Extracellular,-1,1000 -Exchange,lnlncg,Extracellular,-1,1000 -Exchange,ocdca,Extracellular,-1,1000 -Exchange,ocdcea,Extracellular,-1,1000 -Exchange,thmmp,Extracellular,0,1000 -Exchange,thmtp,Extracellular,0,1000 -Exchange,ncam,Extracellular,0,1000 -Exchange,pnto_R,Extracellular,0,1000 -Exchange,pydxn,Extracellular,0,1000 -Exchange,pydx,Extracellular,0,1000 -Exchange,pydam,Extracellular,0,1000 -Exchange,btn,Extracellular,0,1000 -Exchange,fol,Extracellular,-10,1000 -Exchange,aqcobal,Extracellular,0,1000 -Exchange,vitd3,Extracellular,-1,1000 -Exchange,ascb_L,Extracellular,0,1000 -Exchange,retinol,Extracellular,-10,1000 -Exchange,retinal,Extracellular,-10,1000 -Exchange,ala_B,Extracellular,-1,1000 -Exchange,ala_D,Extracellular,-1,1000 -Exchange,h2co3,Extracellular,-1000,1000 -Exchange,h2o2,Extracellular,-1000,1000 -Exchange,hco3,Extracellular,-1000,1000 -Exchange,orn,Extracellular,-1,1000 -Exchange,orn_D,Extracellular,-1,1000 -Exchange,so4,Extracellular,-1000,1000 -Exchange,ribflv,Extracellular,-1,1000 -Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file +Boundary,Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate +Demand,Exchange,glc_D,Extracellular,-100,1000 +Demand,Exchange,fe2,Extracellular,-1000,1000 +Demand,Exchange,gal,Extracellular,0,1000 +Demand,Exchange,zn2,Extracellular,-1000,1000 +Demand,Exchange,ca2,Extracellular,-1000,1000 +Demand,Exchange,na1,Extracellular,-1000,1000 +Demand,Exchange,cl,Extracellular,-1000,1000 +Demand,Exchange,k,Extracellular,-1000,1000 +Demand,Exchange,h,Extracellular,-1000,1000 +Demand,Exchange,h2o,Extracellular,-1000,1000 +Demand,Exchange,o2,Extracellular,-1000,1000 +Demand,Exchange,pi,Extracellular,-1000,1000 +Demand,Exchange,ppi,Extracellular,-1000,1000 +Demand,Exchange,co2,Extracellular,-1000,1000 +Demand,Exchange,nh4,Extracellular,-1000,1000 +Demand,Exchange,no2,Extracellular,-1000,1000 +Demand,Exchange,co,Extracellular,-1000,1000 +Demand,Exchange,asn_L,Extracellular,-1,1000 +Demand,Exchange,asp_L,Extracellular,-1,1000 +Demand,Exchange,glu_L,Extracellular,-1,1000 +Demand,Exchange,ile_L,Extracellular,-1,1000 +Demand,Exchange,leu_L,Extracellular,-1,1000 +Demand,Exchange,lys_L,Extracellular,-1,1000 +Demand,Exchange,met_L,Extracellular,-1,1000 +Demand,Exchange,pro_L,Extracellular,-1,1000 +Demand,Exchange,ser_L,Extracellular,-1,1000 +Demand,Exchange,val_L,Extracellular,-1,1000 +Demand,Exchange,gly,Extracellular,-1,1000 +Demand,Exchange,cys_L,Extracellular,-1,1000 +Demand,Exchange,ala_L,Extracellular,-1,1000 +Demand,Exchange,his_L,Extracellular,-1,1000 +Demand,Exchange,thr_L,Extracellular,-1,1000 +Demand,Exchange,gln_L,Extracellular,-1,1000 +Demand,Exchange,phe_L,Extracellular,-1,1000 +Demand,Exchange,tyr_L,Extracellular,-1,1000 +Demand,Exchange,arg_L,Extracellular,-1,1000 +Demand,Exchange,trp_L,Extracellular,-1,1000 +Demand,Exchange,eicostet,Extracellular,-1,1000 +Demand,Exchange,hdca,Extracellular,-1,1000 +Demand,Exchange,hdcea,Extracellular,-1,1000 +Demand,Exchange,lnlc,Extracellular,-1,1000 +Demand,Exchange,lnlnca,Extracellular,-1,1000 +Demand,Exchange,lnlncg,Extracellular,-1,1000 +Demand,Exchange,ocdca,Extracellular,-1,1000 +Demand,Exchange,ocdcea,Extracellular,-1,1000 +Demand,Exchange,thmmp,Extracellular,0,1000 +Demand,Exchange,thmtp,Extracellular,0,1000 +Demand,Exchange,ncam,Extracellular,0,1000 +Demand,Exchange,pnto_R,Extracellular,0,1000 +Demand,Exchange,pydxn,Extracellular,0,1000 +Demand,Exchange,pydx,Extracellular,0,1000 +Demand,Exchange,pydam,Extracellular,0,1000 +Demand,Exchange,btn,Extracellular,0,1000 +Demand,Exchange,fol,Extracellular,-10,1000 +Demand,Exchange,aqcobal,Extracellular,0,1000 +Demand,Exchange,vitd3,Extracellular,-1,1000 +Demand,Exchange,ascb_L,Extracellular,0,1000 +Demand,Exchange,retinol,Extracellular,-10,1000 +Demand,Exchange,retinal,Extracellular,-10,1000 +Demand,Exchange,ala_B,Extracellular,-1,1000 +Demand,Exchange,ala_D,Extracellular,-1,1000 +Demand,Exchange,h2co3,Extracellular,-1000,1000 +Demand,Exchange,h2o2,Extracellular,-1000,1000 +Demand,Exchange,hco3,Extracellular,-1000,1000 +Demand,Exchange,orn,Extracellular,-1,1000 +Demand,Exchange,orn_D,Extracellular,-1,1000 +Demand,Exchange,so4,Extracellular,-1000,1000 +Demand,Exchange,ribflv,Extracellular,-1,1000 +Demand,Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file diff --git a/main/data/boundary_rxns/smB_boundary_rxns.csv b/main/data/boundary_rxns/smB_boundary_rxns.csv index 9f0f9385..f9297391 100644 --- a/main/data/boundary_rxns/smB_boundary_rxns.csv +++ b/main/data/boundary_rxns/smB_boundary_rxns.csv @@ -1,70 +1,70 @@ -Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate -Exchange,glc_D,Extracellular,-100,1000 -Exchange,fe2,Extracellular,-1000,1000 -Exchange,gal,Extracellular,0,1000 -Exchange,zn2,Extracellular,-1000,1000 -Exchange,ca2,Extracellular,-1000,1000 -Exchange,na1,Extracellular,-1000,1000 -Exchange,cl,Extracellular,-1000,1000 -Exchange,k,Extracellular,-1000,1000 -Exchange,h,Extracellular,-1000,1000 -Exchange,h2o,Extracellular,-1000,1000 -Exchange,o2,Extracellular,-1000,1000 -Exchange,pi,Extracellular,-1000,1000 -Exchange,ppi,Extracellular,-1000,1000 -Exchange,co2,Extracellular,-1000,1000 -Exchange,nh4,Extracellular,-1000,1000 -Exchange,no2,Extracellular,-1000,1000 -Exchange,co,Extracellular,-1000,1000 -Exchange,asn_L,Extracellular,-1,1000 -Exchange,asp_L,Extracellular,-1,1000 -Exchange,glu_L,Extracellular,-1,1000 -Exchange,ile_L,Extracellular,-1,1000 -Exchange,leu_L,Extracellular,-1,1000 -Exchange,lys_L,Extracellular,-1,1000 -Exchange,met_L,Extracellular,-1,1000 -Exchange,pro_L,Extracellular,-1,1000 -Exchange,ser_L,Extracellular,-1,1000 -Exchange,val_L,Extracellular,-1,1000 -Exchange,gly,Extracellular,-1,1000 -Exchange,cys_L,Extracellular,-1,1000 -Exchange,ala_L,Extracellular,-1,1000 -Exchange,his_L,Extracellular,-1,1000 -Exchange,thr_L,Extracellular,-1,1000 -Exchange,gln_L,Extracellular,-1,1000 -Exchange,phe_L,Extracellular,-1,1000 -Exchange,tyr_L,Extracellular,-1,1000 -Exchange,arg_L,Extracellular,-1,1000 -Exchange,trp_L,Extracellular,-1,1000 -Exchange,eicostet,Extracellular,-1,1000 -Exchange,hdca,Extracellular,-1,1000 -Exchange,hdcea,Extracellular,-1,1000 -Exchange,lnlc,Extracellular,-1,1000 -Exchange,lnlnca,Extracellular,-1,1000 -Exchange,lnlncg,Extracellular,-1,1000 -Exchange,ocdca,Extracellular,-1,1000 -Exchange,ocdcea,Extracellular,-1,1000 -Exchange,thmmp,Extracellular,0,1000 -Exchange,thmtp,Extracellular,0,1000 -Exchange,ncam,Extracellular,0,1000 -Exchange,pnto_R,Extracellular,0,1000 -Exchange,pydxn,Extracellular,0,1000 -Exchange,pydx,Extracellular,0,1000 -Exchange,pydam,Extracellular,0,1000 -Exchange,btn,Extracellular,0,1000 -Exchange,fol,Extracellular,-10,1000 -Exchange,aqcobal,Extracellular,0,1000 -Exchange,vitd3,Extracellular,-1,1000 -Exchange,ascb_L,Extracellular,0,1000 -Exchange,retinol,Extracellular,-10,1000 -Exchange,retinal,Extracellular,-10,1000 -Exchange,ala_B,Extracellular,-1,1000 -Exchange,ala_D,Extracellular,-1,1000 -Exchange,h2co3,Extracellular,-1000,1000 -Exchange,h2o2,Extracellular,-1000,1000 -Exchange,hco3,Extracellular,-1000,1000 -Exchange,orn,Extracellular,-1,1000 -Exchange,orn_D,Extracellular,-1,1000 -Exchange,so4,Extracellular,-1000,1000 -Exchange,ribflv,Extracellular,-1,1000 -Exchange,Lcystin,Extracellular,-1,1000 +Boundary,Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate +Demand,Exchange,glc_D,Extracellular,-100,1000 +Demand,Exchange,fe2,Extracellular,-1000,1000 +Demand,Exchange,gal,Extracellular,0,1000 +Demand,Exchange,zn2,Extracellular,-1000,1000 +Demand,Exchange,ca2,Extracellular,-1000,1000 +Demand,Exchange,na1,Extracellular,-1000,1000 +Demand,Exchange,cl,Extracellular,-1000,1000 +Demand,Exchange,k,Extracellular,-1000,1000 +Demand,Exchange,h,Extracellular,-1000,1000 +Demand,Exchange,h2o,Extracellular,-1000,1000 +Demand,Exchange,o2,Extracellular,-1000,1000 +Demand,Exchange,pi,Extracellular,-1000,1000 +Demand,Exchange,ppi,Extracellular,-1000,1000 +Demand,Exchange,co2,Extracellular,-1000,1000 +Demand,Exchange,nh4,Extracellular,-1000,1000 +Demand,Exchange,no2,Extracellular,-1000,1000 +Demand,Exchange,co,Extracellular,-1000,1000 +Demand,Exchange,asn_L,Extracellular,-1,1000 +Demand,Exchange,asp_L,Extracellular,-1,1000 +Demand,Exchange,glu_L,Extracellular,-1,1000 +Demand,Exchange,ile_L,Extracellular,-1,1000 +Demand,Exchange,leu_L,Extracellular,-1,1000 +Demand,Exchange,lys_L,Extracellular,-1,1000 +Demand,Exchange,met_L,Extracellular,-1,1000 +Demand,Exchange,pro_L,Extracellular,-1,1000 +Demand,Exchange,ser_L,Extracellular,-1,1000 +Demand,Exchange,val_L,Extracellular,-1,1000 +Demand,Exchange,gly,Extracellular,-1,1000 +Demand,Exchange,cys_L,Extracellular,-1,1000 +Demand,Exchange,ala_L,Extracellular,-1,1000 +Demand,Exchange,his_L,Extracellular,-1,1000 +Demand,Exchange,thr_L,Extracellular,-1,1000 +Demand,Exchange,gln_L,Extracellular,-1,1000 +Demand,Exchange,phe_L,Extracellular,-1,1000 +Demand,Exchange,tyr_L,Extracellular,-1,1000 +Demand,Exchange,arg_L,Extracellular,-1,1000 +Demand,Exchange,trp_L,Extracellular,-1,1000 +Demand,Exchange,eicostet,Extracellular,-1,1000 +Demand,Exchange,hdca,Extracellular,-1,1000 +Demand,Exchange,hdcea,Extracellular,-1,1000 +Demand,Exchange,lnlc,Extracellular,-1,1000 +Demand,Exchange,lnlnca,Extracellular,-1,1000 +Demand,Exchange,lnlncg,Extracellular,-1,1000 +Demand,Exchange,ocdca,Extracellular,-1,1000 +Demand,Exchange,ocdcea,Extracellular,-1,1000 +Demand,Exchange,thmmp,Extracellular,0,1000 +Demand,Exchange,thmtp,Extracellular,0,1000 +Demand,Exchange,ncam,Extracellular,0,1000 +Demand,Exchange,pnto_R,Extracellular,0,1000 +Demand,Exchange,pydxn,Extracellular,0,1000 +Demand,Exchange,pydx,Extracellular,0,1000 +Demand,Exchange,pydam,Extracellular,0,1000 +Demand,Exchange,btn,Extracellular,0,1000 +Demand,Exchange,fol,Extracellular,-10,1000 +Demand,Exchange,aqcobal,Extracellular,0,1000 +Demand,Exchange,vitd3,Extracellular,-1,1000 +Demand,Exchange,ascb_L,Extracellular,0,1000 +Demand,Exchange,retinol,Extracellular,-10,1000 +Demand,Exchange,retinal,Extracellular,-10,1000 +Demand,Exchange,ala_B,Extracellular,-1,1000 +Demand,Exchange,ala_D,Extracellular,-1,1000 +Demand,Exchange,h2co3,Extracellular,-1000,1000 +Demand,Exchange,h2o2,Extracellular,-1000,1000 +Demand,Exchange,hco3,Extracellular,-1000,1000 +Demand,Exchange,orn,Extracellular,-1,1000 +Demand,Exchange,orn_D,Extracellular,-1,1000 +Demand,Exchange,so4,Extracellular,-1000,1000 +Demand,Exchange,ribflv,Extracellular,-1,1000 +Demand,Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file From 7b581470111243532e9dc17b2a74490a30384d32 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 13:00:20 -0600 Subject: [PATCH 02/62] fix: allow new boundary column --- main/como/create_context_specific_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 373dab92..896c90cc 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -550,6 +550,7 @@ def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: df = _create_df(path) for column in df.columns: if column not in [ + "boundary", "reaction", "abbreviation", "compartment", From 897d4c0b3fb25684e5ae56d00ff2ef4a81b338c9 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 13:09:13 -0600 Subject: [PATCH 03/62] feat: write zFPKM graphs Allow writing zFPKM graphs to a specific location --- main/como/rnaseq.py | 32 ++++++++++++++++++++++++++------ main/como/rnaseq_gen.py | 13 +++++++++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/main/como/rnaseq.py b/main/como/rnaseq.py index 1c504d97..055078fc 100644 --- a/main/como/rnaseq.py +++ b/main/como/rnaseq.py @@ -460,10 +460,11 @@ def zfpkm_transform( return results, zfpkm_df -def zfpkm_plot(results, *, plot_xfloor: int = -4, subplot_titles: bool = True): +def zfpkm_plot(results, *, write_png_filepath: Path, plot_xfloor: int = -4, subplot_titles: bool = True): """Plot the log2(FPKM) density and fitted Gaussian for each sample. :param results: A dictionary of intermediate results from zfpkm_transform. + :param write_png_filepath: The path to write the plot to :param: subplot_titles: Whether to display facet titles (sample names). :param plot_xfloor: Lower limit for the x-axis. :param subplot_titles: Whether to display facet titles (sample names). @@ -509,7 +510,7 @@ def zfpkm_plot(results, *, plot_xfloor: int = -4, subplot_titles: bool = True): fig.update_layout(legend_tracegroupgap=0) fig.update_layout(height=600 * len(results), width=1000, title_text="zFPKM Plots", showlegend=True) - fig.write_image("zfpkm_plot.png") + fig.write_image(write_png_filepath) def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: @@ -619,7 +620,13 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO return metrics -def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: +def zfpkm_filter( + *, + metrics: NamedMetrics, + filtering_options: _FilteringOptions, + write_png_filepath: Path, + calcualte_fpkm: bool, +) -> NamedMetrics: """Apply zFPKM filtering to the FPKM matrix for a given sample.""" min_sample_expression = filtering_options.replicate_ratio high_confidence_sample_expression = filtering_options.high_replicate_ratio @@ -637,7 +644,7 @@ def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, minimums = matrix == 0 results, zfpkm_df = zfpkm_transform(matrix) zfpkm_df[minimums] = -4 - zfpkm_plot(results) + zfpkm_plot(results, write_png_filepath=write_png_filepath) # determine which genes are expressed min_samples = round(min_sample_expression * len(zfpkm_df.columns)) @@ -661,6 +668,7 @@ def filter_counts( technique: FilteringTechnique, filtering_options: _FilteringOptions, prep: RNASeqPreparationMethod, + write_zfpkm_png_filepath: Path, ) -> NamedMetrics: """Filter the count matrix based on the specified technique.""" match technique: @@ -671,9 +679,19 @@ def filter_counts( case FilteringTechnique.tpm: return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) case FilteringTechnique.zfpkm: - return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True) + return zfpkm_filter( + metrics=metrics, + filtering_options=filtering_options, + write_png_filepath=write_zfpkm_png_filepath, + calcualte_fpkm=True, + ) case FilteringTechnique.umi: - return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False) + return zfpkm_filter( + metrics=metrics, + filtering_options=filtering_options, + write_png_filepath=write_zfpkm_png_filepath, + calcualte_fpkm=False, + ) case _: raise ValueError(f"Technique must be one of {FilteringTechnique}") @@ -692,6 +710,7 @@ async def save_rnaseq_tests( high_batch_ratio: float, technique: FilteringTechnique, cut_off: int | float, + write_zfpkm_png_filepath: Path, ): """Save the results of the RNA-Seq tests to a CSV file.""" filtering_options = _FilteringOptions( @@ -726,6 +745,7 @@ async def save_rnaseq_tests( technique=technique, filtering_options=filtering_options, prep=prep, + write_zfpkm_png_filepath=write_zfpkm_png_filepath, ) expressed_genes: list[str] = [] diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 8f8ffa34..55234919 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -3,6 +3,7 @@ import argparse import asyncio from dataclasses import dataclass +from pathlib import Path import pandas as pd from fast_bioservices import Taxon @@ -24,6 +25,7 @@ class _Arguments: minimum_cutoff: int | str library_prep: RNASeqPreparationMethod taxon: Taxon + write_zfpkm_png_filepath: Path def __post_init__(self): self.library_prep = RNASeqPreparationMethod.from_string(str(self.library_prep)) @@ -48,6 +50,7 @@ async def _handle_context_batch( cut_off: int | float | str, prep: RNASeqPreparationMethod, taxon: Taxon, + write_zfpkm_png_filepath: Path, ) -> None: """Iterate through each context type and create rnaseq expression file. @@ -110,6 +113,7 @@ async def _handle_context_batch( technique=technique, cut_off=cut_off, taxon_id=taxon, + write_zfpkm_png_filepath=write_zfpkm_png_filepath, ) logger.success(f"Results saved at '{rnaseq_output_filepath}'") @@ -119,6 +123,7 @@ async def rnaseq_gen( config_filename: str, prep: RNASeqPreparationMethod, taxon_id: int | str | Taxon, + write_zfpkm_png_filepath: Path, replicate_ratio: float = 0.5, high_replicate_ratio: float = 1.0, batch_ratio: float = 0.5, @@ -180,6 +185,7 @@ async def rnaseq_gen( cut_off=cut_off, prep=prep, taxon=taxon_id, + write_zfpkm_png_filepath=write_zfpkm_png_filepath, ) @@ -287,6 +293,12 @@ def _parse_args() -> _Arguments: "Will separate samples into groups to only compare similarly prepared libraries. " "For example, mRNA, total-rna, scRNA, etc", ) + parser.add_argument( + "--write-zfpkm-png-filepath", + required=False, + type=Path, + help="If using zFPKM, the location to write graphs", + ) args = parser.parse_args() args.filtering_technique = args.filtering_technique.lower() args.taxon = Taxon.from_int(int(args.taxon)) if str(args.taxon).isdigit() else Taxon.from_string(str(args.taxon)) # type: ignore @@ -306,5 +318,6 @@ def _parse_args() -> _Arguments: cut_off=args.minimum_cutoff, prep=args.library_prep, taxon_id=args.taxon, + write_zfpkm_png_filepath=args.write_zfpkm_png_filepath, ) ) From 355b9df052fc0c68884687c063ccf0db27b9ad59 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:09:03 -0600 Subject: [PATCH 04/62] refactor: remove rnaseq.py This moves all components of rnaseq.py into rnaseq_gen.py --- main/como/rnaseq.py | 764 -------------------------------------------- 1 file changed, 764 deletions(-) delete mode 100644 main/como/rnaseq.py diff --git a/main/como/rnaseq.py b/main/como/rnaseq.py deleted file mode 100644 index cf45f39c..00000000 --- a/main/como/rnaseq.py +++ /dev/null @@ -1,764 +0,0 @@ -from __future__ import annotations - -import gc -import math -import multiprocessing -import time -from collections import namedtuple -from dataclasses import dataclass, field -from enum import Enum -from functools import partial -from multiprocessing.pool import Pool -from pathlib import Path -from typing import Callable, NamedTuple - -import numpy as np -import numpy.typing as npt -import pandas as pd -import plotly.graph_objs as go -import scanpy as sc -import sklearn -import sklearn.neighbors -from fast_bioservices import Taxon -from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol -from loguru import logger -from pandas import DataFrame -from plotly.subplots import make_subplots -from scipy.signal import find_peaks -from sklearn.neighbors import KernelDensity - -from como.migrations import gene_info_migrations -from como.types import RNAPrepMethod -from como.utils import convert_gene_data - - -class _FilteringOptions(NamedTuple): - replicate_ratio: float - batch_ratio: float - cut_off: float - high_replicate_ratio: float - high_batch_ratio: float - - -class FilteringTechnique(Enum): - """RNA sequencing filtering capabilities.""" - - cpm = "cpm" - zfpkm = "zfpkm" - tpm = "quantile" - umi = "umi" - - @staticmethod - def from_string(value: str) -> FilteringTechnique: - """Create a filtering technique object from a string.""" - match value.lower(): - case "cpm": - return FilteringTechnique.cpm - case "zfpkm": - return FilteringTechnique.zfpkm - case "quantile": - return FilteringTechnique.tpm - case "umi": - return FilteringTechnique.umi - case _: - possible_values = [t.value for t in FilteringTechnique] - raise ValueError(f"Filtering technique must be one of {possible_values}; got: {value}") - - -class LayoutMethod(Enum): - """RNA sequencing layout method.""" - - paired_end = "paired-end" - single_end = "single-end" - - -@dataclass -class _StudyMetrics: - study: str - num_samples: int - count_matrix: pd.DataFrame - fragment_lengths: npt.NDArray[np.float32] - sample_names: list[str] - layout: list[LayoutMethod] - entrez_gene_ids: list[str] - gene_sizes: npt.NDArray[np.float32] - __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) - __z_score_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) - __high_confidence_entrez_gene_ids: list[str] = field(default=list) - - def __post_init__(self): - for layout in self.layout: - if layout not in LayoutMethod: - raise ValueError(f"Layout must be 'paired-end' or 'single-end'; got: {layout}") - - @property - def normalization_matrix(self) -> pd.DataFrame: - return self.__normalization_matrix - - @normalization_matrix.setter - def normalization_matrix(self, value: pd.DataFrame) -> None: - self.__normalization_matrix = value - - @property - def z_score_matrix(self) -> pd.DataFrame: - return self.__z_score_matrix - - @z_score_matrix.setter - def z_score_matrix(self, value: pd.DataFrame) -> None: - self.__z_score_matrix = value - - @property - def high_confidence_entrez_gene_ids(self) -> list[str]: - return self.__high_confidence_entrez_gene_ids - - @high_confidence_entrez_gene_ids.setter - def high_confidence_entrez_gene_ids(self, values: list[str]) -> None: - self.__high_confidence_entrez_gene_ids = values - - -Density = namedtuple("Density", ["x", "y"]) - - -class _ZFPKMResult(NamedTuple): - zfpkm: pd.Series - density: Density - mu: float - std_dev: float - max_fpkm: float - - -class _ReadMatrixResults(NamedTuple): - metrics: dict[str, _StudyMetrics] - entrez_gene_ids: list[str] - - -NamedMetrics = dict[str, _StudyMetrics] - - -def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]: - """Return a function that filters rows of an array based on the sum of elements being greater than or equal to A at least k times. - - This code is based on the `kOverA` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA - - :param k: The minimum number of times the sum of elements must be greater than or equal to A. - :param a: The value to compare the sum of elements to. - :return: A function that accepts a NumPy array to perform the actual filtering - """ # noqa: E501 - - def filter_func(row: npt.NDArray) -> bool: - return np.sum(row >= a) >= k - - return filter_func - - -def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDArray], bool]) -> npt.NDArray: - """Apply a filter function to the rows of the data and return the filtered array. - - This code is based on the `genefilter` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/genefilter - - :param data: The data to filter - :param filter_func: THe function to filter the data by - :return: A NumPy array of the filtered data. - """ - match type(data): - case pd.DataFrame: - return data.apply(filter_func, axis=1).values - case npt.NDArray: - return np.apply_along_axis(filter_func, axis=1, arr=data) - case _: - raise ValueError("Unsupported data type. Must be a Pandas DataFrame or a NumPy array.") - - -async def _read_counts_matrix( - *, - context_name: str, - counts_matrix_filepath: Path, - config_filepath: Path, - gene_info_filepath: Path, - taxon_id: Taxon, -) -> _ReadMatrixResults: - """Read the counts matrix and returns the results. - - :param context_name: The context name being processed. Usually a cell type, but can be any string - :param counts_matrix_filepath: The file path to the gene count matrix - :param config_filepath: The file path to the Excel configuration file - :param gene_info_filepath: The file path to gene information generated by `rnaseq_preprocess.py` - :param taxon_id: The NCBI Taxon ID - :return: A dataclass `ReadMatrixResults` - """ - logger.trace(f"Reading config_filepath at '{config_filepath}'") - config_df: pd.DataFrame = pd.read_excel(config_filepath, sheet_name=context_name, header=0) - gene_info: pd.DataFrame = pd.read_csv(gene_info_filepath) - gene_info = gene_info[gene_info["ensembl_gene_id"] != "-"].reset_index(drop=True) - gene_info = gene_info_migrations(gene_info) - - match counts_matrix_filepath.suffix: - case ".csv": - logger.debug(f"Reading CSV file at '{counts_matrix_filepath}'") - counts_matrix: pd.DataFrame = pd.read_csv(counts_matrix_filepath, header=0) - if "ensembl_gene_id" not in counts_matrix.columns: - raise ValueError( - f"Counts matrix must contain a column named 'ensembl_gene_id'. " - f"Ensure the file '{counts_matrix_filepath}' contains this column." - ) - conversion = await ensembl_to_gene_id_and_symbol( - ids=counts_matrix["ensembl_gene_id"].tolist(), taxon=taxon_id - ) - counts_matrix = counts_matrix.merge(conversion, on="ensembl_gene_id", how="left") - - case ".h5ad": - logger.debug(f"Reading h5ad file at '{counts_matrix_filepath}'") - adata: sc.AnnData = sc.read_h5ad(counts_matrix_filepath) - counts_matrix: pd.DataFrame = adata.to_df().T # Make sample names the columns and gene data the index - - # Coherce the incoming gene data (i.e., Gene Symbols) into Entrez and Ensembl Gene IDs - conversion = await convert_gene_data(counts_matrix.index.tolist(), taxon_id) - counts_matrix.index.name = conversion.index.name - counts_matrix = counts_matrix.merge(conversion, left_index=True, right_index=True) - counts_matrix = counts_matrix[counts_matrix["entrez_gene_id"] != "-"] - counts_matrix.reset_index(inplace=True) - - # explicit garbage collection because this function runs for a little while - del adata - del conversion - gc.collect() - - case _: - raise ValueError( - f"Unknown file extension '{counts_matrix_filepath.suffix}'. Valid options are '.csv' or '.h5ad'." - ) - - if counts_matrix.empty: - raise ValueError( - f"Counts matrix is empty. Ensure the file contains data. " - f"Attempted to process file '{counts_matrix_filepath}'" - ) - - # Only include Entrez and Ensembl Gene IDs that are present in `gene_info` - counts_matrix["entrez_gene_id"] = counts_matrix["entrez_gene_id"].str.split("//") - counts_matrix = counts_matrix.explode("entrez_gene_id") - counts_matrix = counts_matrix.replace(to_replace="-", value=pd.NA).dropna() - counts_matrix["entrez_gene_id"] = counts_matrix["entrez_gene_id"].astype(int) - - gene_info = gene_info.replace(to_replace="-", value=pd.NA).dropna() - gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) - - counts_matrix = counts_matrix.merge( - gene_info[["entrez_gene_id", "ensembl_gene_id"]], - on=["entrez_gene_id", "ensembl_gene_id"], - how="inner", - ) - gene_info = gene_info.merge( - counts_matrix[["entrez_gene_id", "ensembl_gene_id"]], - on=["entrez_gene_id", "ensembl_gene_id"], - how="inner", - ) - - entrez_gene_ids: list[str] = gene_info["entrez_gene_id"].tolist() - metrics: NamedMetrics = {} - for study in config_df["study"].unique().tolist(): - study_sample_names = config_df[config_df["study"] == study]["sample_name"].tolist() - layouts = config_df[config_df["study"] == study]["layout"].tolist() - metrics[study] = _StudyMetrics( - count_matrix=counts_matrix[counts_matrix.columns.intersection(study_sample_names)], - fragment_lengths=config_df[config_df["study"] == study]["fragment_length"].values, - sample_names=study_sample_names, - layout=[LayoutMethod(layout) for layout in layouts], - num_samples=len(study_sample_names), - entrez_gene_ids=entrez_gene_ids, - gene_sizes=np.array(gene_info["size"].values).astype(np.float32), - study=study, - ) - metrics[study].fragment_lengths[np.isnan(metrics[study].fragment_lengths)] = 0 - metrics[study].count_matrix.index = pd.Index(entrez_gene_ids, name="entrez_gene_id") - - return _ReadMatrixResults(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].tolist()) - - -def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the Transcripts Per Million (TPM) for each sample in the metrics dictionary.""" - for sample in metrics: - count_matrix = metrics[sample].count_matrix - - gene_sizes = metrics[sample].gene_sizes - - tpm_matrix = pd.DataFrame(data=None, index=count_matrix.index, columns=count_matrix.columns) - for i in range(len(count_matrix.columns)): - values: pd.Series = count_matrix.iloc[:, i] + 1 # Add 1 to prevent division by 0 - rate = np.log(values.tolist()) - np.log(gene_sizes) - denominator = np.log(np.sum(np.exp(rate))) - tpm_value = np.exp(rate - denominator + np.log(1e6)) - tpm_matrix.iloc[:, i] = tpm_value - metrics[sample].normalization_matrix = tpm_matrix - - return metrics - - -def calculate_fpkm(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each sample in the metrics dictionary.""" # noqa: E501 - matrix_values = [] - for study in metrics: - for sample in range(metrics[study].num_samples): - layout = metrics[study].layout[sample] - count_matrix: npt.NDArray = metrics[study].count_matrix.iloc[:, sample].values - gene_size = metrics[study].gene_sizes - - count_matrix = count_matrix.astype(np.float32) - gene_size = gene_size.astype(np.float32) - - match layout: - case LayoutMethod.paired_end: # FPKM - mean_fragment_lengths = metrics[study].fragment_lengths[sample] - # Ensure non-negative value - effective_length = [max(0, size - (mean_fragment_lengths + 1)) for size in gene_size] - n = count_matrix.sum() - fpkm = ((count_matrix + 1) * 1e9) / (np.array(effective_length) * n) - matrix_values.append(fpkm) - case LayoutMethod.single_end: # RPKM - # Add a pseudocount before log to ensure log(0) does not happen - rate = np.log(count_matrix + 1) - np.log(gene_size) - exp_rate = np.exp(rate - np.log(np.sum(count_matrix)) + np.log(1e9)) - matrix_values.append(exp_rate) - case _: - raise ValueError("Invalid normalization method specified") - - fpkm_matrix = pd.DataFrame(matrix_values).T # Transpose is needed because values were appended as rows - fpkm_matrix = fpkm_matrix[~pd.isna(fpkm_matrix)] - metrics[study].normalization_matrix = fpkm_matrix - - metrics[study].normalization_matrix.columns = metrics[study].count_matrix.columns - - return metrics - - -def _zfpkm_calculation(col: pd.Series, kernel: KernelDensity, peak_parameters: tuple[float, float]) -> _ZFPKMResult: - """Log2 Transformations. - - Stabilize the variance in the data to make the distribution more symmetric; this is helpful for Gaussian fitting - - Kernel Density Estimation (kde) - - Non-parametric method to estimate the probability density function (PDF) of a random variable - - Estimates the distribution of log2-transformed FPKM values - - Bandwidth parameter controls the smoothness of the density estimate - - KDE Explanation - - A way to smooth a histogram to get a better idea of the underlying distribution of the data - - Given a set of data points, we want to understand how they are distributed. - Histograms can be useful, but are sensitive to bin size and number - - The KDE places a "kernel" - a small symmetric function (i.e., Gaussian curve) - at each data point - - The "kernel" acts as a weight, giving more weight to points closer to the center of the kernel, - and less weight to points farther away - - Kernel functions are summed along each point on the x-axis - - A smooth curve is created that represents the estimated density of the data - - Peak Finding - - Identifies that are above a certain height and separated by a minimum distance - - Represent potential local maxima in the distribution - - Peak Selection - - The peak with the highest x-value (from log2-FPKM) is chosen as the mean (mu) - of the "inactive" gene distribution - - The peak representing unexpressed or inactive genes should be at a lower FPKM - value compared to the peak representing expressed genes - - Standard Deviation Estimation - - The mean of log2-FPKM values are greater than the calculated mu - - Standard deviation is estimated based on the assumption that the right tail of the distribution - This represents expressed genes) can be approximated by a half-normal distribution - - zFPKM Transformation - - Centers disbribution around 0 and scales it by the standard deviation. - This makes it easier to compare gene expression across different samples - - Represents the number of standard deviations away from the mean of the "inactive" gene distribution - - Higher zFPKM values indicate higher expression levels relative to the "inactive" peak - - A zFPKM value of 0 represents the mean of the "inactive" distribution - - Research shows that a zFPKM value of -3 or greater can be used as - a threshold for calling a gene as "expressed" - : https://doi.org/10.1186/1471-2164-14-778 - """ - col_log2: npt.NDArray = np.log2(col + 1) - col_log2 = np.nan_to_num(col_log2, nan=0) - refit: KernelDensity = kernel.fit(col_log2.reshape(-1, 1)) # type: ignore - - # kde: KernelDensity = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(col_log2.reshape(-1, 1)) - x_range = np.linspace(col_log2.min(), col_log2.max(), 1000) - density = np.exp(refit.score_samples(x_range.reshape(-1, 1))) - peaks, _ = find_peaks(density, height=peak_parameters[0], distance=peak_parameters[1]) - peak_positions = x_range[peaks] - - mu = 0 - max_fpkm = 0 - stddev = 1 - - if len(peaks) != 0: - mu = peak_positions.max() - max_fpkm = density[peaks[np.argmax(peak_positions)]] - u = col_log2[col_log2 > mu].mean() - stddev = (u - mu) * np.sqrt(np.pi / 2) - zfpkm = pd.Series((col_log2 - mu) / stddev, dtype=np.float32, name=col.name) - - return _ZFPKMResult(zfpkm=zfpkm, density=Density(x_range, density), mu=mu, std_dev=stddev, max_fpkm=max_fpkm) - - -def zfpkm_transform( - fpkm_df: pd.DataFrame, - bandwidth: int = 0.5, - peak_parameters: tuple[float, float] = (0.02, 1.0), - update_every_percent: float = 0.1, -) -> tuple[dict[str, _ZFPKMResult], DataFrame]: - """Perform zFPKM calculation/transformation.""" - if update_every_percent > 1: - logger.warning( - f"update_every_percent should be a decimal value between 0 and 1; got: {update_every_percent} - " - f"will convert to percentage" - ) - update_every_percent /= 100 - - total = len(fpkm_df.columns) - update_per_step: int = int(np.ceil(total * update_every_percent)) - cores = multiprocessing.cpu_count() - 2 - logger.debug(f"Processing {total:,} samples through zFPKM transform using {cores} cores") - logger.debug( - f"Will update every {update_per_step:,} steps as this is approximately " - f"{update_every_percent:.1%} of {total:,}" - ) - - with Pool(processes=cores) as pool: - kernel = KernelDensity(kernel="gaussian", bandwidth=bandwidth) - chunksize = int(math.ceil(len(fpkm_df.columns) / (4 * cores))) - partial_func = partial(_zfpkm_calculation, kernel=kernel, peak_parameters=peak_parameters) - chunk_time = time.time() - start_time = time.time() - - log_padding = len(str(f"{total:,}")) - zfpkm_df = pd.DataFrame(data=0, index=fpkm_df.index, columns=fpkm_df.columns) - results: dict[str, _ZFPKMResult] = {} - result: _ZFPKMResult - for i, result in enumerate( - pool.imap( - partial_func, - (fpkm_df[col] for col in fpkm_df.columns), - chunksize=chunksize, - ) - ): - key = str(result.zfpkm.name) - results[key] = result - zfpkm_df[key] = result.zfpkm - - # show updates every X% and at the end, but skip on first iteration - if i != 0 and (i % update_per_step == 0 or i == total): - current_time = time.time() - chunk = current_time - chunk_time - total_time = current_time - start_time - formatted = f"{i:,}" - logger.debug( - f"Processed {formatted:>{log_padding}} of {total:,} - " - f"chunk took {chunk:.1f} seconds - " - f"running for {total_time:.1f} seconds" - ) - chunk_time = current_time - return results, zfpkm_df - - -def zfpkm_plot(results, *, plot_xfloor: int = -4, subplot_titles: bool = True): - """Plot the log2(FPKM) density and fitted Gaussian for each sample. - - :param results: A dictionary of intermediate results from zfpkm_transform. - :param: subplot_titles: Whether to display facet titles (sample names). - :param plot_xfloor: Lower limit for the x-axis. - :param subplot_titles: Whether to display facet titles (sample names). - """ - mega_df = pd.DataFrame(columns=["sample_name", "log2fpkm", "fpkm_density", "fitted_density_scaled"]) - for name, result in results.items(): - stddev = result.std_dev - x = np.array(result.density.x) - y = np.array(result.density.y) - - fitted = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi)) - max_fpkm = y.max() - max_fitted = fitted.max() - scale_fitted = fitted * (max_fpkm / max_fitted) - - df = pd.DataFrame( - { - "sample_name": [name] * len(x), - "log2fpkm": x, - "fpkm_density": y, - "fitted_density_scaled": scale_fitted, - } - ) - mega_df = pd.concat([mega_df, df], ignore_index=True) - - mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density") - subplot_titles = list(results.keys()) if subplot_titles else None - fig = make_subplots( - rows=len(results), - cols=1, - subplot_titles=subplot_titles, - vertical_spacing=min(0.05, (1 / (len(results) - 1))), - ) - - for i, (name, group) in enumerate(mega_df.groupby("sample_name"), start=1): - fig.add_trace( - trace=go.Scatter(x=group["log2fpkm"], y=group["density"], mode="lines", name=name, legendgroup=name), - row=i, - col=1, - ) - fig.update_xaxes(title_text="log2(FPKM)", range=[plot_xfloor, max(group["log2fpkm"].tolist())], row=i, col=1) - fig.update_yaxes(title_text="density [scaled]", row=i, col=1) - fig.update_layout(legend_tracegroupgap=0) - - fig.update_layout(height=600 * len(results), width=1000, title_text="zFPKM Plots", showlegend=True) - fig.write_image("zfpkm_plot.png") - - -def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the z-score for each sample in the metrics dictionary.""" - for sample in metrics: - log_matrix = np.log(metrics[sample].normalization_matrix) - z_matrix = pd.DataFrame( - data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names - ) - metrics[sample].z_score_matrix = z_matrix - return metrics - - -def cpm_filter( - *, - metrics: NamedMetrics, - filtering_options: _FilteringOptions, - output_csv_filepath: Path, -) -> NamedMetrics: - """Apply Counts Per Million (CPM) filtering to the count matrix for a given sample.""" - n_exp = filtering_options.replicate_ratio - n_top = filtering_options.high_replicate_ratio - cut_off = filtering_options.cut_off - - metric: _StudyMetrics - for metric in metrics.values(): - counts: pd.DataFrame = metric.count_matrix - entrez_ids: list[str] = metric.entrez_gene_ids - library_size: pd.DataFrame = counts.sum(axis=1) - - # For library_sizes equal to 0, add 1 to prevent divide by 0 - # This will not impact the final counts per million calculation because the original counts are still 0 - # thus, (0 / 1) * 1_000_000 = 0 - library_size[library_size == 0] = 1 - - output_csv_filepath.parent.mkdir(parents=True, exist_ok=True) - counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000 - counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids)) - logger.debug(f"Writing CPM matrix to {output_csv_filepath}") - counts_per_million.to_csv(output_csv_filepath, index=False) - - # TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason. - # Most likely due to multiplying by 1_000_000, not exactly sure why - - min_samples = round(n_exp * len(counts.columns)) # noqa: F841 - top_samples = round(n_top * len(counts.columns)) # noqa: F841 - test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) - for i in range(len(counts_per_million.columns)): - cutoff = ( - 10e6 / (np.median(np.sum(counts[:, i]))) - if cut_off == "default" - else 1e6 * cut_off / np.median(np.sum(counts[:, i])) - ) - test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) - - return metrics - - -def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions) -> NamedMetrics: - """Apply quantile-based filtering to the TPM matrix for a given sample.""" - # TODO: Write the TPM matrix to disk - - n_exp = filtering_options.replicate_ratio - n_top = filtering_options.high_replicate_ratio - cut_off = filtering_options.cut_off - metrics = calculate_tpm(metrics) - - sample: str - metric: _StudyMetrics - for sample, metric in metrics.items(): - entrez_ids = metric.entrez_gene_ids - gene_size = metric.gene_sizes - tpm_matrix: pd.DataFrame = metric.normalization_matrix - - min_samples = round(n_exp * len(tpm_matrix.columns)) - top_samples = round(n_top * len(tpm_matrix.columns)) - - tpm_quantile = tpm_matrix[tpm_matrix > 0] - quantile_cutoff = np.quantile( - a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0 - ) # Compute quantile across columns - boolean_expression = pd.DataFrame( - data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns - ).astype(int) - - min_func = k_over_a(min_samples, 0.9) - top_func = k_over_a(top_samples, 0.9) - - min_genes: npt.NDArray[bool] = genefilter(boolean_expression, min_func) - top_genes: npt.NDArray[bool] = genefilter(boolean_expression, top_func) - - # Only keep `entrez_gene_ids` that pass `min_genes` - metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes) if keep] - metric.gene_sizes = [gene for gene, keep in zip(gene_size, min_genes) if keep] - metric.count_matrix = metric.count_matrix.iloc[min_genes, :] - metric.normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :] - - keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes) if keep] - metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes) if keep] - - metrics = calculate_z_score(metrics) - - return metrics - - -def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: - """Apply zFPKM filtering to the FPKM matrix for a given sample.""" - min_sample_expression = filtering_options.replicate_ratio - high_confidence_sample_expression = filtering_options.high_replicate_ratio - cut_off = filtering_options.cut_off - - if calcualte_fpkm: - metrics = calculate_fpkm(metrics) - - metric: _StudyMetrics - for metric in metrics.values(): - # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead - matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix - matrix = matrix[matrix.sum(axis=1) > 0] - - minimums = matrix == 0 - results, zfpkm_df = zfpkm_transform(matrix) - zfpkm_df[minimums] = -4 - zfpkm_plot(results) - - # determine which genes are expressed - min_samples = round(min_sample_expression * len(zfpkm_df.columns)) - min_func = k_over_a(min_samples, cut_off) - min_genes: npt.NDArray[bool] = genefilter(zfpkm_df, min_func) - metric.entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, min_genes) if keep] - - # determine which genes are confidently expressed - top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) - top_func = k_over_a(top_samples, cut_off) - top_genes: npt.NDArray[bool] = genefilter(zfpkm_df, top_func) - metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, top_genes) if keep] - - return metrics - - -def filter_counts( - *, - metrics: NamedMetrics, - technique: FilteringTechnique, - filtering_options: _FilteringOptions, - cpm_output_filepath: Path | None = None, -) -> NamedMetrics: - """Filter the count matrix based on the specified technique.""" - match technique: - case FilteringTechnique.cpm: - if cpm_output_filepath is None: - raise ValueError("CPM output filepath must be provided") - return cpm_filter( - metrics=metrics, - filtering_options=filtering_options, - output_csv_filepath=cpm_output_filepath, - ) - - case FilteringTechnique.tpm: - return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) - - case FilteringTechnique.zfpkm: - return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True) - - case FilteringTechnique.umi: - return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False) - - case _: - raise ValueError(f"Technique must be one of {FilteringTechnique}") - - -async def save_rnaseq_tests( - context_name: str, - counts_matrix_filepath: Path, - config_filepath: Path, - gene_info_filepath: Path, - output_filepath: Path, - prep: RNAPrepMethod, - taxon_id: Taxon, - replicate_ratio: float, - batch_ratio: float, - high_replicate_ratio: float, - high_batch_ratio: float, - technique: FilteringTechnique, - cut_off: int | float, -): - """Save the results of the RNA-Seq tests to a CSV file.""" - filtering_options = _FilteringOptions( - replicate_ratio=replicate_ratio, - batch_ratio=batch_ratio, - cut_off=cut_off, - high_replicate_ratio=high_replicate_ratio, - high_batch_ratio=high_batch_ratio, - ) - - if prep == RNAPrepMethod.SCRNA: - technique = FilteringTechnique.umi - logger.warning( - "Single cell filtration does not normalize and assumes " - "gene counts are counted with Unique Molecular Identifiers (UMIs). " - "Setting filtering technique to UMI now." - ) - - read_counts_results: _ReadMatrixResults = await _read_counts_matrix( - context_name=context_name, - counts_matrix_filepath=counts_matrix_filepath, - config_filepath=config_filepath, - gene_info_filepath=gene_info_filepath, - taxon_id=taxon_id, - ) - metrics = read_counts_results.metrics - entrez_gene_ids = read_counts_results.entrez_gene_ids - - metrics = filter_counts( - metrics=metrics, - technique=technique, - filtering_options=filtering_options, - ) - - expressed_genes: list[str] = [] - top_genes: list[str] = [] - for metric in metrics.values(): - expressed_genes.extend(metric.entrez_gene_ids) - top_genes.extend(metric.high_confidence_entrez_gene_ids) - - expression_frequency = pd.Series(expressed_genes).value_counts() - expression_df = pd.DataFrame( - {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values} - ) - expression_df["prop"] = expression_df["frequency"] / len(metrics) - expression_df = expression_df[expression_df["prop"] >= filtering_options.batch_ratio] - - top_frequency = pd.Series(top_genes).value_counts() - top_df = pd.DataFrame({"entrez_gene_id": top_frequency.index, "frequency": top_frequency.values}) - top_df["prop"] = top_df["frequency"] / len(metrics) - top_df = top_df[top_df["prop"] >= filtering_options.high_batch_ratio] - - boolean_matrix = pd.DataFrame(data={"entrez_gene_id": entrez_gene_ids, "expressed": 0, "high": 0}) - for gene in entrez_gene_ids: - if gene in expression_df["entrez_gene_id"]: - boolean_matrix.loc[gene, "expressed"] = 1 - if gene in top_df["entrez_gene_id"]: - boolean_matrix.loc[gene, "high"] = 1 - - expressed_count = len(boolean_matrix[boolean_matrix["expressed"] == 1]) - high_confidence_count = len(boolean_matrix[boolean_matrix["high"] == 1]) - - boolean_matrix.to_csv(output_filepath, index=False) - logger.info( - f"{context_name} - Found {expressed_count} expressed genes, " - f"{high_confidence_count} of which are confidently expressed" - ) - logger.success(f"Wrote boolean matrix to {output_filepath}") From a43b20dd4b4771b941bfc1c24c087ed1860cc94e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:09:23 -0600 Subject: [PATCH 05/62] refactor: import items from rnaseq.py --- main/como/rnaseq_gen.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 47aeb8ee..134ada3a 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -1,16 +1,33 @@ from __future__ import annotations -import argparse -import asyncio -from dataclasses import dataclass +import math +import multiprocessing +import time +from collections import namedtuple +from dataclasses import dataclass, field +from enum import Enum +from functools import partial +from multiprocessing.pool import Pool +from pathlib import Path +from typing import Callable, NamedTuple +import numpy as np +import numpy.typing as npt import pandas as pd -from fast_bioservices import Taxon +import plotly.graph_objs as go +import scanpy as sc +import sklearn +import sklearn.neighbors +from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol from loguru import logger +from pandas import DataFrame +from plotly.subplots import make_subplots +from scipy.signal import find_peaks +from sklearn.neighbors import KernelDensity -from como import Config -from como.custom_types import RNAPrepMethod -from como.rnaseq import FilteringTechnique, save_rnaseq_tests +from como.migrations import gene_info_migrations +from como.project import Config +from como.types import RNAPrepMethod @dataclass From 5675a357a49b99589bd415c5b25a3e951cf55f5f Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:10:02 -0600 Subject: [PATCH 06/62] refactor: remove command line usage --- main/como/rnaseq_gen.py | 134 +--------------------------------------- 1 file changed, 3 insertions(+), 131 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 134ada3a..db3803a1 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -30,17 +30,13 @@ from como.types import RNAPrepMethod -@dataclass -class _Arguments: - config_file: str +class _FilteringOptions(NamedTuple): replicate_ratio: float batch_ratio: float + cut_off: float high_replicate_ratio: float high_batch_ratio: float - filtering_technique: FilteringTechnique - minimum_cutoff: int | str - library_prep: RNAPrepMethod - taxon: Taxon + def __post_init__(self): self.library_prep = RNAPrepMethod.from_string(str(self.library_prep)) @@ -200,128 +196,4 @@ async def rnaseq_gen( ) -def _parse_args() -> _Arguments: - parser = argparse.ArgumentParser( - prog="rnaseq_gen.py", - description="Generate a list of active and high-confidence genes from a counts matrix using a user defined " - "at normalization-technique at /work/data/results//rnaseq_.csv: " - "https://github.com/HelikarLab/FastqToGeneCounts", - epilog="For additional help, please post questions/issues in the MADRID GitHub repo at " - "https://github.com/HelikarLab/MADRID or email babessell@gmail.com", - ) - parser.add_argument( - "-c", - "--config-file", - type=str, - required=True, - dest="config_file", - help="Name of config .xlsx file in the /work/data/config_files/. Can be generated using " - "rnaseq_preprocess.py or manually created and imported into the Juypterlab", - ) - parser.add_argument( - "-r", - "--replicate-ratio", - type=float, - required=False, - default=0.5, - dest="replicate_ratio", - help="Ratio of replicates required for a gene to be active within that study/batch group " - "Example: 0.7 means that for a gene to be active, at least 70% of replicates in a group " - "must pass the cutoff after normalization", - ) - parser.add_argument( - "-g", - "--batch-ratio", - type=float, - required=False, - default=0.5, - dest="batch_ratio", - help="Ratio of groups (studies or batches) required for a gene to be active " - "Example: 0.7 means that for a gene to be active, at least 70% of groups in a study must " - "have passed the replicate ratio test", - ) - parser.add_argument( - "-rh", - "--high-replicate-ratio", - type=float, - required=False, - default=1.0, - dest="high_replicate_ratio", - help="Ratio of replicates required for a gene to be considered high-confidence. " - "High-confidence genes ignore consensus with other data-sources, such as proteomics. " - "Example: 0.9 means that for a gene to be high-confidence, " - "at least 90% of replicates in a group must pass the cutoff after normalization", - ) - parser.add_argument( - "-gh", - "--high-batch-ratio", - type=float, - required=False, - default=1.0, - dest="high_batch_ratio", - help="Ratio of studies/batches required for a gene to be considered high-confidence within that group. " - "High-confidence genes ignore consensus with other data-sources, like proteomics. " - "Example: 0.9 means that for a gene to be high-confidence, " - "at least 90% of groups in a study must have passed the replicate ratio test", - ) - parser.add_argument( - "--taxon", - "--taxon-id", - type=str, - required=True, - dest="taxon", - help="The NCBI Taxonomy ID that is being proessed. '9606' for humans, '10090' for mice.", - ) - parser.add_argument( - "-t", - "--filt-technique", - type=str, - required=False, - default="quantile", - dest="filtering_technique", - help="Technique to normalize and filter counts with. " - "Either 'zfpkm', 'quantile', or 'cpm'. More info about each method is discussed in pipeline.ipynb.", - ) - parser.add_argument( - "--minimum-cutoff", - type=int, - required=False, - default=None, - dest="minimum_cutoff", - help="The minimum cutoff used for the filtration technique. " - "If the filtering technique is zFPKM, the default is -3. " - "If the filtering technique is quantile-tpm, the default is 25. " - "If the filtering technique is flat-cpm, the default is determined dynamically. " - "If the filtering technique is quantile, the default is 25.", - ) - parser.add_argument( - "-p", - "--library-prep", - required=True, - choices=["total", "mrna", "scrna"], - dest="library_prep", - help="Library preparation method. " - "Will separate samples into groups to only compare similarly prepared libraries. " - "For example, mRNA, total-rna, scRNA, etc", - ) - args = parser.parse_args() - args.filtering_technique = args.filtering_technique.lower() - args.taxon = Taxon.from_int(int(args.taxon)) if str(args.taxon).isdigit() else Taxon.from_string(str(args.taxon)) # type: ignore - return _Arguments(**vars(args)) - - -if __name__ == "__main__": - args = _parse_args() - asyncio.run( - rnaseq_gen( - config_filename=args.config_file, - replicate_ratio=args.replicate_ratio, - batch_ratio=args.batch_ratio, - high_replicate_ratio=args.high_replicate_ratio, - high_batch_ratio=args.high_batch_ratio, - technique=args.filtering_technique, - cut_off=args.minimum_cutoff, - prep=args.library_prep, - taxon_id=args.taxon, - ) ) From b5a199c73bd4f3513960d9c15aecfde8b4d62c2f Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:10:23 -0600 Subject: [PATCH 07/62] refactor: add classes for rna processing --- main/como/rnaseq_gen.py | 120 ++++++++++++++++++++++++++++------------ 1 file changed, 86 insertions(+), 34 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index db3803a1..6966980a 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -38,45 +38,97 @@ class _FilteringOptions(NamedTuple): high_batch_ratio: float +class FilteringTechnique(Enum): + """RNA sequencing filtering capabilities.""" + + cpm = "cpm" + zfpkm = "zfpkm" + tpm = "quantile" + umi = "umi" + + @staticmethod + def from_string(value: str) -> FilteringTechnique: + """Create a filtering technique object from a string.""" + match value.lower(): + case "cpm": + return FilteringTechnique.cpm + case "zfpkm": + return FilteringTechnique.zfpkm + case "quantile": + return FilteringTechnique.tpm + case "umi": + return FilteringTechnique.umi + case _: + possible_values = [t.value for t in FilteringTechnique] + raise ValueError(f"Got a filtering technique of '{value}'; should be one of: {possible_values}") + + +class LayoutMethod(Enum): + """RNA sequencing layout method.""" + + paired_end = "paired-end" + single_end = "single-end" + + +@dataclass +class _StudyMetrics: + study: str + num_samples: int + count_matrix: pd.DataFrame + fragment_lengths: npt.NDArray[np.float32] + sample_names: list[str] + layout: list[LayoutMethod] + entrez_gene_ids: list[str] + gene_sizes: npt.NDArray[np.float32] + __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __z_score_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __high_confidence_entrez_gene_ids: list[str] = field(default=list) + def __post_init__(self): - self.library_prep = RNAPrepMethod.from_string(str(self.library_prep)) - self.filtering_technique = FilteringTechnique.from_string(str(self.filtering_technique)) + for layout in self.layout: + if layout not in LayoutMethod: + raise ValueError(f"Layout must be 'paired-end' or 'single-end'; got: {layout}") - if self.minimum_cutoff is None: - if self.filtering_technique == FilteringTechnique.tpm: - self.minimum_cutoff = 25 - elif self.filtering_technique == FilteringTechnique.cpm: - self.minimum_cutoff = "default" - elif self.filtering_technique == FilteringTechnique.zfpkm: - self.minimum_cutoff = -3 + @property + def normalization_matrix(self) -> pd.DataFrame: + return self.__normalization_matrix + @normalization_matrix.setter + def normalization_matrix(self, value: pd.DataFrame) -> None: + self.__normalization_matrix = value -async def _handle_context_batch( - config_filename: str, - replicate_ratio: float, - batch_ratio: float, - replicate_ratio_high: float, - batch_ratio_high: float, - technique: FilteringTechnique, - cut_off: int | float | str, - prep: RNAPrepMethod, - taxon: Taxon, -) -> None: - """Iterate through each context type and create rnaseq expression file. + @property + def z_score_matrix(self) -> pd.DataFrame: + return self.__z_score_matrix - :param config_filename: The configuration filename to read - :param replicate_ratio: The percentage of replicates that a gene must - appear in for a gene to be marked as "active" in a batch/study - :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" - :param replicate_ratio_high: The percentage of replicates that a gene must - appear in for a gene to be marked "highly confident" in its expression in a batch/study - :param batch_ratio_high: The percentage of batches that a gene must - appear in for a gene to be marked "highly confident" in its expression - :param technique: The filtering technique to use - :param cut_off: The cutoff value to use for the provided filtering technique - :param prep: The library preparation method - :param taxon: The NCBI Taxon ID - :return: None + @z_score_matrix.setter + def z_score_matrix(self, value: pd.DataFrame) -> None: + self.__z_score_matrix = value + + @property + def high_confidence_entrez_gene_ids(self) -> list[str]: + return self.__high_confidence_entrez_gene_ids + + @high_confidence_entrez_gene_ids.setter + def high_confidence_entrez_gene_ids(self, values: list[str]) -> None: + self.__high_confidence_entrez_gene_ids = values + + +class _ZFPKMResult(NamedTuple): + zfpkm: pd.Series + density: Density + mu: float + std_dev: float + max_fpkm: float + + +class _ReadMatrixResults(NamedTuple): + metrics: dict[str, _StudyMetrics] + entrez_gene_ids: list[str] + + +Density = namedtuple("Density", ["x", "y"]) +NamedMetrics = dict[str, _StudyMetrics] """ config = Config() From 5a2dd33e8f24f373ad8db2a417678f10d74973c3 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:11:10 -0600 Subject: [PATCH 08/62] feat: added k_over_a calculation --- main/como/rnaseq_gen.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 6966980a..c9ef8562 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -129,6 +129,22 @@ class _ReadMatrixResults(NamedTuple): Density = namedtuple("Density", ["x", "y"]) NamedMetrics = dict[str, _StudyMetrics] + + +def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]: + """Return a function that filters rows of an array based on the sum of elements being greater than or equal to A at least k times. + + This code is based on the `kOverA` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA + + :param k: The minimum number of times the sum of elements must be greater than or equal to A. + :param a: The value to compare the sum of elements to. + :return: A function that accepts a NumPy array to perform the actual filtering + """ # noqa: E501 + + def filter_func(row: npt.NDArray) -> bool: + return np.sum(row >= a) >= k + + return filter_func """ config = Config() From fbb2b765be8f90e548cc56ef82a884baaf85673e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:11:24 -0600 Subject: [PATCH 09/62] feat: added genefilter function --- main/como/rnaseq_gen.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index c9ef8562..2ad3390b 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -145,14 +145,27 @@ def filter_func(row: npt.NDArray) -> bool: return np.sum(row >= a) >= k return filter_func + + +def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDArray], bool]) -> npt.NDArray: + """Apply a filter function to the rows of the data and return the filtered array. + + This code is based on the `genefilter` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/genefilter + + :param data: The data to filter + :param filter_func: THe function to filter the data by + :return: A NumPy array of the filtered data. """ - config = Config() + if not isinstance(data, (pd.DataFrame, npt.NDArray)): + raise TypeError("Unsupported data type. Must be a Pandas DataFrame or a NumPy array.") + + return ( + data.apply(filter_func, axis=1).values + if isinstance(data, pd.DataFrame) + else np.apply_along_axis(filter_func, axis=1, arr=data) + ) + - config_filepath = config.config_dir / config_filename - if not config_filepath.exists(): - raise FileNotFoundError(f"Unable to find '{config_filename}' at the path: '{config_filepath}'") - xl = pd.ExcelFile(config_filepath) - sheet_names = xl.sheet_names logger.info(f"Reading config file: {config_filepath}") From 1de1717baad4bb3d029b0bd243a0523b887caf5e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:12:08 -0600 Subject: [PATCH 10/62] refactor: create separate read_counts matrix Reduce complexity of building matrix results --- main/como/rnaseq_gen.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 2ad3390b..795e21d3 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -166,8 +166,22 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr ) +async def _read_counts(path: Path) -> pd.DataFrame: + if path.suffix not in {".csv", ".h5ad"}: + raise ValueError(f"Unknown file extension '{path.suffix}'. Valid options are '.csv' or '.h5ad'.") + + matrix: pd.DataFrame + if path.suffix == ".csv": + logger.debug(f"Reading CSV file at '{path}'") + matrix = pd.read_csv(path, header=0) + elif path.suffix == ".h5ad": + logger.debug(f"Reading h5ad file at '{path}'") + # Make sample names the columns and gene data the index + matrix = sc.read_h5ad(path).to_df().T + + return matrix + - logger.info(f"Reading config file: {config_filepath}") for context_name in sheet_names: logger.debug(f"Starting '{context_name}'") From 279f7cc513e9bfb2501f432ede90f4f42a470024 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:12:44 -0600 Subject: [PATCH 11/62] feat: bring zfpkm_filter from rnaseq.py --- main/como/rnaseq_gen.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 795e21d3..600bfa23 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -222,6 +222,41 @@ async def _read_counts(path: Path) -> pd.DataFrame: logger.success(f"Results saved at '{rnaseq_output_filepath}'") +def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: + """Apply zFPKM filtering to the FPKM matrix for a given sample.""" + min_sample_expression = filtering_options.replicate_ratio + high_confidence_sample_expression = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + + if calcualte_fpkm: + metrics = calculate_fpkm(metrics) + + metric: _StudyMetrics + for metric in metrics.values(): + # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead + matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix + matrix = matrix[matrix.sum(axis=1) > 0] + + minimums = matrix == 0 + results, zfpkm_df = zfpkm_transform(matrix) + zfpkm_df[minimums] = -4 + zfpkm_plot(results) + + # determine which genes are expressed + min_samples = round(min_sample_expression * len(zfpkm_df.columns)) + min_func = k_over_a(min_samples, cut_off) + min_genes: npt.NDArray[bool] = genefilter(zfpkm_df, min_func) + metric.entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, min_genes) if keep] + + # determine which genes are confidently expressed + top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) + top_func = k_over_a(top_samples, cut_off) + top_genes: npt.NDArray[bool] = genefilter(zfpkm_df, top_func) + metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, top_genes) if keep] + + return metrics + + async def rnaseq_gen( # config_filepath: Path, config_filename: str, From 7e2fe3d663a91b928a42e5979d33c5f67b71f479 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:13:28 -0600 Subject: [PATCH 12/62] feat: added matrix builder This is functionally equivalent to the _read_counts_matrix function from rnaseq.py --- main/como/rnaseq_gen.py | 60 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 600bfa23..572f3d5c 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -182,9 +182,65 @@ async def _read_counts(path: Path) -> pd.DataFrame: return matrix +async def _build_matrix_results( + *, + matrix: pd.DataFrame, + gene_info: pd.DataFrame, + metadata_df: pd.DataFrame, + taxon: int, +) -> _ReadMatrixResults: + """Read the counts matrix and returns the results. + + :param matrix: The gene counts matrix to process + :param metadata_df: The configuration dataframe related to the current context + :param taxon: The NCBI Taxon ID + :return: A dataclass `ReadMatrixResults` + """ + gene_info = gene_info_migrations(gene_info) + conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon) + matrix = matrix.merge(conversion, on="ensembl_gene_id", how="left") + + # Only include Entrez and Ensembl Gene IDs that are present in `gene_info` + matrix["entrez_gene_id"] = matrix["entrez_gene_id"].str.split("//") + matrix = matrix.explode("entrez_gene_id") + matrix = matrix.replace(to_replace="-", value=pd.NA).dropna() + matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int) + + gene_info = gene_info.replace(to_replace="-", value=pd.NA).dropna() + gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) + + counts_matrix = matrix.merge( + gene_info[["entrez_gene_id", "ensembl_gene_id"]], + on=["entrez_gene_id", "ensembl_gene_id"], + how="inner", + ) + gene_info = gene_info.merge( + counts_matrix[["entrez_gene_id", "ensembl_gene_id"]], + on=["entrez_gene_id", "ensembl_gene_id"], + how="inner", + ) + + entrez_gene_ids: list[str] = gene_info["entrez_gene_id"].tolist() + metrics: NamedMetrics = {} + for study in metadata_df["study"].unique().tolist(): + study_sample_names = metadata_df[metadata_df["study"] == study]["sample_name"].tolist() + layouts = metadata_df[metadata_df["study"] == study]["layout"].tolist() + metrics[study] = _StudyMetrics( + count_matrix=counts_matrix[counts_matrix.columns.intersection(study_sample_names)], + fragment_lengths=metadata_df[metadata_df["study"] == study]["fragment_length"].values, + sample_names=study_sample_names, + layout=[LayoutMethod(layout) for layout in layouts], + num_samples=len(study_sample_names), + entrez_gene_ids=entrez_gene_ids, + gene_sizes=np.array(gene_info["size"].values).astype(np.float32), + study=study, + ) + metrics[study].fragment_lengths[np.isnan(metrics[study].fragment_lengths)] = 0 + metrics[study].count_matrix.index = pd.Index(entrez_gene_ids, name="entrez_gene_id") + + return _ReadMatrixResults(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].tolist()) + - for context_name in sheet_names: - logger.debug(f"Starting '{context_name}'") rnaseq_input_filepath = ( config.data_dir / "data_matrices" / context_name / f"gene_counts_matrix_{prep.value}_{context_name}" From 9eaa2acc1b669024abb4ec57a74bc35f25ee912e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:13:58 -0600 Subject: [PATCH 13/62] feat: added tpm calculation --- main/como/rnaseq_gen.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 572f3d5c..9238eda2 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -241,9 +241,24 @@ async def _build_matrix_results( return _ReadMatrixResults(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].tolist()) +def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the Transcripts Per Million (TPM) for each sample in the metrics dictionary.""" + for sample in metrics: + count_matrix = metrics[sample].count_matrix + + gene_sizes = metrics[sample].gene_sizes + + tpm_matrix = pd.DataFrame(data=None, index=count_matrix.index, columns=count_matrix.columns) + for i in range(len(count_matrix.columns)): + values: pd.Series = count_matrix.iloc[:, i] + 1 # Add 1 to prevent division by 0 + rate = np.log(values.tolist()) - np.log(gene_sizes) + denominator = np.log(np.sum(np.exp(rate))) + tpm_value = np.exp(rate - denominator + np.log(1e6)) + tpm_matrix.iloc[:, i] = tpm_value + metrics[sample].normalization_matrix = tpm_matrix + + return metrics - rnaseq_input_filepath = ( - config.data_dir / "data_matrices" / context_name / f"gene_counts_matrix_{prep.value}_{context_name}" ) if prep == RNAPrepMethod.SCRNA: rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".h5ad") From 3670016c55cc53e7060288b42242b25fe3911185 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:14:11 -0600 Subject: [PATCH 14/62] feat: added fpkm calculation --- main/como/rnaseq_gen.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 9238eda2..e9288bb3 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -259,6 +259,43 @@ def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: return metrics + +def calculate_fpkm(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each sample in the metrics dictionary.""" # noqa: E501 + matrix_values = [] + for study in metrics: + for sample in range(metrics[study].num_samples): + layout = metrics[study].layout[sample] + count_matrix: npt.NDArray = metrics[study].count_matrix.iloc[:, sample].values + gene_size = metrics[study].gene_sizes + + count_matrix = count_matrix.astype(np.float32) + gene_size = gene_size.astype(np.float32) + + match layout: + case LayoutMethod.paired_end: # FPKM + mean_fragment_lengths = metrics[study].fragment_lengths[sample] + # Ensure non-negative value + effective_length = [max(0, size - (mean_fragment_lengths + 1)) for size in gene_size] + n = count_matrix.sum() + fpkm = ((count_matrix + 1) * 1e9) / (np.array(effective_length) * n) + matrix_values.append(fpkm) + case LayoutMethod.single_end: # RPKM + # Add a pseudocount before log to ensure log(0) does not happen + rate = np.log(count_matrix + 1) - np.log(gene_size) + exp_rate = np.exp(rate - np.log(np.sum(count_matrix)) + np.log(1e9)) + matrix_values.append(exp_rate) + case _: + raise ValueError("Invalid normalization method specified") + + fpkm_matrix = pd.DataFrame(matrix_values).T # Transpose is needed because values were appended as rows + fpkm_matrix = fpkm_matrix[~pd.isna(fpkm_matrix)] + metrics[study].normalization_matrix = fpkm_matrix + + metrics[study].normalization_matrix.columns = metrics[study].count_matrix.columns + + return metrics + ) if prep == RNAPrepMethod.SCRNA: rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".h5ad") From 1cd4dbb00a523a6a8bf50c5f166714d8d28681bb Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:14:33 -0600 Subject: [PATCH 15/62] feat: added zfpkm transformation and calculation --- main/como/rnaseq_gen.py | 139 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 127 insertions(+), 12 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index e9288bb3..7b88962e 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -296,19 +296,134 @@ def calculate_fpkm(metrics: NamedMetrics) -> NamedMetrics: return metrics + +def _zfpkm_calculation(col: pd.Series, kernel: KernelDensity, peak_parameters: tuple[float, float]) -> _ZFPKMResult: + """Log2 Transformations. + + Stabilize the variance in the data to make the distribution more symmetric; this is helpful for Gaussian fitting + + Kernel Density Estimation (kde) + - Non-parametric method to estimate the probability density function (PDF) of a random variable + - Estimates the distribution of log2-transformed FPKM values + - Bandwidth parameter controls the smoothness of the density estimate + - KDE Explanation + - A way to smooth a histogram to get a better idea of the underlying distribution of the data + - Given a set of data points, we want to understand how they are distributed. + Histograms can be useful, but are sensitive to bin size and number + - The KDE places a "kernel" - a small symmetric function (i.e., Gaussian curve) - at each data point + - The "kernel" acts as a weight, giving more weight to points closer to the center of the kernel, + and less weight to points farther away + - Kernel functions are summed along each point on the x-axis + - A smooth curve is created that represents the estimated density of the data + + Peak Finding + - Identifies that are above a certain height and separated by a minimum distance + - Represent potential local maxima in the distribution + + Peak Selection + - The peak with the highest x-value (from log2-FPKM) is chosen as the mean (mu) + of the "inactive" gene distribution + - The peak representing unexpressed or inactive genes should be at a lower FPKM + value compared to the peak representing expressed genes + + Standard Deviation Estimation + - The mean of log2-FPKM values are greater than the calculated mu + - Standard deviation is estimated based on the assumption that the right tail of the distribution + This represents expressed genes) can be approximated by a half-normal distribution + + zFPKM Transformation + - Centers disbribution around 0 and scales it by the standard deviation. + This makes it easier to compare gene expression across different samples + - Represents the number of standard deviations away from the mean of the "inactive" gene distribution + - Higher zFPKM values indicate higher expression levels relative to the "inactive" peak + - A zFPKM value of 0 represents the mean of the "inactive" distribution + - Research shows that a zFPKM value of -3 or greater can be used as + a threshold for calling a gene as "expressed" + : https://doi.org/10.1186/1471-2164-14-778 + """ + col_log2: npt.NDArray = np.log2(col + 1) + col_log2 = np.nan_to_num(col_log2, nan=0) + refit: KernelDensity = kernel.fit(col_log2.reshape(-1, 1)) # type: ignore + + # kde: KernelDensity = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(col_log2.reshape(-1, 1)) + x_range = np.linspace(col_log2.min(), col_log2.max(), 1000) + density = np.exp(refit.score_samples(x_range.reshape(-1, 1))) + peaks, _ = find_peaks(density, height=peak_parameters[0], distance=peak_parameters[1]) + peak_positions = x_range[peaks] + + mu = 0 + max_fpkm = 0 + stddev = 1 + + if len(peaks) != 0: + mu = peak_positions.max() + max_fpkm = density[peaks[np.argmax(peak_positions)]] + u = col_log2[col_log2 > mu].mean() + stddev = (u - mu) * np.sqrt(np.pi / 2) + zfpkm = pd.Series((col_log2 - mu) / stddev, dtype=np.float32, name=col.name) + + return _ZFPKMResult(zfpkm=zfpkm, density=Density(x_range, density), mu=mu, std_dev=stddev, max_fpkm=max_fpkm) + + +def zfpkm_transform( + fpkm_df: pd.DataFrame, + bandwidth: int = 0.5, + peak_parameters: tuple[float, float] = (0.02, 1.0), + update_every_percent: float = 0.1, +) -> tuple[dict[str, _ZFPKMResult], DataFrame]: + """Perform zFPKM calculation/transformation.""" + if update_every_percent > 1: + logger.warning( + f"update_every_percent should be a decimal value between 0 and 1; got: {update_every_percent} - " + f"will convert to percentage" ) - if prep == RNAPrepMethod.SCRNA: - rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".h5ad") - elif prep in {RNAPrepMethod.TOTAL, RNAPrepMethod.MRNA}: - rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".csv") - - if not rnaseq_input_filepath.exists(): - logger.warning(f"Gene counts matrix not found at {rnaseq_input_filepath}, skipping...") - continue - - gene_info_filepath = config.data_dir / "gene_info.csv" - rnaseq_output_filepath = ( - config.result_dir / context_name / prep.value / f"rnaseq_{prep.value}_{context_name}.csv" + update_every_percent /= 100 + + total = len(fpkm_df.columns) + update_per_step: int = int(np.ceil(total * update_every_percent)) + cores = multiprocessing.cpu_count() - 2 + logger.debug(f"Processing {total:,} samples through zFPKM transform using {cores} cores") + logger.debug( + f"Will update every {update_per_step:,} steps as this is approximately " + f"{update_every_percent:.1%} of {total:,}" + ) + + with Pool(processes=cores) as pool: + kernel = KernelDensity(kernel="gaussian", bandwidth=bandwidth) + chunksize = int(math.ceil(len(fpkm_df.columns) / (4 * cores))) + partial_func = partial(_zfpkm_calculation, kernel=kernel, peak_parameters=peak_parameters) + chunk_time = time.time() + start_time = time.time() + + log_padding = len(str(f"{total:,}")) + zfpkm_df = pd.DataFrame(data=0, index=fpkm_df.index, columns=fpkm_df.columns) + results: dict[str, _ZFPKMResult] = {} + result: _ZFPKMResult + for i, result in enumerate( + pool.imap( + partial_func, + (fpkm_df[col] for col in fpkm_df.columns), + chunksize=chunksize, + ) + ): + key = str(result.zfpkm.name) + results[key] = result + zfpkm_df[key] = result.zfpkm + + # show updates every X% and at the end, but skip on first iteration + if i != 0 and (i % update_per_step == 0 or i == total): + current_time = time.time() + chunk = current_time - chunk_time + total_time = current_time - start_time + formatted = f"{i:,}" + logger.debug( + f"Processed {formatted:>{log_padding}} of {total:,} - " + f"chunk took {chunk:.1f} seconds - " + f"running for {total_time:.1f} seconds" + ) + chunk_time = current_time + return results, zfpkm_df + ) rnaseq_output_filepath.parent.mkdir(parents=True, exist_ok=True) From dd8698e3a9b7ca1311b1d0beda96150d8d45ea77 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:14:50 -0600 Subject: [PATCH 16/62] feat: added zfpkm plotting --- main/como/rnaseq_gen.py | 67 ++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 7b88962e..7c35ea3b 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -424,25 +424,58 @@ def zfpkm_transform( chunk_time = current_time return results, zfpkm_df + +def zfpkm_plot(results, *, plot_xfloor: int = -4, subplot_titles: bool = True): + """Plot the log2(FPKM) density and fitted Gaussian for each sample. + + :param results: A dictionary of intermediate results from zfpkm_transform. + :param: subplot_titles: Whether to display facet titles (sample names). + :param plot_xfloor: Lower limit for the x-axis. + :param subplot_titles: Whether to display facet titles (sample names). + """ + mega_df = pd.DataFrame(columns=["sample_name", "log2fpkm", "fpkm_density", "fitted_density_scaled"]) + for name, result in results.items(): + stddev = result.std_dev + x = np.array(result.density.x) + y = np.array(result.density.y) + + fitted = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi)) + max_fpkm = y.max() + max_fitted = fitted.max() + scale_fitted = fitted * (max_fpkm / max_fitted) + + df = pd.DataFrame( + { + "sample_name": [name] * len(x), + "log2fpkm": x, + "fpkm_density": y, + "fitted_density_scaled": scale_fitted, + } ) - rnaseq_output_filepath.parent.mkdir(parents=True, exist_ok=True) - - await save_rnaseq_tests( - context_name=context_name, - counts_matrix_filepath=rnaseq_input_filepath, - config_filepath=config_filepath, - output_filepath=rnaseq_output_filepath.as_posix(), - gene_info_filepath=gene_info_filepath, - prep=prep, - replicate_ratio=replicate_ratio, - batch_ratio=batch_ratio, - high_replicate_ratio=replicate_ratio_high, - high_batch_ratio=batch_ratio_high, - technique=technique, - cut_off=cut_off, - taxon_id=taxon, + mega_df = pd.concat([mega_df, df], ignore_index=True) + + mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density") + subplot_titles = list(results.keys()) if subplot_titles else None + fig = make_subplots( + rows=len(results), + cols=1, + subplot_titles=subplot_titles, + vertical_spacing=min(0.05, (1 / (len(results) - 1))), + ) + + for i, (name, group) in enumerate(mega_df.groupby("sample_name"), start=1): + fig.add_trace( + trace=go.Scatter(x=group["log2fpkm"], y=group["density"], mode="lines", name=name, legendgroup=name), + row=i, + col=1, ) - logger.success(f"Results saved at '{rnaseq_output_filepath}'") + fig.update_xaxes(title_text="log2(FPKM)", range=[plot_xfloor, max(group["log2fpkm"].tolist())], row=i, col=1) + fig.update_yaxes(title_text="density [scaled]", row=i, col=1) + fig.update_layout(legend_tracegroupgap=0) + + fig.update_layout(height=600 * len(results), width=1000, title_text="zFPKM Plots", showlegend=True) + fig.write_image("zfpkm_plot.png") + def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: From 01db1efc7fbe76bb49579b7b68a52b2ab6b7c36e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:15:03 -0600 Subject: [PATCH 17/62] feat: aded calculate z score --- main/como/rnaseq_gen.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 7c35ea3b..6b57dd55 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -477,6 +477,16 @@ def zfpkm_plot(results, *, plot_xfloor: int = -4, subplot_titles: bool = True): fig.write_image("zfpkm_plot.png") +def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the z-score for each sample in the metrics dictionary.""" + for sample in metrics: + log_matrix = np.log(metrics[sample].normalization_matrix) + z_matrix = pd.DataFrame( + data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names + ) + metrics[sample].z_score_matrix = z_matrix + return metrics + def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: """Apply zFPKM filtering to the FPKM matrix for a given sample.""" From a47f752059e68241588dd3bbcc6a34a119729615 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:15:15 -0600 Subject: [PATCH 18/62] feat: added cpm filtering --- main/como/rnaseq_gen.py | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 6b57dd55..4d04dcef 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -488,6 +488,54 @@ def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: return metrics +def cpm_filter( + *, + context_name: str, + metrics: NamedMetrics, + filtering_options: _FilteringOptions, + prep: RNAPrepMethod, +) -> NamedMetrics: + """Apply Counts Per Million (CPM) filtering to the count matrix for a given sample.""" + config = Config() + n_exp = filtering_options.replicate_ratio + n_top = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + + sample: str + metric: _StudyMetrics + for sample, metric in metrics.items(): + counts: pd.DataFrame = metric.count_matrix + entrez_ids: list[str] = metric.entrez_gene_ids + library_size: pd.DataFrame = counts.sum(axis=1) + + # For library_sizes equal to 0, add 1 to prevent divide by 0 + # This will not impact the final counts per million calculation because the original counts are still 0 + # thus, (0 / 1) * 1_000_000 = 0 + library_size[library_size == 0] = 1 + + output_filepath = config.result_dir / context_name / prep.value / f"CPM_Matrix_{prep.value}_{sample}.csv" + output_filepath.parent.mkdir(parents=True, exist_ok=True) + counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000 + counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids)) + logger.debug(f"Writing CPM matrix to {output_filepath}") + counts_per_million.to_csv(output_filepath, index=False) + + # TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason. + # Most likely due to multiplying by 1_000_000, not exactly sure why + + min_samples = round(n_exp * len(counts.columns)) # noqa: F841 + top_samples = round(n_top * len(counts.columns)) # noqa: F841 + test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) + for i in range(len(counts_per_million.columns)): + cutoff = ( + 10e6 / (np.median(np.sum(counts[:, i]))) + if cut_off == "default" + else 1e6 * cut_off / np.median(np.sum(counts[:, i])) + ) + test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) + + return metrics + def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: """Apply zFPKM filtering to the FPKM matrix for a given sample.""" min_sample_expression = filtering_options.replicate_ratio From b6460c22de205a9932872e19806b062f5d05aeeb Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:15:35 -0600 Subject: [PATCH 19/62] feat: added tpm quantile filtering --- main/como/rnaseq_gen.py | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 4d04dcef..6ec871e3 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -536,6 +536,54 @@ def cpm_filter( return metrics + +def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions) -> NamedMetrics: + """Apply quantile-based filtering to the TPM matrix for a given sample.""" + # TODO: Write the TPM matrix to disk + + n_exp = filtering_options.replicate_ratio + n_top = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + metrics = calculate_tpm(metrics) + + sample: str + metric: _StudyMetrics + for sample, metric in metrics.items(): + entrez_ids = metric.entrez_gene_ids + gene_size = metric.gene_sizes + tpm_matrix: pd.DataFrame = metric.normalization_matrix + + min_samples = round(n_exp * len(tpm_matrix.columns)) + top_samples = round(n_top * len(tpm_matrix.columns)) + + tpm_quantile = tpm_matrix[tpm_matrix > 0] + quantile_cutoff = np.quantile( + a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0 + ) # Compute quantile across columns + boolean_expression = pd.DataFrame( + data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns + ).astype(int) + + min_func = k_over_a(min_samples, 0.9) + top_func = k_over_a(top_samples, 0.9) + + min_genes: npt.NDArray[bool] = genefilter(boolean_expression, min_func) + top_genes: npt.NDArray[bool] = genefilter(boolean_expression, top_func) + + # Only keep `entrez_gene_ids` that pass `min_genes` + metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes) if keep] + metric.gene_sizes = [gene for gene, keep in zip(gene_size, min_genes) if keep] + metric.count_matrix = metric.count_matrix.iloc[min_genes, :] + metric.normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :] + + keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes) if keep] + metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes) if keep] + + metrics = calculate_z_score(metrics) + + return metrics + + def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: """Apply zFPKM filtering to the FPKM matrix for a given sample.""" min_sample_expression = filtering_options.replicate_ratio From e0c98b430e4b1c14801b131efdfd406b8a36d470 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:15:53 -0600 Subject: [PATCH 20/62] feat: added root filtering logic --- main/como/rnaseq_gen.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 6ec871e3..745d519b 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -619,9 +619,29 @@ def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, return metrics -async def rnaseq_gen( - # config_filepath: Path, - config_filename: str, +def filter_counts( + *, + context_name: str, + metrics: NamedMetrics, + technique: FilteringTechnique, + filtering_options: _FilteringOptions, + prep: RNAPrepMethod, +) -> NamedMetrics: + """Filter the count matrix based on the specified technique.""" + match technique: + case FilteringTechnique.cpm: + return cpm_filter( + context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep + ) + case FilteringTechnique.tpm: + return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) + case FilteringTechnique.zfpkm: + return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True) + case FilteringTechnique.umi: + return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False) + case _: + raise ValueError(f"Technique must be one of {FilteringTechnique}") + prep: RNAPrepMethod, taxon_id: int | str | Taxon, replicate_ratio: float = 0.5, From a4d6f2f94099639b8fea828d06e6b3f5ab9545e5 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:16:15 -0600 Subject: [PATCH 21/62] feat: added logic for performing statistical tests --- main/como/rnaseq_gen.py | 76 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 745d519b..b430c916 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -642,8 +642,82 @@ def filter_counts( case _: raise ValueError(f"Technique must be one of {FilteringTechnique}") + +async def _save_rnaseq_tests( + context_name: str, + rnaseq_matrix: pd.DataFrame, + metadata_df: pd.DataFrame, + gene_info_df: pd.DataFrame, + output_filepath: Path, prep: RNAPrepMethod, - taxon_id: int | str | Taxon, + taxon: int, + replicate_ratio: float, + batch_ratio: float, + high_replicate_ratio: float, + high_batch_ratio: float, + technique: FilteringTechnique, + cut_off: int | float, +): + """Save the results of the RNA-Seq tests to a CSV file.""" + filtering_options = _FilteringOptions( + replicate_ratio=replicate_ratio, + batch_ratio=batch_ratio, + cut_off=cut_off, + high_replicate_ratio=high_replicate_ratio, + high_batch_ratio=high_batch_ratio, + ) + + read_counts_results: _ReadMatrixResults = await _build_matrix_results( + matrix=rnaseq_matrix, + gene_info=gene_info_df, + metadata_df=metadata_df, + taxon=taxon, + ) + metrics = read_counts_results.metrics + entrez_gene_ids = read_counts_results.entrez_gene_ids + + metrics = filter_counts( + context_name=context_name, + metrics=metrics, + technique=technique, + filtering_options=filtering_options, + prep=prep, + ) + + expressed_genes: list[str] = [] + top_genes: list[str] = [] + for metric in metrics.values(): + expressed_genes.extend(metric.entrez_gene_ids) + top_genes.extend(metric.high_confidence_entrez_gene_ids) + + expression_frequency = pd.Series(expressed_genes).value_counts() + expression_df = pd.DataFrame( + {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values} + ) + expression_df["prop"] = expression_df["frequency"] / len(metrics) + expression_df = expression_df[expression_df["prop"] >= filtering_options.batch_ratio] + + top_frequency = pd.Series(top_genes).value_counts() + top_df = pd.DataFrame({"entrez_gene_id": top_frequency.index, "frequency": top_frequency.values}) + top_df["prop"] = top_df["frequency"] / len(metrics) + top_df = top_df[top_df["prop"] >= filtering_options.high_batch_ratio] + + boolean_matrix = pd.DataFrame(data={"entrez_gene_id": entrez_gene_ids, "expressed": 0, "high": 0}) + for gene in entrez_gene_ids: + if gene in expression_df["entrez_gene_id"]: + boolean_matrix.loc[gene, "expressed"] = 1 + if gene in top_df["entrez_gene_id"]: + boolean_matrix.loc[gene, "high"] = 1 + + expressed_count = len(boolean_matrix[boolean_matrix["expressed"] == 1]) + high_confidence_count = len(boolean_matrix[boolean_matrix["high"] == 1]) + + boolean_matrix.to_csv(output_filepath, index=False) + logger.info( + f"{context_name} - Found {expressed_count} expressed and {high_confidence_count} confidently expressed genes" + ) + logger.success(f"Wrote boolean matrix to {output_filepath}") + replicate_ratio: float = 0.5, high_replicate_ratio: float = 1.0, batch_ratio: float = 0.5, From e9665bf168d7d5724b748830040041db96bee79c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:16:29 -0600 Subject: [PATCH 22/62] feat: create metadata df --- main/como/rnaseq_gen.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index b430c916..f86b1b17 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -718,6 +718,15 @@ async def _save_rnaseq_tests( ) logger.success(f"Wrote boolean matrix to {output_filepath}") + +async def _create_metadata_df(path: Path) -> pd.DataFrame: + if path.suffix not in {".xls", ".xlsx"}: + raise ValueError( + f"Expected an excel file with extension of '.xlsx' or '.xls', got '{path.suffix}'. " + f"Attempted to process: {path}" + ) + return pd.read_excel(path) + replicate_ratio: float = 0.5, high_replicate_ratio: float = 1.0, batch_ratio: float = 0.5, From dc7c1fec6ec09beb8712676ad2e808a860525fbc Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:16:54 -0600 Subject: [PATCH 23/62] refactor: allow passing specific filepaths --- main/como/rnaseq_gen.py | 65 +++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index f86b1b17..13619059 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -727,6 +727,16 @@ async def _create_metadata_df(path: Path) -> pd.DataFrame: ) return pd.read_excel(path) + +async def rnaseq_gen( # noqa: C901, allow complex function + context_name: str, + input_rnaseq_filepath: Path, + input_gene_info_filepath: Path, + output_rnaseq_filepath: Path, + prep: RNAPrepMethod, + taxon: int, + input_metadata_filepath: Path | None = None, + input_metadata_df: pd.DataFrame | None = None, replicate_ratio: float = 0.5, high_replicate_ratio: float = 1.0, batch_ratio: float = 0.5, @@ -740,9 +750,9 @@ async def _create_metadata_df(path: Path) -> pd.DataFrame: then study/batch numbers are checked for consensus according to batch ratios. The zFPKM method is outlined here: https://pubmed.ncbi.nlm.nih.gov/24215113/ - :param config_filename: The configuration filename to read + :param metadata_filepath: The configuration filename to read :param prep: The preparation method - :param taxon_id: The NCBI Taxon ID + :param taxon: The NCBI Taxon ID :param replicate_ratio: The percentage of replicates that a gene must appear in for a gene to be marked as "active" in a batch/study :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" @@ -754,41 +764,58 @@ async def _create_metadata_df(path: Path) -> pd.DataFrame: :param cut_off: The cutoff value to use for the provided filtering technique :return: None """ - if isinstance(technique, str): - technique = FilteringTechnique(technique.lower()) - if isinstance(taxon_id, (str, int)): - taxon_id = Taxon.from_string(str(taxon_id)) + if not input_metadata_df and not input_metadata_filepath: + raise ValueError("At least one of input_metadata_filepath or input_metadata_df must be provided") + + technique = ( + FilteringTechnique.from_string(str(technique.lower())) if isinstance(technique, (str, int)) else technique + ) match technique: case FilteringTechnique.tpm: - cut_off = 25 if cut_off is None else cut_off + cut_off = cut_off or 25 if cut_off < 1 or cut_off > 100: raise ValueError("Quantile must be between 1 - 100") case FilteringTechnique.cpm: - if cut_off is not None and cut_off < 0: + if cut_off and cut_off < 0: raise ValueError("Cutoff must be greater than 0") - elif cut_off is None: + elif cut_off: cut_off = "default" case FilteringTechnique.zfpkm: - cut_off = "default" if cut_off is None else cut_off + cut_off = "default" if cut_off else cut_off case FilteringTechnique.umi: pass case _: raise ValueError(f"Technique must be one of {FilteringTechnique}") - await _handle_context_batch( - config_filename=config_filename, + if not input_rnaseq_filepath.exists(): + raise FileNotFoundError(f"Input RNA-seq file not found! Searching for: '{input_rnaseq_filepath}'") + + if prep == RNAPrepMethod.SCRNA: + technique = FilteringTechnique.umi + logger.warning( + "Single cell filtration does not normalize and assumes " + "gene counts are counted with Unique Molecular Identifiers (UMIs). " + "Setting filtering technique to UMI now." + ) + + logger.debug(f"Starting '{context_name}'") + output_rnaseq_filepath.parent.mkdir(parents=True, exist_ok=True) + + await _save_rnaseq_tests( + context_name=context_name, + rnaseq_matrix=await _read_counts(input_rnaseq_filepath), + metadata_df=input_metadata_df or await _create_metadata_df(input_metadata_filepath), + gene_info_df=pd.read_csv(input_gene_info_filepath), + output_filepath=output_rnaseq_filepath, + prep=prep, + taxon=taxon, replicate_ratio=replicate_ratio, - replicate_ratio_high=high_replicate_ratio, batch_ratio=batch_ratio, - batch_ratio_high=high_batch_ratio, + high_replicate_ratio=high_replicate_ratio, + high_batch_ratio=high_batch_ratio, technique=technique, cut_off=cut_off, - prep=prep, - taxon=taxon_id, - ) - - ) From f35e09a263ee552d42e188ab71e2c5c52dc686c2 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:17:25 -0600 Subject: [PATCH 24/62] refactor: rename variable names for easier reuse --- main/COMO.ipynb | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/main/COMO.ipynb b/main/COMO.ipynb index d361f9b2..b721bbbb 100644 --- a/main/COMO.ipynb +++ b/main/COMO.ipynb @@ -276,22 +276,22 @@ "from como.types import RNAPrepMethod\n", "\n", "context_names = [\"naiveB\"]\n", - "output_gene_info_filepaths = [Path(f\"data/results/{context}/gene_info.csv\") for context in context_names]\n", - "como_context_dirs = [Path(f\"data/COMO_input/{context}\") for context in context_names]\n", - "output_trna_filepaths = [Path(f\"data/results/{context}/total-rna/totalrna_{context}.csv\") for context in context_names]\n", - "output_polya_filepaths = [Path(f\"data/results/{context}/polya-rna/polyarna_{context}.csv\") for context in context_names]\n", + "gene_info_filepath = [Path(f\"data/results/{context}/gene_info.csv\") for context in context_names]\n", + "como_context_dir = [Path(f\"data/COMO_input/{context}\") for context in context_names]\n", + "trna_matrix_filepath = [Path(f\"data/results/{context}/total-rna/totalrna_{context}.csv\") for context in context_names]\n", + "polya_matrix_filepath = [Path(f\"data/results/{context}/polya-rna/polyarna_{context}.csv\") for context in context_names]\n", "\n", "\n", "for i in range(len(context_names)):\n", " await rnaseq_preprocess(\n", " context_name=context_names[i],\n", " taxon=9606,\n", - " output_gene_info_filepath=output_gene_info_filepaths[i],\n", - " como_context_dir=como_context_dirs[i],\n", + " output_gene_info_filepath=gene_info_filepath[i],\n", + " como_context_dir=como_context_dir[i],\n", " output_trna_config_filepath=Path(\"./data/config_sheets/trna_config.xlsx\"),\n", - " output_trna_count_matrix_filepath=output_trna_filepaths[i],\n", + " output_trna_count_matrix_filepath=trna_matrix_filepath[i],\n", " output_polya_config_filepath=Path(\"./data/config_sheets/mrna_config.xlsx\"),\n", - " output_polya_count_matrix_filepath=output_polya_filepaths[i],\n", + " output_polya_count_matrix_filepath=polya_matrix_filepath[i],\n", " cache=True,\n", " log_level=\"INFO\",\n", " )" @@ -373,7 +373,7 @@ "metadata": {}, "outputs": [], "source": [ - "# step 2.2 RNA-seq Analysis for Total RNA-seq library preparation\n", + "from como.rnaseq_gen import rnaseq_gen\n", "\n", "trnaseq_config_file = \"trnaseq_data_inputs_auto.xlsx\"\n", "rep_ratio = 0.75\n", @@ -438,6 +438,11 @@ "minimum_cutoff = -3\n", "taxon_id = \"human\"\n", "\n", + "await rnaseq_gen(\n", + " context_name=\"naiveB\",\n", + " input_rnaseq_filepath=\n", + ")\n", + "\n", "# fmt: off\n", "cmd = \" \".join(\n", " [\n", From 763b51ada11deff6265399d176234f23415ce150 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:42:53 -0600 Subject: [PATCH 25/62] fix: use better column names --- .../boundary_rxns/naiveB_boundary_rxns.csv | 140 +++++++++--------- main/data/boundary_rxns/smB_boundary_rxns.csv | 140 +++++++++--------- 2 files changed, 140 insertions(+), 140 deletions(-) diff --git a/main/data/boundary_rxns/naiveB_boundary_rxns.csv b/main/data/boundary_rxns/naiveB_boundary_rxns.csv index f9297391..0b08556e 100644 --- a/main/data/boundary_rxns/naiveB_boundary_rxns.csv +++ b/main/data/boundary_rxns/naiveB_boundary_rxns.csv @@ -1,70 +1,70 @@ -Boundary,Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate -Demand,Exchange,glc_D,Extracellular,-100,1000 -Demand,Exchange,fe2,Extracellular,-1000,1000 -Demand,Exchange,gal,Extracellular,0,1000 -Demand,Exchange,zn2,Extracellular,-1000,1000 -Demand,Exchange,ca2,Extracellular,-1000,1000 -Demand,Exchange,na1,Extracellular,-1000,1000 -Demand,Exchange,cl,Extracellular,-1000,1000 -Demand,Exchange,k,Extracellular,-1000,1000 -Demand,Exchange,h,Extracellular,-1000,1000 -Demand,Exchange,h2o,Extracellular,-1000,1000 -Demand,Exchange,o2,Extracellular,-1000,1000 -Demand,Exchange,pi,Extracellular,-1000,1000 -Demand,Exchange,ppi,Extracellular,-1000,1000 -Demand,Exchange,co2,Extracellular,-1000,1000 -Demand,Exchange,nh4,Extracellular,-1000,1000 -Demand,Exchange,no2,Extracellular,-1000,1000 -Demand,Exchange,co,Extracellular,-1000,1000 -Demand,Exchange,asn_L,Extracellular,-1,1000 -Demand,Exchange,asp_L,Extracellular,-1,1000 -Demand,Exchange,glu_L,Extracellular,-1,1000 -Demand,Exchange,ile_L,Extracellular,-1,1000 -Demand,Exchange,leu_L,Extracellular,-1,1000 -Demand,Exchange,lys_L,Extracellular,-1,1000 -Demand,Exchange,met_L,Extracellular,-1,1000 -Demand,Exchange,pro_L,Extracellular,-1,1000 -Demand,Exchange,ser_L,Extracellular,-1,1000 -Demand,Exchange,val_L,Extracellular,-1,1000 -Demand,Exchange,gly,Extracellular,-1,1000 -Demand,Exchange,cys_L,Extracellular,-1,1000 -Demand,Exchange,ala_L,Extracellular,-1,1000 -Demand,Exchange,his_L,Extracellular,-1,1000 -Demand,Exchange,thr_L,Extracellular,-1,1000 -Demand,Exchange,gln_L,Extracellular,-1,1000 -Demand,Exchange,phe_L,Extracellular,-1,1000 -Demand,Exchange,tyr_L,Extracellular,-1,1000 -Demand,Exchange,arg_L,Extracellular,-1,1000 -Demand,Exchange,trp_L,Extracellular,-1,1000 -Demand,Exchange,eicostet,Extracellular,-1,1000 -Demand,Exchange,hdca,Extracellular,-1,1000 -Demand,Exchange,hdcea,Extracellular,-1,1000 -Demand,Exchange,lnlc,Extracellular,-1,1000 -Demand,Exchange,lnlnca,Extracellular,-1,1000 -Demand,Exchange,lnlncg,Extracellular,-1,1000 -Demand,Exchange,ocdca,Extracellular,-1,1000 -Demand,Exchange,ocdcea,Extracellular,-1,1000 -Demand,Exchange,thmmp,Extracellular,0,1000 -Demand,Exchange,thmtp,Extracellular,0,1000 -Demand,Exchange,ncam,Extracellular,0,1000 -Demand,Exchange,pnto_R,Extracellular,0,1000 -Demand,Exchange,pydxn,Extracellular,0,1000 -Demand,Exchange,pydx,Extracellular,0,1000 -Demand,Exchange,pydam,Extracellular,0,1000 -Demand,Exchange,btn,Extracellular,0,1000 -Demand,Exchange,fol,Extracellular,-10,1000 -Demand,Exchange,aqcobal,Extracellular,0,1000 -Demand,Exchange,vitd3,Extracellular,-1,1000 -Demand,Exchange,ascb_L,Extracellular,0,1000 -Demand,Exchange,retinol,Extracellular,-10,1000 -Demand,Exchange,retinal,Extracellular,-10,1000 -Demand,Exchange,ala_B,Extracellular,-1,1000 -Demand,Exchange,ala_D,Extracellular,-1,1000 -Demand,Exchange,h2co3,Extracellular,-1000,1000 -Demand,Exchange,h2o2,Extracellular,-1000,1000 -Demand,Exchange,hco3,Extracellular,-1000,1000 -Demand,Exchange,orn,Extracellular,-1,1000 -Demand,Exchange,orn_D,Extracellular,-1,1000 -Demand,Exchange,so4,Extracellular,-1000,1000 -Demand,Exchange,ribflv,Extracellular,-1,1000 -Demand,Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file +Boundary,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate +Exchange,glc_D,Extracellular,-100,1000 +Exchange,fe2,Extracellular,-1000,1000 +Exchange,gal,Extracellular,0,1000 +Exchange,zn2,Extracellular,-1000,1000 +Exchange,ca2,Extracellular,-1000,1000 +Exchange,na1,Extracellular,-1000,1000 +Exchange,cl,Extracellular,-1000,1000 +Exchange,k,Extracellular,-1000,1000 +Exchange,h,Extracellular,-1000,1000 +Exchange,h2o,Extracellular,-1000,1000 +Exchange,o2,Extracellular,-1000,1000 +Exchange,pi,Extracellular,-1000,1000 +Exchange,ppi,Extracellular,-1000,1000 +Exchange,co2,Extracellular,-1000,1000 +Exchange,nh4,Extracellular,-1000,1000 +Exchange,no2,Extracellular,-1000,1000 +Exchange,co,Extracellular,-1000,1000 +Exchange,asn_L,Extracellular,-1,1000 +Exchange,asp_L,Extracellular,-1,1000 +Exchange,glu_L,Extracellular,-1,1000 +Exchange,ile_L,Extracellular,-1,1000 +Exchange,leu_L,Extracellular,-1,1000 +Exchange,lys_L,Extracellular,-1,1000 +Exchange,met_L,Extracellular,-1,1000 +Exchange,pro_L,Extracellular,-1,1000 +Exchange,ser_L,Extracellular,-1,1000 +Exchange,val_L,Extracellular,-1,1000 +Exchange,gly,Extracellular,-1,1000 +Exchange,cys_L,Extracellular,-1,1000 +Exchange,ala_L,Extracellular,-1,1000 +Exchange,his_L,Extracellular,-1,1000 +Exchange,thr_L,Extracellular,-1,1000 +Exchange,gln_L,Extracellular,-1,1000 +Exchange,phe_L,Extracellular,-1,1000 +Exchange,tyr_L,Extracellular,-1,1000 +Exchange,arg_L,Extracellular,-1,1000 +Exchange,trp_L,Extracellular,-1,1000 +Exchange,eicostet,Extracellular,-1,1000 +Exchange,hdca,Extracellular,-1,1000 +Exchange,hdcea,Extracellular,-1,1000 +Exchange,lnlc,Extracellular,-1,1000 +Exchange,lnlnca,Extracellular,-1,1000 +Exchange,lnlncg,Extracellular,-1,1000 +Exchange,ocdca,Extracellular,-1,1000 +Exchange,ocdcea,Extracellular,-1,1000 +Exchange,thmmp,Extracellular,0,1000 +Exchange,thmtp,Extracellular,0,1000 +Exchange,ncam,Extracellular,0,1000 +Exchange,pnto_R,Extracellular,0,1000 +Exchange,pydxn,Extracellular,0,1000 +Exchange,pydx,Extracellular,0,1000 +Exchange,pydam,Extracellular,0,1000 +Exchange,btn,Extracellular,0,1000 +Exchange,fol,Extracellular,-10,1000 +Exchange,aqcobal,Extracellular,0,1000 +Exchange,vitd3,Extracellular,-1,1000 +Exchange,ascb_L,Extracellular,0,1000 +Exchange,retinol,Extracellular,-10,1000 +Exchange,retinal,Extracellular,-10,1000 +Exchange,ala_B,Extracellular,-1,1000 +Exchange,ala_D,Extracellular,-1,1000 +Exchange,h2co3,Extracellular,-1000,1000 +Exchange,h2o2,Extracellular,-1000,1000 +Exchange,hco3,Extracellular,-1000,1000 +Exchange,orn,Extracellular,-1,1000 +Exchange,orn_D,Extracellular,-1,1000 +Exchange,so4,Extracellular,-1000,1000 +Exchange,ribflv,Extracellular,-1,1000 +Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file diff --git a/main/data/boundary_rxns/smB_boundary_rxns.csv b/main/data/boundary_rxns/smB_boundary_rxns.csv index f9297391..0b08556e 100644 --- a/main/data/boundary_rxns/smB_boundary_rxns.csv +++ b/main/data/boundary_rxns/smB_boundary_rxns.csv @@ -1,70 +1,70 @@ -Boundary,Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate -Demand,Exchange,glc_D,Extracellular,-100,1000 -Demand,Exchange,fe2,Extracellular,-1000,1000 -Demand,Exchange,gal,Extracellular,0,1000 -Demand,Exchange,zn2,Extracellular,-1000,1000 -Demand,Exchange,ca2,Extracellular,-1000,1000 -Demand,Exchange,na1,Extracellular,-1000,1000 -Demand,Exchange,cl,Extracellular,-1000,1000 -Demand,Exchange,k,Extracellular,-1000,1000 -Demand,Exchange,h,Extracellular,-1000,1000 -Demand,Exchange,h2o,Extracellular,-1000,1000 -Demand,Exchange,o2,Extracellular,-1000,1000 -Demand,Exchange,pi,Extracellular,-1000,1000 -Demand,Exchange,ppi,Extracellular,-1000,1000 -Demand,Exchange,co2,Extracellular,-1000,1000 -Demand,Exchange,nh4,Extracellular,-1000,1000 -Demand,Exchange,no2,Extracellular,-1000,1000 -Demand,Exchange,co,Extracellular,-1000,1000 -Demand,Exchange,asn_L,Extracellular,-1,1000 -Demand,Exchange,asp_L,Extracellular,-1,1000 -Demand,Exchange,glu_L,Extracellular,-1,1000 -Demand,Exchange,ile_L,Extracellular,-1,1000 -Demand,Exchange,leu_L,Extracellular,-1,1000 -Demand,Exchange,lys_L,Extracellular,-1,1000 -Demand,Exchange,met_L,Extracellular,-1,1000 -Demand,Exchange,pro_L,Extracellular,-1,1000 -Demand,Exchange,ser_L,Extracellular,-1,1000 -Demand,Exchange,val_L,Extracellular,-1,1000 -Demand,Exchange,gly,Extracellular,-1,1000 -Demand,Exchange,cys_L,Extracellular,-1,1000 -Demand,Exchange,ala_L,Extracellular,-1,1000 -Demand,Exchange,his_L,Extracellular,-1,1000 -Demand,Exchange,thr_L,Extracellular,-1,1000 -Demand,Exchange,gln_L,Extracellular,-1,1000 -Demand,Exchange,phe_L,Extracellular,-1,1000 -Demand,Exchange,tyr_L,Extracellular,-1,1000 -Demand,Exchange,arg_L,Extracellular,-1,1000 -Demand,Exchange,trp_L,Extracellular,-1,1000 -Demand,Exchange,eicostet,Extracellular,-1,1000 -Demand,Exchange,hdca,Extracellular,-1,1000 -Demand,Exchange,hdcea,Extracellular,-1,1000 -Demand,Exchange,lnlc,Extracellular,-1,1000 -Demand,Exchange,lnlnca,Extracellular,-1,1000 -Demand,Exchange,lnlncg,Extracellular,-1,1000 -Demand,Exchange,ocdca,Extracellular,-1,1000 -Demand,Exchange,ocdcea,Extracellular,-1,1000 -Demand,Exchange,thmmp,Extracellular,0,1000 -Demand,Exchange,thmtp,Extracellular,0,1000 -Demand,Exchange,ncam,Extracellular,0,1000 -Demand,Exchange,pnto_R,Extracellular,0,1000 -Demand,Exchange,pydxn,Extracellular,0,1000 -Demand,Exchange,pydx,Extracellular,0,1000 -Demand,Exchange,pydam,Extracellular,0,1000 -Demand,Exchange,btn,Extracellular,0,1000 -Demand,Exchange,fol,Extracellular,-10,1000 -Demand,Exchange,aqcobal,Extracellular,0,1000 -Demand,Exchange,vitd3,Extracellular,-1,1000 -Demand,Exchange,ascb_L,Extracellular,0,1000 -Demand,Exchange,retinol,Extracellular,-10,1000 -Demand,Exchange,retinal,Extracellular,-10,1000 -Demand,Exchange,ala_B,Extracellular,-1,1000 -Demand,Exchange,ala_D,Extracellular,-1,1000 -Demand,Exchange,h2co3,Extracellular,-1000,1000 -Demand,Exchange,h2o2,Extracellular,-1000,1000 -Demand,Exchange,hco3,Extracellular,-1000,1000 -Demand,Exchange,orn,Extracellular,-1,1000 -Demand,Exchange,orn_D,Extracellular,-1,1000 -Demand,Exchange,so4,Extracellular,-1000,1000 -Demand,Exchange,ribflv,Extracellular,-1,1000 -Demand,Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file +Boundary,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate +Exchange,glc_D,Extracellular,-100,1000 +Exchange,fe2,Extracellular,-1000,1000 +Exchange,gal,Extracellular,0,1000 +Exchange,zn2,Extracellular,-1000,1000 +Exchange,ca2,Extracellular,-1000,1000 +Exchange,na1,Extracellular,-1000,1000 +Exchange,cl,Extracellular,-1000,1000 +Exchange,k,Extracellular,-1000,1000 +Exchange,h,Extracellular,-1000,1000 +Exchange,h2o,Extracellular,-1000,1000 +Exchange,o2,Extracellular,-1000,1000 +Exchange,pi,Extracellular,-1000,1000 +Exchange,ppi,Extracellular,-1000,1000 +Exchange,co2,Extracellular,-1000,1000 +Exchange,nh4,Extracellular,-1000,1000 +Exchange,no2,Extracellular,-1000,1000 +Exchange,co,Extracellular,-1000,1000 +Exchange,asn_L,Extracellular,-1,1000 +Exchange,asp_L,Extracellular,-1,1000 +Exchange,glu_L,Extracellular,-1,1000 +Exchange,ile_L,Extracellular,-1,1000 +Exchange,leu_L,Extracellular,-1,1000 +Exchange,lys_L,Extracellular,-1,1000 +Exchange,met_L,Extracellular,-1,1000 +Exchange,pro_L,Extracellular,-1,1000 +Exchange,ser_L,Extracellular,-1,1000 +Exchange,val_L,Extracellular,-1,1000 +Exchange,gly,Extracellular,-1,1000 +Exchange,cys_L,Extracellular,-1,1000 +Exchange,ala_L,Extracellular,-1,1000 +Exchange,his_L,Extracellular,-1,1000 +Exchange,thr_L,Extracellular,-1,1000 +Exchange,gln_L,Extracellular,-1,1000 +Exchange,phe_L,Extracellular,-1,1000 +Exchange,tyr_L,Extracellular,-1,1000 +Exchange,arg_L,Extracellular,-1,1000 +Exchange,trp_L,Extracellular,-1,1000 +Exchange,eicostet,Extracellular,-1,1000 +Exchange,hdca,Extracellular,-1,1000 +Exchange,hdcea,Extracellular,-1,1000 +Exchange,lnlc,Extracellular,-1,1000 +Exchange,lnlnca,Extracellular,-1,1000 +Exchange,lnlncg,Extracellular,-1,1000 +Exchange,ocdca,Extracellular,-1,1000 +Exchange,ocdcea,Extracellular,-1,1000 +Exchange,thmmp,Extracellular,0,1000 +Exchange,thmtp,Extracellular,0,1000 +Exchange,ncam,Extracellular,0,1000 +Exchange,pnto_R,Extracellular,0,1000 +Exchange,pydxn,Extracellular,0,1000 +Exchange,pydx,Extracellular,0,1000 +Exchange,pydam,Extracellular,0,1000 +Exchange,btn,Extracellular,0,1000 +Exchange,fol,Extracellular,-10,1000 +Exchange,aqcobal,Extracellular,0,1000 +Exchange,vitd3,Extracellular,-1,1000 +Exchange,ascb_L,Extracellular,0,1000 +Exchange,retinol,Extracellular,-10,1000 +Exchange,retinal,Extracellular,-10,1000 +Exchange,ala_B,Extracellular,-1,1000 +Exchange,ala_D,Extracellular,-1,1000 +Exchange,h2co3,Extracellular,-1000,1000 +Exchange,h2o2,Extracellular,-1000,1000 +Exchange,hco3,Extracellular,-1000,1000 +Exchange,orn,Extracellular,-1,1000 +Exchange,orn_D,Extracellular,-1,1000 +Exchange,so4,Extracellular,-1000,1000 +Exchange,ribflv,Extracellular,-1,1000 +Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file From a0754d6ac1396ab888af95e266d9125e1c278abe Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Dec 2024 15:43:06 -0600 Subject: [PATCH 26/62] fix: match new column names --- main/como/create_context_specific_model.py | 28 +++++++++------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 896c90cc..8945d8c1 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -562,31 +562,25 @@ def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: f"'Minimum Reaction Rate', and 'Maximum Reaction Rate'. Found: {column}" ) - reactions: list[str] = [] + reactions: list[str] = [""] * len(df) boundary_type: list[str] = df["reaction"].tolist() reaction_abbreviation: list[str] = df["abbreviation"].tolist() reaction_compartment: list[str] = df["compartment"].tolist() - lower_bounds = df["minimum reaction rate"].tolist() - upper_bounds = df["maximum reaction rate"].tolist() + lower_bound = df["minimum reaction rate"].tolist() + upper_bound = df["maximum reaction rate"].tolist() + boundary_map = {"exchange": "EX", "demand": "DM", "sink": "SK"} for i in range(len(boundary_type)): - current_type: str = boundary_type[i] - temp_reaction: str = "" - - match current_type.lower(): - case "exchange": - temp_reaction += "EX_" - case "demand": - temp_reaction += "DM_" - case "sink": - temp_reaction += "SK_" + boundary: str = boundary_type[i].lower() + if boundary not in boundary_map: + raise ValueError(f"Boundary reaction type must be 'Exchange', 'Demand', or 'Sink'. Found: {boundary[i]}") shorthand_compartment = Compartments.get(reaction_compartment[i]) - temp_reaction += f"{reaction_abbreviation[i]}[{shorthand_compartment}]" - reactions.append(temp_reaction) + reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]" + return _BoundaryReactions( reactions=reactions, - lower_bounds=lower_bounds, - upper_bounds=upper_bounds, + lower_bounds=lower_bound, + upper_bounds=upper_bound, ) From fc803bcc5298023bd563b29a1547fd488c9e4944 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 10:17:35 -0600 Subject: [PATCH 27/62] style: update log message, more pythonic code --- main/COMO.ipynb | 195 +++++++++++++++++---------------- main/como/rnaseq_preprocess.py | 8 +- 2 files changed, 104 insertions(+), 99 deletions(-) diff --git a/main/COMO.ipynb b/main/COMO.ipynb index b721bbbb..24d4b83c 100644 --- a/main/COMO.ipynb +++ b/main/COMO.ipynb @@ -223,6 +223,32 @@ "" ] }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-09T22:01:53.960204Z", + "start_time": "2024-12-09T22:01:53.957844Z" + } + }, + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "\n", + "taxon_id = 9606\n", + "context_names = [\"naiveB\"]\n", + "gene_info_filepath = [Path(f\"data/results/{context}/gene_info.csv\") for context in context_names]\n", + "como_context_dir = [Path(f\"data/COMO_input/{context}\") for context in context_names]\n", + "trna_matrix_filepath = [Path(f\"data/results/{context}/trna-rna/trna_{context}.csv\") for context in context_names]\n", + "polya_matrix_filepath = [Path(f\"data/results/{context}/polya-rna/polyarna_{context}.csv\") for context in context_names]\n", + "\n", + "# No single-cell data is provided by default; COMO accepts single-cell data in CSV or h5ad format\n", + "# If you are using single-cell data, adjust the following lines accordingly\n", + "scrna_matrix_filepath = [Path(f\"data/results/{context}/scrna/scrna_{context}.csv\") for context in context_names]\n", + "# scrna_matrix_filepath = [Path(f\"data/results/{context}/scrna/scrna_{context}.h5ad\") for context in context_names]\n" + ], + "outputs": [], + "execution_count": 3 + }, { "cell_type": "markdown", "metadata": {}, @@ -270,27 +296,19 @@ } ], "source": [ - "from pathlib import Path\n", - "\n", "from como.rnaseq_preprocess import rnaseq_preprocess\n", "from como.types import RNAPrepMethod\n", "\n", - "context_names = [\"naiveB\"]\n", - "gene_info_filepath = [Path(f\"data/results/{context}/gene_info.csv\") for context in context_names]\n", - "como_context_dir = [Path(f\"data/COMO_input/{context}\") for context in context_names]\n", - "trna_matrix_filepath = [Path(f\"data/results/{context}/total-rna/totalrna_{context}.csv\") for context in context_names]\n", - "polya_matrix_filepath = [Path(f\"data/results/{context}/polya-rna/polyarna_{context}.csv\") for context in context_names]\n", - "\n", "\n", "for i in range(len(context_names)):\n", " await rnaseq_preprocess(\n", " context_name=context_names[i],\n", - " taxon=9606,\n", + " taxon=taxon_id,\n", " output_gene_info_filepath=gene_info_filepath[i],\n", " como_context_dir=como_context_dir[i],\n", " output_trna_config_filepath=Path(\"./data/config_sheets/trna_config.xlsx\"),\n", " output_trna_count_matrix_filepath=trna_matrix_filepath[i],\n", - " output_polya_config_filepath=Path(\"./data/config_sheets/mrna_config.xlsx\"),\n", + " output_polya_config_filepath=Path(\"./data/config_sheets/polya_config.xlsx\"),\n", " output_polya_count_matrix_filepath=polya_matrix_filepath[i],\n", " cache=True,\n", " log_level=\"INFO\",\n", @@ -373,35 +391,31 @@ "metadata": {}, "outputs": [], "source": [ - "from como.rnaseq_gen import rnaseq_gen\n", + "from como.rnaseq_gen import rnaseq_gen, FilteringTechnique\n", "\n", - "trnaseq_config_file = \"trnaseq_data_inputs_auto.xlsx\"\n", - "rep_ratio = 0.75\n", - "group_ratio = 0.75\n", - "rep_ratio_h = 1.0\n", - "group_ratio_h = 1.0\n", - "technique = \"zFPKM\"\n", - "minimum_cutoff = -3\n", - "taxon_id = \"human\"\n", - "\n", - "# fmt: off\n", - "cmd = \" \".join(\n", - " [\n", - " \"python3\", \"como/rnaseq_gen.py\",\n", - " \"--config-file\", trnaseq_config_file,\n", - " \"--replicate-ratio\", str(rep_ratio),\n", - " \"--batch-ratio\", str(group_ratio),\n", - " \"--high-replicate-ratio\", str(rep_ratio_h),\n", - " \"--high-batch-ratio\", str(group_ratio_h),\n", - " \"--minimum-cutoff\", str(minimum_cutoff),\n", - " \"--filt-technique\", f\"{technique}\",\n", - " \"--library-prep\", \"total\",\n", - " \"--taxon-id\", taxon_id\n", - " ]\n", - ")\n", - "# fmt: on\n", - "\n", - "!{cmd}" + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.zfpkm\n", + "cutoff = -3\n", + "\n", + "for i, context in enumerate(context_names):\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=trna_matrix_filepath[i],\n", + " input_gene_info_filepath=gene_info_filepath[i],\n", + " output_rnaseq_filepath=trna_matrix_filepath[i],\n", + " prep=RNAPrepMethod.TOTAL,\n", + " taxon=taxon_id,\n", + " input_metadata_filepath=Path(\"./data/config_sheets/trna_config.xlsx\"),\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" ] }, { @@ -429,38 +443,31 @@ "metadata": {}, "outputs": [], "source": [ - "mrnaseq_config_file = \"mrnaseq_data_inputs_auto.xlsx\"\n", - "rep_ratio = 0.75\n", - "group_ratio = 0.75\n", - "rep_ratio_h = 1.0\n", - "group_ratio_h = 1.0\n", - "technique = \"zfpkm\"\n", - "minimum_cutoff = -3\n", - "taxon_id = \"human\"\n", - "\n", - "await rnaseq_gen(\n", - " context_name=\"naiveB\",\n", - " input_rnaseq_filepath=\n", - ")\n", + "from como.rnaseq_gen import rnaseq_gen, FilteringTechnique\n", "\n", - "# fmt: off\n", - "cmd = \" \".join(\n", - " [\n", - " \"python3\", \"como/rnaseq_gen.py\",\n", - " \"--config-file\", mrnaseq_config_file,\n", - " \"--replicate-ratio\", str(rep_ratio),\n", - " \"--batch-ratio\", str(group_ratio),\n", - " \"--high-replicate-ratio\", str(rep_ratio_h),\n", - " \"--high-batch-ratio\", str(group_ratio_h),\n", - " \"--minimum-cutoff\", str(minimum_cutoff),\n", - " \"--filt-technique\", f\"{technique}\",\n", - " \"--library-prep\", \"mrna\",\n", - " \"--taxon-id\", taxon_id\n", - " ]\n", - ")\n", - "# fmt: on\n", - "\n", - "!{cmd}" + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.zfpkm\n", + "cutoff = -3\n", + "\n", + "for i, context in enumerate(context_names):\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=polya_matrix_filepath[i],\n", + " input_gene_info_filepath=gene_info_filepath[i],\n", + " output_rnaseq_filepath=polya_matrix_filepath[i],\n", + " prep=RNAPrepMethod.MRNA,\n", + " taxon=taxon_id,\n", + " input_metadata_filepath=Path(\"./data/config_sheets/mrna_config.xlsx\"),\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" ] }, { @@ -488,33 +495,31 @@ "metadata": {}, "outputs": [], "source": [ - "scrnaseq_config_file = \"scrnaseq_data_inputs_auto.xlsx\"\n", - "rep_ratio = 0.75\n", - "group_ratio = 0.75\n", - "rep_ratio_h = 1.0\n", - "group_ratio_h = 1.0\n", - "quantile = 50\n", - "minimum_cutoff = -3\n", - "taxon_id = \"human\"\n", - "\n", - "# fmt: off\n", - "cmd = \" \".join(\n", - " [\n", - " \"python3\", \"como/rnaseq_gen.py\",\n", - " \"--config-file\", scrnaseq_config_file,\n", - " \"--replicate-ratio\", str(rep_ratio),\n", - " \"--batch-ratio\", str(group_ratio),\n", - " \"--high-replicate-ratio\", str(rep_ratio_h),\n", - " \"--high-batch-ratio\", str(group_ratio_h),\n", - " \"--minimum-cutoff\", str(minimum_cutoff),\n", - " \"--filt-technique\", \"umi\",\n", - " \"--library-prep\", \"scrna\",\n", - " \"--taxon-id\", taxon_id\n", - " ]\n", - ")\n", - "# fmt: on\n", + "from como.rnaseq_gen import rnaseq_gen, FilteringTechnique\n", "\n", - "!{cmd}" + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.umi\n", + "cutoff = -3\n", + "\n", + "for i, context in enumerate(context_names):\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=scrna_matrix_filepath[i],\n", + " input_gene_info_filepath=gene_info_filepath[i],\n", + " output_rnaseq_filepath=scrna_matrix_filepath[i],\n", + " prep=RNAPrepMethod.SCRNA,\n", + " taxon=taxon_id,\n", + " input_metadata_filepath=Path(\"./data/config_sheets/scrna_config.xlsx\"),\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" ] }, { diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index ae8bcdea..bfa19bc7 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -448,11 +448,11 @@ async def read_counts(file: Path) -> list[str]: ) return conversion["entrez_gene_id"].tolist() - logger.info("Fetching gene info (this may take 1-5 minutes)") + logger.info( + "Fetching gene info (this may take 1-5 minutes depending on the number of genes and your internet connection)" + ) genes = set(chain.from_iterable(await asyncio.gather(*[read_counts(f) for f in counts_matrix_filepaths]))) - - mygene = MyGene(cache=cache) - gene_data = await mygene.query(items=list(genes), taxon=taxon, scopes="entrezgene") + gene_data = await MyGene(cache=cache).query(items=list(genes), taxon=taxon, scopes="entrezgene") gene_info: pd.DataFrame = pd.DataFrame( data=None, columns=pd.Index(data=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "start_position", "end_position"]), From 8bdddd911680e2ab4a2865aef8dfef25c6707ba0 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 10:17:49 -0600 Subject: [PATCH 28/62] style: variable rename --- main/como/rnaseq_gen.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 13619059..05f185f3 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -742,7 +742,7 @@ async def rnaseq_gen( # noqa: C901, allow complex function batch_ratio: float = 0.5, high_batch_ratio: float = 1.0, technique: FilteringTechnique | str = FilteringTechnique.tpm, - cut_off: int | float | None = None, + cutoff: int | float | None = None, ) -> None: """Generate a list of active and high-confidence genes from a gene count matrix. @@ -761,7 +761,7 @@ async def rnaseq_gen( # noqa: C901, allow complex function :param high_batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked "highly confident" in its expression :param technique: The filtering technique to use - :param cut_off: The cutoff value to use for the provided filtering technique + :param cutoff: The cutoff value to use for the provided filtering technique :return: None """ if not input_metadata_df and not input_metadata_filepath: @@ -773,18 +773,18 @@ async def rnaseq_gen( # noqa: C901, allow complex function match technique: case FilteringTechnique.tpm: - cut_off = cut_off or 25 - if cut_off < 1 or cut_off > 100: + cutoff = cutoff or 25 + if cutoff < 1 or cutoff > 100: raise ValueError("Quantile must be between 1 - 100") case FilteringTechnique.cpm: - if cut_off and cut_off < 0: + if cutoff and cutoff < 0: raise ValueError("Cutoff must be greater than 0") - elif cut_off: - cut_off = "default" + elif cutoff: + cutoff = "default" case FilteringTechnique.zfpkm: - cut_off = "default" if cut_off else cut_off + cutoff = "default" if cutoff else cutoff case FilteringTechnique.umi: pass case _: @@ -817,5 +817,5 @@ async def rnaseq_gen( # noqa: C901, allow complex function high_replicate_ratio=high_replicate_ratio, high_batch_ratio=high_batch_ratio, technique=technique, - cut_off=cut_off, + cut_off=cutoff, ) From e0d84bee1fbf4ea3a197bdd6af89f67d501d4872 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 10:18:31 -0600 Subject: [PATCH 29/62] feat: update to match new approach --- main/COMO.ipynb | 123 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 36 deletions(-) diff --git a/main/COMO.ipynb b/main/COMO.ipynb index 24d4b83c..758d719b 100644 --- a/main/COMO.ipynb +++ b/main/COMO.ipynb @@ -2,7 +2,11 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, "source": [ "# COMO: Constraint-based Optomization of Metabolic Objectives\n", "\n", @@ -56,7 +60,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, "source": [ "# Step 1: Data Preprocessing and Analysis\n", "\n", @@ -224,30 +232,33 @@ ] }, { + "cell_type": "code", + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2024-12-09T22:01:53.960204Z", - "start_time": "2024-12-09T22:01:53.957844Z" + "end_time": "2024-12-09T22:10:43.421233Z", + "start_time": "2024-12-09T22:10:43.418100Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "from pathlib import Path\n", "\n", + "from como.rnaseq_preprocess import rnaseq_preprocess\n", + "from como.types import RNAPrepMethod\n", + "\n", "taxon_id = 9606\n", "context_names = [\"naiveB\"]\n", "gene_info_filepath = [Path(f\"data/results/{context}/gene_info.csv\") for context in context_names]\n", "como_context_dir = [Path(f\"data/COMO_input/{context}\") for context in context_names]\n", - "trna_matrix_filepath = [Path(f\"data/results/{context}/trna-rna/trna_{context}.csv\") for context in context_names]\n", + "trna_matrix_filepath = [Path(f\"data/results/{context}/total-rna/totalrna_{context}.csv\") for context in context_names]\n", "polya_matrix_filepath = [Path(f\"data/results/{context}/polya-rna/polyarna_{context}.csv\") for context in context_names]\n", "\n", "# No single-cell data is provided by default; COMO accepts single-cell data in CSV or h5ad format\n", "# If you are using single-cell data, adjust the following lines accordingly\n", "scrna_matrix_filepath = [Path(f\"data/results/{context}/scrna/scrna_{context}.csv\") for context in context_names]\n", "# scrna_matrix_filepath = [Path(f\"data/results/{context}/scrna/scrna_{context}.h5ad\") for context in context_names]\n" - ], - "outputs": [], - "execution_count": 3 + ] }, { "cell_type": "markdown", @@ -262,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2024-12-07T03:30:27.253112Z", @@ -274,32 +285,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[32m2024-12-06 23:12:10\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m629\u001B[0m - \u001B[1mTEST\u001B[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[32m2024-12-06 23:12:11\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'total' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001B[0m\n", - "\u001B[32m2024-12-06 23:12:11\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'polya' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/polya-rna/polyarna_naiveB.csv'\u001B[0m\n", - "\u001B[32m2024-12-06 23:12:11\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m451\u001B[0m - \u001B[1mFetching gene info (this may take 1-5 minutes)\u001B[0m\n", - "\u001B[32m2024-12-06 23:13:04\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m488\u001B[0m - \u001B[32m\u001B[1mGene Info file written at '/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info.csv'\u001B[0m\n" + "\u001b[32m2024-12-09 16:23:55\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m274\u001b[0m - \u001b[32m\u001b[1mWrote gene count matrix for 'polya' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/polya-rna/polyarna_naiveB.csv'\u001b[0m\n", + "\u001b[32m2024-12-09 16:23:55\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m274\u001b[0m - \u001b[32m\u001b[1mWrote gene count matrix for 'total' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001b[0m\n", + "\u001b[32m2024-12-09 16:23:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m451\u001b[0m - \u001b[1mFetching gene info (this may take 1-5 minutes)\u001b[0m\n", + "\u001b[32m2024-12-09 16:24:13\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m488\u001b[0m - \u001b[32m\u001b[1mGene Info file written at '/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info.csv'\u001b[0m\n" ] } ], "source": [ - "from como.rnaseq_preprocess import rnaseq_preprocess\n", - "from como.types import RNAPrepMethod\n", - "\n", - "\n", "for i in range(len(context_names)):\n", " await rnaseq_preprocess(\n", " context_name=context_names[i],\n", @@ -387,9 +380,67 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-09T22:13:42.060657Z", + "start_time": "2024-12-09T22:13:41.740347Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-12-09 16:18:43.958\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcomo.rnaseq_gen\u001b[0m:\u001b[36mrnaseq_gen\u001b[0m:\u001b[36m805\u001b[0m - \u001b[34m\u001b[1mStarting 'naiveB'\u001b[0m\n", + "\u001b[32m2024-12-09 16:18:43.961\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcomo.rnaseq_gen\u001b[0m:\u001b[36m_read_counts\u001b[0m:\u001b[36m175\u001b[0m - \u001b[34m\u001b[1mReading CSV file at 'data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " entrez_gene_id expressed high\n", + "0 7105 0 0\n", + "1 64102 0 0\n", + "2 8813 0 0\n", + "3 57147 0 0\n", + "4 55732 0 0\n", + "... ... ... ...\n", + "34396 124901321 0 0\n", + "34397 124902403 0 0\n", + "34398 101929614 0 0\n", + "34399 107984888 0 0\n", + "34400 124900697 0 0\n", + "\n", + "[34401 rows x 3 columns]\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'ensembl_gene_id'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'ensembl_gene_id'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 11\u001b[0m\n\u001b[1;32m 8\u001b[0m cutoff \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, context \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(context_names):\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m rnaseq_gen( \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 12\u001b[0m context_name\u001b[38;5;241m=\u001b[39mcontext,\n\u001b[1;32m 13\u001b[0m input_rnaseq_filepath\u001b[38;5;241m=\u001b[39mtrna_matrix_filepath[i],\n\u001b[1;32m 14\u001b[0m input_gene_info_filepath\u001b[38;5;241m=\u001b[39mgene_info_filepath[i],\n\u001b[1;32m 15\u001b[0m output_rnaseq_filepath\u001b[38;5;241m=\u001b[39mtrna_matrix_filepath[i],\n\u001b[1;32m 16\u001b[0m prep\u001b[38;5;241m=\u001b[39mRNAPrepMethod\u001b[38;5;241m.\u001b[39mTOTAL,\n\u001b[1;32m 17\u001b[0m taxon\u001b[38;5;241m=\u001b[39mtaxon_id,\n\u001b[1;32m 18\u001b[0m input_metadata_filepath\u001b[38;5;241m=\u001b[39mPath(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./data/config_sheets/trna_config.xlsx\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 19\u001b[0m replicate_ratio\u001b[38;5;241m=\u001b[39mreplicate_ratio,\n\u001b[1;32m 20\u001b[0m high_replicate_ratio\u001b[38;5;241m=\u001b[39mhigh_confidence_replicate_ratio,\n\u001b[1;32m 21\u001b[0m batch_ratio\u001b[38;5;241m=\u001b[39mbatch_ratio,\n\u001b[1;32m 22\u001b[0m high_batch_ratio\u001b[38;5;241m=\u001b[39mhigh_confidence_batch_ratio,\n\u001b[1;32m 23\u001b[0m technique\u001b[38;5;241m=\u001b[39mtechnique,\n\u001b[1;32m 24\u001b[0m cutoff\u001b[38;5;241m=\u001b[39mcutoff\n\u001b[1;32m 25\u001b[0m )\n", + "File \u001b[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:808\u001b[0m, in \u001b[0;36mrnaseq_gen\u001b[0;34m(context_name, input_rnaseq_filepath, input_gene_info_filepath, output_rnaseq_filepath, prep, taxon, input_metadata_filepath, input_metadata_df, replicate_ratio, high_replicate_ratio, batch_ratio, high_batch_ratio, technique, cutoff)\u001b[0m\n\u001b[1;32m 805\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStarting \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcontext_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 806\u001b[0m output_rnaseq_filepath\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 808\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m _save_rnaseq_tests(\n\u001b[1;32m 809\u001b[0m context_name\u001b[38;5;241m=\u001b[39mcontext_name,\n\u001b[1;32m 810\u001b[0m rnaseq_matrix\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mawait\u001b[39;00m _read_counts(input_rnaseq_filepath),\n\u001b[1;32m 811\u001b[0m metadata_df\u001b[38;5;241m=\u001b[39minput_metadata_df \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _create_metadata_df(input_metadata_filepath),\n\u001b[1;32m 812\u001b[0m gene_info_df\u001b[38;5;241m=\u001b[39mpd\u001b[38;5;241m.\u001b[39mread_csv(input_gene_info_filepath),\n\u001b[1;32m 813\u001b[0m output_filepath\u001b[38;5;241m=\u001b[39moutput_rnaseq_filepath,\n\u001b[1;32m 814\u001b[0m prep\u001b[38;5;241m=\u001b[39mprep,\n\u001b[1;32m 815\u001b[0m taxon\u001b[38;5;241m=\u001b[39mtaxon,\n\u001b[1;32m 816\u001b[0m replicate_ratio\u001b[38;5;241m=\u001b[39mreplicate_ratio,\n\u001b[1;32m 817\u001b[0m batch_ratio\u001b[38;5;241m=\u001b[39mbatch_ratio,\n\u001b[1;32m 818\u001b[0m high_replicate_ratio\u001b[38;5;241m=\u001b[39mhigh_replicate_ratio,\n\u001b[1;32m 819\u001b[0m high_batch_ratio\u001b[38;5;241m=\u001b[39mhigh_batch_ratio,\n\u001b[1;32m 820\u001b[0m technique\u001b[38;5;241m=\u001b[39mtechnique,\n\u001b[1;32m 821\u001b[0m cut_off\u001b[38;5;241m=\u001b[39mcutoff,\n\u001b[1;32m 822\u001b[0m )\n", + "File \u001b[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:671\u001b[0m, in \u001b[0;36m_save_rnaseq_tests\u001b[0;34m(context_name, rnaseq_matrix, metadata_df, gene_info_df, output_filepath, prep, taxon, replicate_ratio, batch_ratio, high_replicate_ratio, high_batch_ratio, technique, cut_off)\u001b[0m\n\u001b[1;32m 662\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Save the results of the RNA-Seq tests to a CSV file.\"\"\"\u001b[39;00m\n\u001b[1;32m 663\u001b[0m filtering_options \u001b[38;5;241m=\u001b[39m _FilteringOptions(\n\u001b[1;32m 664\u001b[0m replicate_ratio\u001b[38;5;241m=\u001b[39mreplicate_ratio,\n\u001b[1;32m 665\u001b[0m batch_ratio\u001b[38;5;241m=\u001b[39mbatch_ratio,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 668\u001b[0m high_batch_ratio\u001b[38;5;241m=\u001b[39mhigh_batch_ratio,\n\u001b[1;32m 669\u001b[0m )\n\u001b[0;32m--> 671\u001b[0m read_counts_results: _ReadMatrixResults \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m _build_matrix_results(\n\u001b[1;32m 672\u001b[0m matrix\u001b[38;5;241m=\u001b[39mrnaseq_matrix,\n\u001b[1;32m 673\u001b[0m gene_info\u001b[38;5;241m=\u001b[39mgene_info_df,\n\u001b[1;32m 674\u001b[0m metadata_df\u001b[38;5;241m=\u001b[39mmetadata_df,\n\u001b[1;32m 675\u001b[0m taxon\u001b[38;5;241m=\u001b[39mtaxon,\n\u001b[1;32m 676\u001b[0m )\n\u001b[1;32m 677\u001b[0m metrics \u001b[38;5;241m=\u001b[39m read_counts_results\u001b[38;5;241m.\u001b[39mmetrics\n\u001b[1;32m 678\u001b[0m entrez_gene_ids \u001b[38;5;241m=\u001b[39m read_counts_results\u001b[38;5;241m.\u001b[39mentrez_gene_ids\n", + "File \u001b[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:201\u001b[0m, in \u001b[0;36m_build_matrix_results\u001b[0;34m(matrix, gene_info, metadata_df, taxon)\u001b[0m\n\u001b[1;32m 199\u001b[0m gene_info \u001b[38;5;241m=\u001b[39m gene_info_migrations(gene_info)\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28mprint\u001b[39m(matrix)\n\u001b[0;32m--> 201\u001b[0m conversion \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m ensembl_to_gene_id_and_symbol(ids\u001b[38;5;241m=\u001b[39m\u001b[43mmatrix\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mensembl_gene_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mtolist(), taxon\u001b[38;5;241m=\u001b[39mtaxon)\n\u001b[1;32m 202\u001b[0m matrix \u001b[38;5;241m=\u001b[39m matrix\u001b[38;5;241m.\u001b[39mmerge(conversion, on\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mensembl_gene_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, how\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mleft\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 204\u001b[0m \u001b[38;5;66;03m# Only include Entrez and Ensembl Gene IDs that are present in `gene_info`\u001b[39;00m\n", + "File \u001b[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'ensembl_gene_id'" + ] + } + ], "source": [ "from como.rnaseq_gen import rnaseq_gen, FilteringTechnique\n", "\n", @@ -401,7 +452,7 @@ "cutoff = -3\n", "\n", "for i, context in enumerate(context_names):\n", - " await rnaseq_gen(\n", + " await rnaseq_gen( # noqa\n", " context_name=context,\n", " input_rnaseq_filepath=trna_matrix_filepath[i],\n", " input_gene_info_filepath=gene_info_filepath[i],\n", From b0e4801b8a27e4d16f43f174526f042c630b2332 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:01:02 +0000 Subject: [PATCH 30/62] chore(deps): bump astral-sh/setup-uv from 3 to 4 Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 3 to 4. - [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/v3...v4) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/continuous_integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 8109f213..7bd2c871 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -14,7 +14,7 @@ jobs: uses: actions/checkout@v4 - name: Install uv - uses: astral-sh/setup-uv@v3 + uses: astral-sh/setup-uv@v4 - name: Create Virtual Environment run: uv venv From db6e683802f79a5010c9bb80e9adb07c477e34c9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:01:05 +0000 Subject: [PATCH 31/62] chore(deps): bump astral-sh/ruff-action from 1 to 2 Bumps [astral-sh/ruff-action](https://github.com/astral-sh/ruff-action) from 1 to 2. - [Release notes](https://github.com/astral-sh/ruff-action/releases) - [Commits](https://github.com/astral-sh/ruff-action/compare/v1...v2) --- updated-dependencies: - dependency-name: astral-sh/ruff-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/continuous_integration.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 8109f213..869f587d 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -26,17 +26,17 @@ jobs: run: uv run jupyter nbconvert --clear-output --inplace "main/COMO.ipynb" - name: Format Python Imports - uses: astral-sh/ruff-action@v1 + uses: astral-sh/ruff-action@v2 with: args: "check --fix --select I" - name: Format code - uses: astral-sh/ruff-action@v1 + uses: astral-sh/ruff-action@v2 with: args: "format" - name: Format Notebook - uses: astral-sh/ruff-action@v1 + uses: astral-sh/ruff-action@v2 with: args: "format main/COMO.ipynb" @@ -54,7 +54,7 @@ jobs: uses: actions/checkout@v4 - name: Check Lint - uses: astral-sh/ruff-action@v1 + uses: astral-sh/ruff-action@v2 with: args: "check --no-fix --verbose" From dd3eb0dbc186d6e6fff0b3f27e412cc0ea0f4abe Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 13:12:55 -0600 Subject: [PATCH 32/62] fix: set solver using cobra configuration Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 8945d8c1..35fb251b 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -442,6 +442,7 @@ def _build_model( # noqa: C901 # set solver reference_model.solver = solver.lower() + cobra.Configuration().solver = solver.lower() # check number of unsolvable reactions for reference model under media assumptions # incon_rxns, cobra_model = _feasibility_test(cobra_model, "before_seeding") @@ -626,10 +627,10 @@ def create_context_specific_model( # noqa: C901 raise ValueError(f"Output file type {output_type} not recognized. Must be one of: 'xml', 'mat', 'json'") if algorithm not in Algorithm: - raise ValueError(f"Algorithm {algorithm} not supported. Please use one of: GIMME, FASTCORE, or IMAT") + raise ValueError(f"Algorithm {algorithm} not supported. Use one of {', '.join(a.value for a in Algorithm)}") if solver not in Solver: - raise ValueError(f"Solver '{solver}' not supported. Use 'GLPK' or 'GUROBI'") + raise ValueError(f"Solver '{solver}' not supported. Use one of {', '.join(s.value for s in Solver)}") if boundary_rxns_filepath: boundary_reactions = _collect_boundary_reactions(boundary_rxns_filepath) @@ -662,7 +663,7 @@ def create_context_specific_model( # noqa: C901 bound_ub=boundary_reactions.upper_bounds, exclude_rxns=exclude_rxns, force_rxns=force_rxns, - solver=solver.value, + solver=solver.value.lower(), low_thresh=low_threshold, high_thresh=high_threshold, ) From 967445906d260cff83429a6ef1c1fc43694ef929 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 14:48:32 -0600 Subject: [PATCH 33/62] revert: use mrna instead of polya --- main/como/rnaseq_preprocess.py | 26 +++++++++++++------------- main/como/types.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index bfa19bc7..d033f752 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -523,7 +523,7 @@ async def _process( if output_trna_config_filepath: rna_types.append(("total", output_trna_config_filepath, output_trna_matrix_filepath)) if output_mrna_config_filepath: - rna_types.append(("polya", output_mrna_config_filepath, output_mrna_matrix_filepath)) + rna_types.append(("mrna", output_mrna_config_filepath, output_mrna_matrix_filepath)) # if provided, iterate through como-input specific directories tasks = [] @@ -563,9 +563,9 @@ async def rnaseq_preprocess( input_matrix_filepath: Path | list[Path] | None = None, preparation_method: RNAPrepMethod | list[RNAPrepMethod] | None = None, output_trna_config_filepath: Path | None = None, - output_polya_config_filepath: Path | None = None, + output_mrna_config_filepath: Path | None = None, output_trna_count_matrix_filepath: Path | None = None, - output_polya_count_matrix_filepath: Path | None = None, + output_mrna_count_matrix_filepath: Path | None = None, cache: bool = True, log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", log_location: str | TextIOWrapper = sys.stderr, @@ -579,9 +579,9 @@ async def rnaseq_preprocess( :param taxon: The NCBI taxonomy ID :param output_gene_info_filepath: Path to the output gene information CSV file :param output_trna_config_filepath: Path to the output tRNA config file (if in "create" mode) - :param output_polya_config_filepath: Path to the output mRNA config file (if in "create" mode) + :param output_mrna_config_filepath: Path to the output mRNA config file (if in "create" mode) :param output_trna_count_matrix_filepath: The path to write total RNA count matrices - :param output_polya_count_matrix_filepath: The path to write messenger RNA count matrices + :param output_mrna_count_matrix_filepath: The path to write messenger RNA count matrices :param como_context_dir: If in "create" mode, the input path(s) to the COMO_input directory of the current context i.e., the directory containing "fragmentSizes", "geneCounts", "insertSizeMetrics", etc. directories :param input_matrix_filepath: If in "provide" mode, the path(s) to the count matrices to be processed @@ -604,18 +604,18 @@ async def rnaseq_preprocess( output_trna_config_filepath = ( output_trna_config_filepath.resolve() if output_trna_config_filepath else output_trna_config_filepath ) - output_polya_config_filepath = ( - output_polya_config_filepath.resolve() if output_polya_config_filepath else output_polya_config_filepath + output_mrna_config_filepath = ( + output_mrna_config_filepath.resolve() if output_mrna_config_filepath else output_mrna_config_filepath ) output_trna_count_matrix_filepath = ( output_trna_count_matrix_filepath.resolve() if output_trna_count_matrix_filepath else output_trna_count_matrix_filepath ) - output_polya_count_matrix_filepath = ( - output_polya_count_matrix_filepath.resolve() - if output_polya_count_matrix_filepath - else output_polya_count_matrix_filepath + output_mrna_count_matrix_filepath = ( + output_mrna_count_matrix_filepath.resolve() + if output_mrna_count_matrix_filepath + else output_mrna_count_matrix_filepath ) input_matrix_filepath = _listify(input_matrix_filepath) @@ -633,8 +633,8 @@ async def rnaseq_preprocess( input_matrix_filepath=input_matrix_filepath, output_gene_info_filepath=output_gene_info_filepath, output_trna_config_filepath=output_trna_config_filepath, - output_mrna_config_filepath=output_polya_config_filepath, + output_mrna_config_filepath=output_mrna_config_filepath, output_trna_matrix_filepath=output_trna_count_matrix_filepath, - output_mrna_matrix_filepath=output_polya_count_matrix_filepath, + output_mrna_matrix_filepath=output_mrna_count_matrix_filepath, cache=cache, ) diff --git a/main/como/types.py b/main/como/types.py index ebe44ecf..42908252 100644 --- a/main/como/types.py +++ b/main/como/types.py @@ -47,4 +47,4 @@ def from_string(value: str) -> RNAPrepMethod: type_path = str | Path -type_rna = Literal["total", "polya"] +type_rna = Literal["total", "mrna"] From cde66061b1f410b56f5ada7955870763c8247a7a Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 14:50:40 -0600 Subject: [PATCH 34/62] refactor: check files returned Instead of checking if the stem of directories match, check that the same number of files are returned. Because the directories are sorted (and this comes from FastqToGeneCounts), they should always match --- main/como/rnaseq_preprocess.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index d033f752..f7e65040 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -138,16 +138,18 @@ def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]: # For each study, collect gene count files, fragment files, insert size files, layouts, and strandedness information study_metrics: list[_StudyMetrics] = [] for gene_dir, strand_dir in zip(gene_counts_directories, strandedness_directories): - if gene_dir.stem != strand_dir.stem: - raise ValueError( - f"Gene directory name of '{gene_dir.stem}' does not match stranded directory name of '{strand_dir.stem}'" # noqa: E501 - ) + count_files = list(gene_dir.glob("*.tab")) + strand_files = list(strand_dir.glob("*.txt")) + if len(count_files) == 0: + raise ValueError(f"No count files found for study '{gene_dir.stem}'.") + if len(strand_files) == 0: + raise ValueError(f"No strandedness files found for study '{gene_dir.stem}'.") study_metrics.append( _StudyMetrics( study_name=gene_dir.stem, - count_files=list(gene_dir.glob("*.tab")), - strand_files=list(strand_dir.glob("*.txt")), + count_files=count_files, + strand_files=strand_files, ) ) return study_metrics @@ -262,6 +264,7 @@ async def _write_counts_matrix( counts: list[pd.DataFrame] = await asyncio.gather( *[_create_sample_counts_matrix(metric) for metric in study_metrics] ) + final_matrix = pd.DataFrame() for count in counts: final_matrix = count if final_matrix.empty else pd.merge(final_matrix, count, on="ensembl_gene_id", how="outer") From b14d3f9b8c26699346f710c73f3e1cec91327fa5 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 14:51:10 -0600 Subject: [PATCH 35/62] feat: allow specifying specific directories --- main/como/rnaseq_preprocess.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index f7e65040..9da1a5e2 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -278,13 +278,28 @@ async def _write_counts_matrix( return final_matrix -async def _create_config_df(context_name: str, /, como_input_dir: Path) -> pd.DataFrame: # noqa: C901 +async def _create_config_df( + context_name: str, + /, + como_context_dir: Path, + gene_count_dirname: str = "geneCounts", + layout_dirname: str = "layouts", + strandedness_dirname: str = "strandedness", + fragment_sizes_dirname: str = "fragmentSizes", + prep_method_dirname: str = "prepMethods", +) -> pd.DataFrame: """Create configuration sheet. The configuration file created is based on the gene counts matrix. If using zFPKM normalization technique, mean fragment lengths will be fetched """ - gene_counts_files = list(Path(como_input_dir, context_name, "geneCounts").rglob("*.tab")) + gene_counts_dir = como_context_dir / gene_count_dirname + layout_dir = como_context_dir / layout_dirname + strandedness_dir = como_context_dir / strandedness_dirname + fragment_sizes_dir = como_context_dir / fragment_sizes_dirname + prep_method_dir = como_context_dir / prep_method_dirname + + gene_counts_files = list(gene_counts_dir.rglob("*.tab")) sample_names: list[str] = [] fragment_lengths: list[int | float] = [] layouts: list[str] = [] @@ -292,6 +307,9 @@ async def _create_config_df(context_name: str, /, como_input_dir: Path) -> pd.Da groups: list[str] = [] preparation_method: list[str] = [] + if len(gene_counts_files) == 0: + raise FileNotFoundError(f"No gene count files found in '{gene_counts_dir}'.") + for gene_count_filename in sorted(gene_counts_files): try: # Match S___R___r___ From d52b0ee1c766d6c15aa31ac98518590f03b3960b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 14:52:01 -0600 Subject: [PATCH 36/62] style: use more descriptive variable names --- main/como/rnaseq_preprocess.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 9da1a5e2..98a283f2 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -311,27 +311,25 @@ async def _create_config_df( raise FileNotFoundError(f"No gene count files found in '{gene_counts_dir}'.") for gene_count_filename in sorted(gene_counts_files): - try: - # Match S___R___r___ - # \d{1,3} matches 1-3 digits - # (?:r\d{1,3})? matches an option "r" followed by three digits - label = re.findall(r"S\d{1,3}R\d{1,3}(?:r\d{1,3})?", gene_count_filename.as_posix())[0] - - except IndexError as e: - raise IndexError( + # Match S___R___r___ + # \d{1,3} matches 1-3 digits + # (?:r\d{1,3})? optionally matches a "r" followed by three digits + label = re.findall(r"S\d{1,3}R\d{1,3}(?:r\d{1,3})?", gene_count_filename.as_posix())[0] + if not label: + raise ValueError( f"\n\nFilename of '{gene_count_filename}' is not valid. " f"Should be 'contextName_SXRYrZ.tab', where X is the study/batch number, Y is the replicate number, " f"and Z is the run number." "\n\nIf not a multi-run sample, exclude 'rZ' from the filename." - ) from e + ) study_number = re.findall(r"S\d{1,3}", label)[0] rep_number = re.findall(r"R\d{1,3}", label)[0] - run = re.findall(r"r\d{1,3}", label) + run_number = re.findall(r"r\d{1,3}", label) multi_flag = 0 - if len(run) > 0: - if run[0] != "r1": + if len(run_number) > 0: + if run_number[0] != "r1": continue else: label_glob = study_number + rep_number + "r*" From fe1d406c4bd3571394b588a1ead61c64526a4555 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 14:52:35 -0600 Subject: [PATCH 37/62] refactor: use early continue Removes an extra indentation --- main/como/rnaseq_preprocess.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 98a283f2..0a0c7a35 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -331,23 +331,21 @@ async def _create_config_df( if len(run_number) > 0: if run_number[0] != "r1": continue - else: - label_glob = study_number + rep_number + "r*" - runs = [run for run in gene_counts_files if re.search(label_glob, run.as_posix())] - multi_flag = 1 - frag_files = [] - - for r in runs: - r_label = re.findall(r"r\d{1,3}", r.as_posix())[0] - R_label = re.findall(r"R\d{1,3}", r.as_posix())[0] # noqa: N806 - frag_filename = "".join([context_name, "_", study_number, R_label, r_label, "_fragment_size.txt"]) - frag_files.append(como_input_dir / context_name / "fragmentSizes" / study_number / frag_filename) - - context_path = como_input_dir / context_name - layout_files: list[Path] = list((context_path / "layouts").rglob(f"{context_name}_{label}_layout.txt")) - strand_files: list[Path] = list((context_path / "strandedness").rglob(f"{context_name}_{label}_strandedness.txt")) # fmt: skip # noqa: E501 - frag_files: list[Path] = list((context_path / "fragmentSizes").rglob(f"{context_name}_{label}_fragment_size.txt")) # fmt: skip # noqa: E501 - prep_files: list[Path] = list((context_path / "prepMethods").rglob(f"{context_name}_{label}_prep_method.txt")) + label_glob = f"{study_number}{rep_number}r*" # S__R__r* + runs = [run for run in gene_counts_files if re.search(label_glob, run.as_posix())] + multi_flag = 1 + frag_files = [] + + for run in runs: + run_number = re.findall(r"R\d{1,3}", run.as_posix())[0] + replicate = re.findall(r"r\d{1,3}", run.as_posix())[0] + frag_filename = "".join([context_name, "_", study_number, run_number, replicate, "_fragment_size.txt"]) + frag_files.append(como_context_dir / fragment_sizes_dirname / study_number / frag_filename) + + layout_files: list[Path] = list(layout_dir.rglob(f"{context_name}_{label}_layout.txt")) + strand_files: list[Path] = list(strandedness_dir.rglob(f"{context_name}_{label}_strandedness.txt")) + frag_files: list[Path] = list(fragment_sizes_dir.rglob(f"{context_name}_{label}_fragment_size.txt")) + prep_files: list[Path] = list(prep_method_dir.rglob(f"{context_name}_{label}_prep_method.txt")) layout = "UNKNOWN" if len(layout_files) == 0: From 8f1d02782550c70c23f447f61d7ed9b01906c7e4 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 14:52:50 -0600 Subject: [PATCH 38/62] style: update warning messages --- main/como/rnaseq_preprocess.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 0a0c7a35..5c2d8125 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -351,7 +351,7 @@ async def _create_config_df( if len(layout_files) == 0: logger.warning( f"No layout file found for {label}, writing as 'UNKNOWN', " - f"this should be defined by user if using zFPKM or rnaseq_gen.py will not run" + f"this should be defined if you are using zFPKM or downstream 'rnaseq_gen.py' will not run" ) elif len(layout_files) == 1: with layout_files[0].open("r") as file: @@ -380,7 +380,7 @@ async def _create_config_df( prep = "total" if len(prep_files) == 0: - logger.warning(f"No prep file found for {label}, assuming 'total' as in Total RNA library preparation") + logger.warning(f"No prep file found for {label}, assuming 'total', as in 'Total RNA' library preparation") elif len(prep_files) == 1: with prep_files[0].open("r") as file: prep = file.read().strip().lower() @@ -393,10 +393,10 @@ async def _create_config_df( ) mean_fragment_size = 100 - if len(frag_files) == 0: + if len(frag_files) == 0 and prep != RNAPrepMethod.TOTAL.value: logger.warning( f"No fragment file found for {label}, using '100'. " - f"This must be defined by the user in order to use zFPKM normalization" + "You should define this if you are going to use downstream zFPKM normalization" ) elif len(frag_files) == 1: if layout == "single-end": @@ -512,7 +512,7 @@ async def _create_matrix_file( output_counts_matrix_filepath: Path, rna: type_rna, ) -> None: - config_df = await _create_config_df(context_name, como_input_dir=como_context_dir) + config_df = await _create_config_df(context_name, como_context_dir=como_context_dir) await _write_counts_matrix( config_df=config_df, como_context_dir=como_context_dir, From 0ef736e34b5479fa87d17c1db48002a32a38910d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 14:53:20 -0600 Subject: [PATCH 39/62] style: rename variables --- main/COMO.ipynb | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/main/COMO.ipynb b/main/COMO.ipynb index 758d719b..52d7fb88 100644 --- a/main/COMO.ipynb +++ b/main/COMO.ipynb @@ -233,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2024-12-09T22:10:43.421233Z", @@ -285,10 +285,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-12-09 16:23:55\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m274\u001b[0m - \u001b[32m\u001b[1mWrote gene count matrix for 'polya' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/polya-rna/polyarna_naiveB.csv'\u001b[0m\n", - "\u001b[32m2024-12-09 16:23:55\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m274\u001b[0m - \u001b[32m\u001b[1mWrote gene count matrix for 'total' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001b[0m\n", - "\u001b[32m2024-12-09 16:23:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m451\u001b[0m - \u001b[1mFetching gene info (this may take 1-5 minutes)\u001b[0m\n", - "\u001b[32m2024-12-09 16:24:13\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mcomo.rnaseq_preprocess\u001b[0m:\u001b[36m488\u001b[0m - \u001b[32m\u001b[1mGene Info file written at '/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info.csv'\u001b[0m\n" + "\u001B[32m2024-12-09 16:23:55\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'polya' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/polya-rna/polyarna_naiveB.csv'\u001B[0m\n", + "\u001B[32m2024-12-09 16:23:55\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'total' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001B[0m\n", + "\u001B[32m2024-12-09 16:23:55\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m451\u001B[0m - \u001B[1mFetching gene info (this may take 1-5 minutes)\u001B[0m\n", + "\u001B[32m2024-12-09 16:24:13\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m488\u001B[0m - \u001B[32m\u001B[1mGene Info file written at '/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info.csv'\u001B[0m\n" ] } ], @@ -301,8 +301,8 @@ " como_context_dir=como_context_dir[i],\n", " output_trna_config_filepath=Path(\"./data/config_sheets/trna_config.xlsx\"),\n", " output_trna_count_matrix_filepath=trna_matrix_filepath[i],\n", - " output_polya_config_filepath=Path(\"./data/config_sheets/polya_config.xlsx\"),\n", - " output_polya_count_matrix_filepath=polya_matrix_filepath[i],\n", + " output_mrna_config_filepath=Path(\"./data/config_sheets/polya_config.xlsx\"),\n", + " output_mrna_count_matrix_filepath=polya_matrix_filepath[i],\n", " cache=True,\n", " log_level=\"INFO\",\n", " )" @@ -392,8 +392,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-12-09 16:18:43.958\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcomo.rnaseq_gen\u001b[0m:\u001b[36mrnaseq_gen\u001b[0m:\u001b[36m805\u001b[0m - \u001b[34m\u001b[1mStarting 'naiveB'\u001b[0m\n", - "\u001b[32m2024-12-09 16:18:43.961\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcomo.rnaseq_gen\u001b[0m:\u001b[36m_read_counts\u001b[0m:\u001b[36m175\u001b[0m - \u001b[34m\u001b[1mReading CSV file at 'data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001b[0m\n" + "\u001B[32m2024-12-09 16:18:43.958\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mcomo.rnaseq_gen\u001B[0m:\u001B[36mrnaseq_gen\u001B[0m:\u001B[36m805\u001B[0m - \u001B[34m\u001B[1mStarting 'naiveB'\u001B[0m\n", + "\u001B[32m2024-12-09 16:18:43.961\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mcomo.rnaseq_gen\u001B[0m:\u001B[36m_read_counts\u001B[0m:\u001B[36m175\u001B[0m - \u001B[34m\u001B[1mReading CSV file at 'data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001B[0m\n" ] }, { @@ -421,23 +421,23 @@ "evalue": "'ensembl_gene_id'", "output_type": "error", "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'ensembl_gene_id'", + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", + "File \u001B[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3804\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m-> 3805\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3806\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n", + "File \u001B[0;32mindex.pyx:167\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n", + "File \u001B[0;32mindex.pyx:196\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n", + "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n", + "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n", + "\u001B[0;31mKeyError\u001B[0m: 'ensembl_gene_id'", "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 11\u001b[0m\n\u001b[1;32m 8\u001b[0m cutoff \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, context \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(context_names):\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m rnaseq_gen( \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 12\u001b[0m context_name\u001b[38;5;241m=\u001b[39mcontext,\n\u001b[1;32m 13\u001b[0m input_rnaseq_filepath\u001b[38;5;241m=\u001b[39mtrna_matrix_filepath[i],\n\u001b[1;32m 14\u001b[0m input_gene_info_filepath\u001b[38;5;241m=\u001b[39mgene_info_filepath[i],\n\u001b[1;32m 15\u001b[0m output_rnaseq_filepath\u001b[38;5;241m=\u001b[39mtrna_matrix_filepath[i],\n\u001b[1;32m 16\u001b[0m prep\u001b[38;5;241m=\u001b[39mRNAPrepMethod\u001b[38;5;241m.\u001b[39mTOTAL,\n\u001b[1;32m 17\u001b[0m taxon\u001b[38;5;241m=\u001b[39mtaxon_id,\n\u001b[1;32m 18\u001b[0m input_metadata_filepath\u001b[38;5;241m=\u001b[39mPath(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./data/config_sheets/trna_config.xlsx\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 19\u001b[0m replicate_ratio\u001b[38;5;241m=\u001b[39mreplicate_ratio,\n\u001b[1;32m 20\u001b[0m high_replicate_ratio\u001b[38;5;241m=\u001b[39mhigh_confidence_replicate_ratio,\n\u001b[1;32m 21\u001b[0m batch_ratio\u001b[38;5;241m=\u001b[39mbatch_ratio,\n\u001b[1;32m 22\u001b[0m high_batch_ratio\u001b[38;5;241m=\u001b[39mhigh_confidence_batch_ratio,\n\u001b[1;32m 23\u001b[0m technique\u001b[38;5;241m=\u001b[39mtechnique,\n\u001b[1;32m 24\u001b[0m cutoff\u001b[38;5;241m=\u001b[39mcutoff\n\u001b[1;32m 25\u001b[0m )\n", - "File \u001b[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:808\u001b[0m, in \u001b[0;36mrnaseq_gen\u001b[0;34m(context_name, input_rnaseq_filepath, input_gene_info_filepath, output_rnaseq_filepath, prep, taxon, input_metadata_filepath, input_metadata_df, replicate_ratio, high_replicate_ratio, batch_ratio, high_batch_ratio, technique, cutoff)\u001b[0m\n\u001b[1;32m 805\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStarting \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcontext_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 806\u001b[0m output_rnaseq_filepath\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 808\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m _save_rnaseq_tests(\n\u001b[1;32m 809\u001b[0m context_name\u001b[38;5;241m=\u001b[39mcontext_name,\n\u001b[1;32m 810\u001b[0m rnaseq_matrix\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mawait\u001b[39;00m _read_counts(input_rnaseq_filepath),\n\u001b[1;32m 811\u001b[0m metadata_df\u001b[38;5;241m=\u001b[39minput_metadata_df \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _create_metadata_df(input_metadata_filepath),\n\u001b[1;32m 812\u001b[0m gene_info_df\u001b[38;5;241m=\u001b[39mpd\u001b[38;5;241m.\u001b[39mread_csv(input_gene_info_filepath),\n\u001b[1;32m 813\u001b[0m output_filepath\u001b[38;5;241m=\u001b[39moutput_rnaseq_filepath,\n\u001b[1;32m 814\u001b[0m prep\u001b[38;5;241m=\u001b[39mprep,\n\u001b[1;32m 815\u001b[0m taxon\u001b[38;5;241m=\u001b[39mtaxon,\n\u001b[1;32m 816\u001b[0m replicate_ratio\u001b[38;5;241m=\u001b[39mreplicate_ratio,\n\u001b[1;32m 817\u001b[0m batch_ratio\u001b[38;5;241m=\u001b[39mbatch_ratio,\n\u001b[1;32m 818\u001b[0m high_replicate_ratio\u001b[38;5;241m=\u001b[39mhigh_replicate_ratio,\n\u001b[1;32m 819\u001b[0m high_batch_ratio\u001b[38;5;241m=\u001b[39mhigh_batch_ratio,\n\u001b[1;32m 820\u001b[0m technique\u001b[38;5;241m=\u001b[39mtechnique,\n\u001b[1;32m 821\u001b[0m cut_off\u001b[38;5;241m=\u001b[39mcutoff,\n\u001b[1;32m 822\u001b[0m )\n", - "File \u001b[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:671\u001b[0m, in \u001b[0;36m_save_rnaseq_tests\u001b[0;34m(context_name, rnaseq_matrix, metadata_df, gene_info_df, output_filepath, prep, taxon, replicate_ratio, batch_ratio, high_replicate_ratio, high_batch_ratio, technique, cut_off)\u001b[0m\n\u001b[1;32m 662\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Save the results of the RNA-Seq tests to a CSV file.\"\"\"\u001b[39;00m\n\u001b[1;32m 663\u001b[0m filtering_options \u001b[38;5;241m=\u001b[39m _FilteringOptions(\n\u001b[1;32m 664\u001b[0m replicate_ratio\u001b[38;5;241m=\u001b[39mreplicate_ratio,\n\u001b[1;32m 665\u001b[0m batch_ratio\u001b[38;5;241m=\u001b[39mbatch_ratio,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 668\u001b[0m high_batch_ratio\u001b[38;5;241m=\u001b[39mhigh_batch_ratio,\n\u001b[1;32m 669\u001b[0m )\n\u001b[0;32m--> 671\u001b[0m read_counts_results: _ReadMatrixResults \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m _build_matrix_results(\n\u001b[1;32m 672\u001b[0m matrix\u001b[38;5;241m=\u001b[39mrnaseq_matrix,\n\u001b[1;32m 673\u001b[0m gene_info\u001b[38;5;241m=\u001b[39mgene_info_df,\n\u001b[1;32m 674\u001b[0m metadata_df\u001b[38;5;241m=\u001b[39mmetadata_df,\n\u001b[1;32m 675\u001b[0m taxon\u001b[38;5;241m=\u001b[39mtaxon,\n\u001b[1;32m 676\u001b[0m )\n\u001b[1;32m 677\u001b[0m metrics \u001b[38;5;241m=\u001b[39m read_counts_results\u001b[38;5;241m.\u001b[39mmetrics\n\u001b[1;32m 678\u001b[0m entrez_gene_ids \u001b[38;5;241m=\u001b[39m read_counts_results\u001b[38;5;241m.\u001b[39mentrez_gene_ids\n", - "File \u001b[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:201\u001b[0m, in \u001b[0;36m_build_matrix_results\u001b[0;34m(matrix, gene_info, metadata_df, taxon)\u001b[0m\n\u001b[1;32m 199\u001b[0m gene_info \u001b[38;5;241m=\u001b[39m gene_info_migrations(gene_info)\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28mprint\u001b[39m(matrix)\n\u001b[0;32m--> 201\u001b[0m conversion \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m ensembl_to_gene_id_and_symbol(ids\u001b[38;5;241m=\u001b[39m\u001b[43mmatrix\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mensembl_gene_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mtolist(), taxon\u001b[38;5;241m=\u001b[39mtaxon)\n\u001b[1;32m 202\u001b[0m matrix \u001b[38;5;241m=\u001b[39m matrix\u001b[38;5;241m.\u001b[39mmerge(conversion, on\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mensembl_gene_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, how\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mleft\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 204\u001b[0m \u001b[38;5;66;03m# Only include Entrez and Ensembl Gene IDs that are present in `gene_info`\u001b[39;00m\n", - "File \u001b[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", - "File \u001b[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'ensembl_gene_id'" + "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[4], line 11\u001B[0m\n\u001B[1;32m 8\u001B[0m cutoff \u001B[38;5;241m=\u001B[39m \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m3\u001B[39m\n\u001B[1;32m 10\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i, context \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(context_names):\n\u001B[0;32m---> 11\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m rnaseq_gen( \u001B[38;5;66;03m# noqa\u001B[39;00m\n\u001B[1;32m 12\u001B[0m context_name\u001B[38;5;241m=\u001B[39mcontext,\n\u001B[1;32m 13\u001B[0m input_rnaseq_filepath\u001B[38;5;241m=\u001B[39mtrna_matrix_filepath[i],\n\u001B[1;32m 14\u001B[0m input_gene_info_filepath\u001B[38;5;241m=\u001B[39mgene_info_filepath[i],\n\u001B[1;32m 15\u001B[0m output_rnaseq_filepath\u001B[38;5;241m=\u001B[39mtrna_matrix_filepath[i],\n\u001B[1;32m 16\u001B[0m prep\u001B[38;5;241m=\u001B[39mRNAPrepMethod\u001B[38;5;241m.\u001B[39mTOTAL,\n\u001B[1;32m 17\u001B[0m taxon\u001B[38;5;241m=\u001B[39mtaxon_id,\n\u001B[1;32m 18\u001B[0m input_metadata_filepath\u001B[38;5;241m=\u001B[39mPath(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m./data/config_sheets/trna_config.xlsx\u001B[39m\u001B[38;5;124m\"\u001B[39m),\n\u001B[1;32m 19\u001B[0m replicate_ratio\u001B[38;5;241m=\u001B[39mreplicate_ratio,\n\u001B[1;32m 20\u001B[0m high_replicate_ratio\u001B[38;5;241m=\u001B[39mhigh_confidence_replicate_ratio,\n\u001B[1;32m 21\u001B[0m batch_ratio\u001B[38;5;241m=\u001B[39mbatch_ratio,\n\u001B[1;32m 22\u001B[0m high_batch_ratio\u001B[38;5;241m=\u001B[39mhigh_confidence_batch_ratio,\n\u001B[1;32m 23\u001B[0m technique\u001B[38;5;241m=\u001B[39mtechnique,\n\u001B[1;32m 24\u001B[0m cutoff\u001B[38;5;241m=\u001B[39mcutoff\n\u001B[1;32m 25\u001B[0m )\n", + "File \u001B[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:808\u001B[0m, in \u001B[0;36mrnaseq_gen\u001B[0;34m(context_name, input_rnaseq_filepath, input_gene_info_filepath, output_rnaseq_filepath, prep, taxon, input_metadata_filepath, input_metadata_df, replicate_ratio, high_replicate_ratio, batch_ratio, high_batch_ratio, technique, cutoff)\u001B[0m\n\u001B[1;32m 805\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mStarting \u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontext_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 806\u001B[0m output_rnaseq_filepath\u001B[38;5;241m.\u001B[39mparent\u001B[38;5;241m.\u001B[39mmkdir(parents\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m, exist_ok\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[0;32m--> 808\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m _save_rnaseq_tests(\n\u001B[1;32m 809\u001B[0m context_name\u001B[38;5;241m=\u001B[39mcontext_name,\n\u001B[1;32m 810\u001B[0m rnaseq_matrix\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mawait\u001B[39;00m _read_counts(input_rnaseq_filepath),\n\u001B[1;32m 811\u001B[0m metadata_df\u001B[38;5;241m=\u001B[39minput_metadata_df \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28;01mawait\u001B[39;00m _create_metadata_df(input_metadata_filepath),\n\u001B[1;32m 812\u001B[0m gene_info_df\u001B[38;5;241m=\u001B[39mpd\u001B[38;5;241m.\u001B[39mread_csv(input_gene_info_filepath),\n\u001B[1;32m 813\u001B[0m output_filepath\u001B[38;5;241m=\u001B[39moutput_rnaseq_filepath,\n\u001B[1;32m 814\u001B[0m prep\u001B[38;5;241m=\u001B[39mprep,\n\u001B[1;32m 815\u001B[0m taxon\u001B[38;5;241m=\u001B[39mtaxon,\n\u001B[1;32m 816\u001B[0m replicate_ratio\u001B[38;5;241m=\u001B[39mreplicate_ratio,\n\u001B[1;32m 817\u001B[0m batch_ratio\u001B[38;5;241m=\u001B[39mbatch_ratio,\n\u001B[1;32m 818\u001B[0m high_replicate_ratio\u001B[38;5;241m=\u001B[39mhigh_replicate_ratio,\n\u001B[1;32m 819\u001B[0m high_batch_ratio\u001B[38;5;241m=\u001B[39mhigh_batch_ratio,\n\u001B[1;32m 820\u001B[0m technique\u001B[38;5;241m=\u001B[39mtechnique,\n\u001B[1;32m 821\u001B[0m cut_off\u001B[38;5;241m=\u001B[39mcutoff,\n\u001B[1;32m 822\u001B[0m )\n", + "File \u001B[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:671\u001B[0m, in \u001B[0;36m_save_rnaseq_tests\u001B[0;34m(context_name, rnaseq_matrix, metadata_df, gene_info_df, output_filepath, prep, taxon, replicate_ratio, batch_ratio, high_replicate_ratio, high_batch_ratio, technique, cut_off)\u001B[0m\n\u001B[1;32m 662\u001B[0m \u001B[38;5;250m\u001B[39m\u001B[38;5;124;03m\"\"\"Save the results of the RNA-Seq tests to a CSV file.\"\"\"\u001B[39;00m\n\u001B[1;32m 663\u001B[0m filtering_options \u001B[38;5;241m=\u001B[39m _FilteringOptions(\n\u001B[1;32m 664\u001B[0m replicate_ratio\u001B[38;5;241m=\u001B[39mreplicate_ratio,\n\u001B[1;32m 665\u001B[0m batch_ratio\u001B[38;5;241m=\u001B[39mbatch_ratio,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 668\u001B[0m high_batch_ratio\u001B[38;5;241m=\u001B[39mhigh_batch_ratio,\n\u001B[1;32m 669\u001B[0m )\n\u001B[0;32m--> 671\u001B[0m read_counts_results: _ReadMatrixResults \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mawait\u001B[39;00m _build_matrix_results(\n\u001B[1;32m 672\u001B[0m matrix\u001B[38;5;241m=\u001B[39mrnaseq_matrix,\n\u001B[1;32m 673\u001B[0m gene_info\u001B[38;5;241m=\u001B[39mgene_info_df,\n\u001B[1;32m 674\u001B[0m metadata_df\u001B[38;5;241m=\u001B[39mmetadata_df,\n\u001B[1;32m 675\u001B[0m taxon\u001B[38;5;241m=\u001B[39mtaxon,\n\u001B[1;32m 676\u001B[0m )\n\u001B[1;32m 677\u001B[0m metrics \u001B[38;5;241m=\u001B[39m read_counts_results\u001B[38;5;241m.\u001B[39mmetrics\n\u001B[1;32m 678\u001B[0m entrez_gene_ids \u001B[38;5;241m=\u001B[39m read_counts_results\u001B[38;5;241m.\u001B[39mentrez_gene_ids\n", + "File \u001B[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:201\u001B[0m, in \u001B[0;36m_build_matrix_results\u001B[0;34m(matrix, gene_info, metadata_df, taxon)\u001B[0m\n\u001B[1;32m 199\u001B[0m gene_info \u001B[38;5;241m=\u001B[39m gene_info_migrations(gene_info)\n\u001B[1;32m 200\u001B[0m \u001B[38;5;28mprint\u001B[39m(matrix)\n\u001B[0;32m--> 201\u001B[0m conversion \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mawait\u001B[39;00m ensembl_to_gene_id_and_symbol(ids\u001B[38;5;241m=\u001B[39m\u001B[43mmatrix\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mensembl_gene_id\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[38;5;241m.\u001B[39mtolist(), taxon\u001B[38;5;241m=\u001B[39mtaxon)\n\u001B[1;32m 202\u001B[0m matrix \u001B[38;5;241m=\u001B[39m matrix\u001B[38;5;241m.\u001B[39mmerge(conversion, on\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mensembl_gene_id\u001B[39m\u001B[38;5;124m\"\u001B[39m, how\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mleft\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 204\u001B[0m \u001B[38;5;66;03m# Only include Entrez and Ensembl Gene IDs that are present in `gene_info`\u001B[39;00m\n", + "File \u001B[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 4100\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[1;32m 4101\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[0;32m-> 4102\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 4103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[1;32m 4104\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n", + "File \u001B[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3807\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[1;32m 3808\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[1;32m 3809\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[1;32m 3810\u001B[0m ):\n\u001B[1;32m 3811\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[0;32m-> 3812\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[1;32m 3813\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[1;32m 3814\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[1;32m 3815\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[1;32m 3816\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[1;32m 3817\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n", + "\u001B[0;31mKeyError\u001B[0m: 'ensembl_gene_id'" ] } ], From 3825aa259e4d3c71a648700c31dfa32962b5f45e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 15:48:06 -0600 Subject: [PATCH 40/62] refactor: move filtering technique to types --- main/como/rnaseq_gen.py | 27 +-------------------------- main/como/types.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 05f185f3..5d2e364e 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -27,7 +27,7 @@ from como.migrations import gene_info_migrations from como.project import Config -from como.types import RNAPrepMethod +from como.types import FilteringTechnique, RNAPrepMethod class _FilteringOptions(NamedTuple): @@ -38,31 +38,6 @@ class _FilteringOptions(NamedTuple): high_batch_ratio: float -class FilteringTechnique(Enum): - """RNA sequencing filtering capabilities.""" - - cpm = "cpm" - zfpkm = "zfpkm" - tpm = "quantile" - umi = "umi" - - @staticmethod - def from_string(value: str) -> FilteringTechnique: - """Create a filtering technique object from a string.""" - match value.lower(): - case "cpm": - return FilteringTechnique.cpm - case "zfpkm": - return FilteringTechnique.zfpkm - case "quantile": - return FilteringTechnique.tpm - case "umi": - return FilteringTechnique.umi - case _: - possible_values = [t.value for t in FilteringTechnique] - raise ValueError(f"Got a filtering technique of '{value}'; should be one of: {possible_values}") - - class LayoutMethod(Enum): """RNA sequencing layout method.""" diff --git a/main/como/types.py b/main/como/types.py index 42908252..916d3a25 100644 --- a/main/como/types.py +++ b/main/como/types.py @@ -46,5 +46,30 @@ def from_string(value: str) -> RNAPrepMethod: raise ValueError(f"Filtering technique must be one of {possible_values}; got: {value}") +class FilteringTechnique(Enum): + """RNA sequencing filtering capabilities.""" + + cpm = "cpm" + zfpkm = "zfpkm" + tpm = "quantile" + umi = "umi" + + @staticmethod + def from_string(value: str) -> FilteringTechnique: + """Create a filtering technique object from a string.""" + match value.lower(): + case "cpm": + return FilteringTechnique.cpm + case "zfpkm": + return FilteringTechnique.zfpkm + case "quantile": + return FilteringTechnique.tpm + case "umi": + return FilteringTechnique.umi + case _: + possible_values = [t.value for t in FilteringTechnique] + raise ValueError(f"Got a filtering technique of '{value}'; should be one of: {possible_values}") + + type_path = str | Path type_rna = Literal["total", "mrna"] From b0fa60eb3d80f1ebe994dad4c26043099aefaa59 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 15:48:17 -0600 Subject: [PATCH 41/62] fix: do not use more cores than necessary --- main/como/rnaseq_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 5d2e364e..8274a0d3 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -356,7 +356,7 @@ def zfpkm_transform( total = len(fpkm_df.columns) update_per_step: int = int(np.ceil(total * update_every_percent)) - cores = multiprocessing.cpu_count() - 2 + cores = min(multiprocessing.cpu_count() - 2, total) logger.debug(f"Processing {total:,} samples through zFPKM transform using {cores} cores") logger.debug( f"Will update every {update_per_step:,} steps as this is approximately " From e8d3fd5ce67365244e99407f17024aee217b69a7 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 15:49:25 -0600 Subject: [PATCH 42/62] fix: set cobra solver earlier --- main/como/create_context_specific_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 35fb251b..2628eac3 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -419,6 +419,7 @@ def _build_model( # noqa: C901 force excluded even if they meet GPR association requirements using the force exclude file. """ config = Config() + cobra.Configuration().solver = solver.lower() reference_model: cobra.Model match general_model_file.suffix: case ".mat": @@ -442,7 +443,6 @@ def _build_model( # noqa: C901 # set solver reference_model.solver = solver.lower() - cobra.Configuration().solver = solver.lower() # check number of unsolvable reactions for reference model under media assumptions # incon_rxns, cobra_model = _feasibility_test(cobra_model, "before_seeding") From 322f948ce9f387d1c3f8dc0ecf12e46c2f0d1982 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Tue, 10 Dec 2024 16:26:18 -0600 Subject: [PATCH 43/62] refactor: remove troppo and cobamp for CC testing Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 7 ++++++- pyproject.toml | 8 ++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 2628eac3..bb7324ee 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -419,7 +419,12 @@ def _build_model( # noqa: C901 force excluded even if they meet GPR association requirements using the force exclude file. """ config = Config() - cobra.Configuration().solver = solver.lower() + + cobra_config = cobra.Configuration() + print(cobra_config.solver) + cobra_config.solver = solver.lower() + print(cobra_config.solver) + reference_model: cobra.Model match general_model_file.suffix: case ".mat": diff --git a/pyproject.toml b/pyproject.toml index 2c6d42c5..f10dc4fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "openpyxl>=3.1.5", "aiofiles>=24.1.0", "aioftp>=0.23.1", - "troppo", - "cobamp", +# "troppo", +# "cobamp", ] [build-system] @@ -48,5 +48,5 @@ dev-dependencies = [ ] [tool.uv.sources] -troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "update_dependencies" } -cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } +#troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "update_dependencies" } +#cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } From 6cc2a7b1f6e89d160b019f41ed1e5340cefc5b78 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 09:10:29 -0600 Subject: [PATCH 44/62] chore: bump version --- main/como/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/__init__.py b/main/como/__init__.py index 05100d09..f92ebf4b 100644 --- a/main/como/__init__.py +++ b/main/como/__init__.py @@ -4,7 +4,7 @@ from como.utils import stringlist_to_list __all__ = ["stringlist_to_list", "Config"] -__version__ = "1.10.0" +__version__ = "1.11.0" def return_placeholder_data() -> pd.DataFrame: From 4a331802b27f76e3ecaa5cbf65cfd8f31b124ebe Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 09:10:29 -0600 Subject: [PATCH 45/62] chore: bump version --- main/como/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/__init__.py b/main/como/__init__.py index 05100d09..f92ebf4b 100644 --- a/main/como/__init__.py +++ b/main/como/__init__.py @@ -4,7 +4,7 @@ from como.utils import stringlist_to_list __all__ = ["stringlist_to_list", "Config"] -__version__ = "1.10.0" +__version__ = "1.11.0" def return_placeholder_data() -> pd.DataFrame: From 3e495f6e6edd137e56dbd44640a3e22e15ad3b62 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 09:12:28 -0600 Subject: [PATCH 46/62] revert: require troppo and cobamp This requires troppo and cobamp but does not yet set their version Signed-off-by: Josh Loecker --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f10dc4fe..67cb76f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "openpyxl>=3.1.5", "aiofiles>=24.1.0", "aioftp>=0.23.1", -# "troppo", -# "cobamp", + "troppo", + "cobamp", ] [build-system] @@ -47,6 +47,6 @@ dev-dependencies = [ "pytest-cov>=6.0.0", ] -[tool.uv.sources] +#[tool.uv.sources] #troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "update_dependencies" } #cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } From 74d55c9e52dc261ebc13091c0f2517d97dc0ad45 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 11:57:55 -0600 Subject: [PATCH 47/62] revert: install troppo and cobamp from modified repo Signed-off-by: Josh Loecker --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 67cb76f6..2c6d42c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,6 @@ dev-dependencies = [ "pytest-cov>=6.0.0", ] -#[tool.uv.sources] -#troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "update_dependencies" } -#cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } +[tool.uv.sources] +troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "update_dependencies" } +cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } From b2a6e0f21e38e6aa86233bdf54beb84eb85fafee Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 11:58:22 -0600 Subject: [PATCH 48/62] feat: set troppo solver This also sets the downstream sovler in cobamp Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index bb7324ee..970f82da 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -308,9 +308,16 @@ def _build_with_imat( expr_vector: npt.NDArray, expr_thesh: tuple[float, float], force_gene_ids: Sequence[int], + solver: str, ) -> (cobra.Model, pd.DataFrame): expr_vector = np.array(expr_vector) - properties = IMATProperties(exp_vector=expr_vector, exp_thresholds=expr_thesh, core=force_gene_ids, epsilon=0.01) + properties = IMATProperties( + exp_vector=expr_vector, + exp_thresholds=expr_thesh, + core=force_gene_ids, + epsilon=0.01, + solver=solver, + ) algorithm = IMAT(s_matrix, np.array(lb), np.array(ub), properties) context_rxns: npt.NDArray = algorithm.run() fluxes: pd.Series = algorithm.sol.to_series() From baca9e769d4113c10928f91144ffcdc57ddee606 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 12:02:15 -0600 Subject: [PATCH 49/62] fix: provide solver to imat build function Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 970f82da..9abf0d3f 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -520,7 +520,14 @@ def _build_model( # noqa: C901 elif recon_algorithm == Algorithm.IMAT: context_model_cobra: cobra.Model context_model_cobra, flux_df = _build_with_imat( - reference_model, s_matrix, lb, ub, expr_vector, exp_thresh, idx_force + reference_model, + s_matrix, + lb, + ub, + expr_vector, + exp_thresh, + idx_force, + solver=solver, ) imat_reactions = flux_df.rxn model_reactions = [reaction.id for reaction in context_model_cobra.reactions] From bb2611c5c85a7c597e4de15ea02f3eebfea6cd86 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 12:05:34 -0600 Subject: [PATCH 50/62] fix: temporarily install troppo from develop Signed-off-by: Josh Loecker --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2c6d42c5..465173e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,5 +48,5 @@ dev-dependencies = [ ] [tool.uv.sources] -troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "update_dependencies" } +troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "feat/allow-setting-solver" } cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } From cced1b6f34f9f2c862a4e23dc9c7549412f00687 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 12:11:26 -0600 Subject: [PATCH 51/62] fix: install from branch Signed-off-by: Josh Loecker --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 465173e6..b1c10adf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,5 +48,5 @@ dev-dependencies = [ ] [tool.uv.sources] -troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "feat/allow-setting-solver" } -cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } +troppo = { git = "https://github.com/JoshLoecker/troppo", branch = "feat/allow-setting-solver" } +cobamp = { git = "https://github.com/JoshLoecker/cobamp", branch = "update_packages" } From 2e64db496d6ae46c6aa613d2bd0d79fc37b09938 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 12:16:16 -0600 Subject: [PATCH 52/62] refactor: install troppo/cobamp from github Signed-off-by: Josh Loecker --- pyproject.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b1c10adf..e4535908 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "openpyxl>=3.1.5", "aiofiles>=24.1.0", "aioftp>=0.23.1", - "troppo", - "cobamp", + "git+https://github.com/JoshLoecker/troppo@feat/allow-setting-solver", + "git+https://github.com/JoshLoecker/cobamp@update_packages", ] [build-system] @@ -47,6 +47,6 @@ dev-dependencies = [ "pytest-cov>=6.0.0", ] -[tool.uv.sources] -troppo = { git = "https://github.com/JoshLoecker/troppo", branch = "feat/allow-setting-solver" } -cobamp = { git = "https://github.com/JoshLoecker/cobamp", branch = "update_packages" } +#[tool.uv.sources] +#troppo = { git = "https://github.com/JoshLoecker/troppo", branch = "feat/allow-setting-solver" } +#cobamp = { git = "https://github.com/JoshLoecker/cobamp", branch = "update_packages" } From 1c6743b2c7b67c49141cf01593151915a124e3eb Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 13:03:48 -0600 Subject: [PATCH 53/62] fix: use proper installation for troppo/cobamp Signed-off-by: Josh Loecker --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e4535908..58bc6129 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "openpyxl>=3.1.5", "aiofiles>=24.1.0", "aioftp>=0.23.1", - "git+https://github.com/JoshLoecker/troppo@feat/allow-setting-solver", - "git+https://github.com/JoshLoecker/cobamp@update_packages", + "troppo@git+https://github.com/JoshLoecker/troppo@feat/allow-setting-solver", + "cobamp@git+https://github.com/JoshLoecker/cobamp@update_packages", ] [build-system] From 0a0ac4d51127398dbd8c6326becbce1d9ad1620d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 13:04:54 -0600 Subject: [PATCH 54/62] fix: use correct branch for cobamp Signed-off-by: Josh Loecker --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 58bc6129..2dd7d11c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "aiofiles>=24.1.0", "aioftp>=0.23.1", "troppo@git+https://github.com/JoshLoecker/troppo@feat/allow-setting-solver", - "cobamp@git+https://github.com/JoshLoecker/cobamp@update_packages", + "cobamp@git+https://github.com/JoshLoecker/cobamp@master", ] [build-system] From 091f4bc6a02e7424cf85c38620848d503a3d34d9 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 13:56:51 -0600 Subject: [PATCH 55/62] chore: reset troppo branch Signed-off-by: Josh Loecker --- pyproject.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2dd7d11c..2f4fdb2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "openpyxl>=3.1.5", "aiofiles>=24.1.0", "aioftp>=0.23.1", - "troppo@git+https://github.com/JoshLoecker/troppo@feat/allow-setting-solver", + "troppo@git+https://github.com/JoshLoecker/troppo@master", "cobamp@git+https://github.com/JoshLoecker/cobamp@master", ] @@ -46,7 +46,3 @@ dev-dependencies = [ "hypothesis>=6.122.1", "pytest-cov>=6.0.0", ] - -#[tool.uv.sources] -#troppo = { git = "https://github.com/JoshLoecker/troppo", branch = "feat/allow-setting-solver" } -#cobamp = { git = "https://github.com/JoshLoecker/cobamp", branch = "update_packages" } From 5417157ef5fa4fd846c658ead7e8467cfb9e15f7 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 13:59:52 -0600 Subject: [PATCH 56/62] fix: solver must be uppercase Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 9abf0d3f..fc6ce6be 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -316,7 +316,7 @@ def _build_with_imat( exp_thresholds=expr_thesh, core=force_gene_ids, epsilon=0.01, - solver=solver, + solver=solver.upper(), ) algorithm = IMAT(s_matrix, np.array(lb), np.array(ub), properties) context_rxns: npt.NDArray = algorithm.run() From 157f2e513ce682b4c79c126e52d735d2baf0553d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 14:04:30 -0600 Subject: [PATCH 57/62] chore: remove print Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index fc6ce6be..3d280d4e 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -316,7 +316,8 @@ def _build_with_imat( exp_thresholds=expr_thesh, core=force_gene_ids, epsilon=0.01, - solver=solver.upper(), + solver=solver.upper()z + , ) algorithm = IMAT(s_matrix, np.array(lb), np.array(ub), properties) context_rxns: npt.NDArray = algorithm.run() @@ -426,12 +427,6 @@ def _build_model( # noqa: C901 force excluded even if they meet GPR association requirements using the force exclude file. """ config = Config() - - cobra_config = cobra.Configuration() - print(cobra_config.solver) - cobra_config.solver = solver.lower() - print(cobra_config.solver) - reference_model: cobra.Model match general_model_file.suffix: case ".mat": From 4fa324c5208501cf765830372d2d39aca2c700c5 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 14:09:36 -0600 Subject: [PATCH 58/62] chore: fix typo Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 3d280d4e..f6ac8b91 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -316,8 +316,7 @@ def _build_with_imat( exp_thresholds=expr_thesh, core=force_gene_ids, epsilon=0.01, - solver=solver.upper()z - , + solver=solver.upper(), ) algorithm = IMAT(s_matrix, np.array(lb), np.array(ub), properties) context_rxns: npt.NDArray = algorithm.run() From 6c08b7bdb3f2c5e0ad810c6aa6f97612526353a3 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 14:14:58 -0600 Subject: [PATCH 59/62] chore: bump version Signed-off-by: Josh Loecker --- main/como/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/__init__.py b/main/como/__init__.py index f92ebf4b..c6ee72e2 100644 --- a/main/como/__init__.py +++ b/main/como/__init__.py @@ -4,7 +4,7 @@ from como.utils import stringlist_to_list __all__ = ["stringlist_to_list", "Config"] -__version__ = "1.11.0" +__version__ = "1.11.1" def return_placeholder_data() -> pd.DataFrame: From 6bd06e1875e8c37747bb8adf4eddfbb45608c240 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 14:37:58 -0600 Subject: [PATCH 60/62] fix: use parenthesis to validate calculations Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 8274a0d3..364bd4c3 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -505,7 +505,7 @@ def cpm_filter( cutoff = ( 10e6 / (np.median(np.sum(counts[:, i]))) if cut_off == "default" - else 1e6 * cut_off / np.median(np.sum(counts[:, i])) + else (1e6 * cut_off) / np.median(np.sum(counts[:, i])) ) test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) @@ -637,7 +637,7 @@ async def _save_rnaseq_tests( filtering_options = _FilteringOptions( replicate_ratio=replicate_ratio, batch_ratio=batch_ratio, - cut_off=cut_off, + cut_off=float(cut_off), high_replicate_ratio=high_replicate_ratio, high_batch_ratio=high_batch_ratio, ) @@ -725,9 +725,14 @@ async def rnaseq_gen( # noqa: C901, allow complex function then study/batch numbers are checked for consensus according to batch ratios. The zFPKM method is outlined here: https://pubmed.ncbi.nlm.nih.gov/24215113/ - :param metadata_filepath: The configuration filename to read + :param context_name: The name of the context being processed + :param input_rnaseq_filepath: The filepath to the gene count matrix + :param input_gene_info_filepath: The filepath to the gene info file + :param output_rnaseq_filepath: The filepath to write the output gene count matrix :param prep: The preparation method :param taxon: The NCBI Taxon ID + :param input_metadata_filepath: The filepath to the metadata file + :param input_metadata_df: The metadata dataframe :param replicate_ratio: The percentage of replicates that a gene must appear in for a gene to be marked as "active" in a batch/study :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" @@ -759,7 +764,7 @@ async def rnaseq_gen( # noqa: C901, allow complex function cutoff = "default" case FilteringTechnique.zfpkm: - cutoff = "default" if cutoff else cutoff + cutoff = cutoff or -3 case FilteringTechnique.umi: pass case _: From f24347d14e00f6817114ef049ab6b5a09cd659b4 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 14:41:54 -0600 Subject: [PATCH 61/62] refactor: ignore missing variables for now Signed-off-by: Josh Loecker --- main/COMO.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main/COMO.ipynb b/main/COMO.ipynb index 52d7fb88..bbaeb152 100644 --- a/main/COMO.ipynb +++ b/main/COMO.ipynb @@ -442,7 +442,7 @@ } ], "source": [ - "from como.rnaseq_gen import rnaseq_gen, FilteringTechnique\n", + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", "\n", "replicate_ratio = 0.75\n", "high_confidence_replicate_ratio = 1.0\n", @@ -452,7 +452,7 @@ "cutoff = -3\n", "\n", "for i, context in enumerate(context_names):\n", - " await rnaseq_gen( # noqa\n", + " await rnaseq_gen(\n", " context_name=context,\n", " input_rnaseq_filepath=trna_matrix_filepath[i],\n", " input_gene_info_filepath=gene_info_filepath[i],\n", @@ -494,7 +494,7 @@ "metadata": {}, "outputs": [], "source": [ - "from como.rnaseq_gen import rnaseq_gen, FilteringTechnique\n", + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", "\n", "replicate_ratio = 0.75\n", "high_confidence_replicate_ratio = 1.0\n", @@ -546,7 +546,7 @@ "metadata": {}, "outputs": [], "source": [ - "from como.rnaseq_gen import rnaseq_gen, FilteringTechnique\n", + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", "\n", "replicate_ratio = 0.75\n", "high_confidence_replicate_ratio = 1.0\n", @@ -744,8 +744,8 @@ " [\n", " \"python3\", \"como/merge_xomics.py\",\n", " \"--merge-zfpkm-distribution\",\n", - " \"--total-rnaseq-config-file\", trnaseq_config_file,\n", - " \"--mrnaseq-config-file\", mrnaseq_config_file,\n", + " \"--total-rnaseq-config-file\", trnaseq_config_file, # noqa: F821\n", + " \"--mrnaseq-config-file\", mrnaseq_config_file, # noqa: F821\n", " # \"--scrnaseq-config-file\", scrnaseq_config_file, # If using single-cell data, uncomment the start of this line\n", " # \"--proteomics-config-file\", proteomics_config_file, # If using proteomics data, uncomment the start of this line\n", " \"--requirement-adjust\", requirement_adjust,\n", From 1956721275442f45eae829d592671810647f0832 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 11 Dec 2024 14:43:14 -0600 Subject: [PATCH 62/62] chore: ignore complex function Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 5c2d8125..02702b38 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -278,7 +278,7 @@ async def _write_counts_matrix( return final_matrix -async def _create_config_df( +async def _create_config_df( # noqa: C901 context_name: str, /, como_context_dir: Path,