diff --git a/main/COMO.ipynb b/main/COMO.ipynb index d361f9b2..bbaeb152 100644 --- a/main/COMO.ipynb +++ b/main/COMO.ipynb @@ -2,7 +2,11 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, "source": [ "# COMO: Constraint-based Optomization of Metabolic Objectives\n", "\n", @@ -56,7 +60,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, "source": [ "# Step 1: Data Preprocessing and Analysis\n", "\n", @@ -223,6 +231,35 @@ "" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-09T22:10:43.421233Z", + "start_time": "2024-12-09T22:10:43.418100Z" + } + }, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from como.rnaseq_preprocess import rnaseq_preprocess\n", + "from como.types import RNAPrepMethod\n", + "\n", + "taxon_id = 9606\n", + "context_names = [\"naiveB\"]\n", + "gene_info_filepath = [Path(f\"data/results/{context}/gene_info.csv\") for context in context_names]\n", + "como_context_dir = [Path(f\"data/COMO_input/{context}\") for context in context_names]\n", + "trna_matrix_filepath = [Path(f\"data/results/{context}/total-rna/totalrna_{context}.csv\") for context in context_names]\n", + "polya_matrix_filepath = [Path(f\"data/results/{context}/polya-rna/polyarna_{context}.csv\") for context in context_names]\n", + "\n", + "# No single-cell data is provided by default; COMO accepts single-cell data in CSV or h5ad format\n", + "# If you are using single-cell data, adjust the following lines accordingly\n", + "scrna_matrix_filepath = [Path(f\"data/results/{context}/scrna/scrna_{context}.csv\") for context in context_names]\n", + "# scrna_matrix_filepath = [Path(f\"data/results/{context}/scrna/scrna_{context}.h5ad\") for context in context_names]\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -236,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2024-12-07T03:30:27.253112Z", @@ -248,50 +285,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[32m2024-12-06 23:12:10\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m629\u001B[0m - \u001B[1mTEST\u001B[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[32m2024-12-06 23:12:11\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'total' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001B[0m\n", - "\u001B[32m2024-12-06 23:12:11\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'polya' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/polya-rna/polyarna_naiveB.csv'\u001B[0m\n", - "\u001B[32m2024-12-06 23:12:11\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m451\u001B[0m - \u001B[1mFetching gene info (this may take 1-5 minutes)\u001B[0m\n", - "\u001B[32m2024-12-06 23:13:04\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m488\u001B[0m - \u001B[32m\u001B[1mGene Info file written at '/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info.csv'\u001B[0m\n" + "\u001B[32m2024-12-09 16:23:55\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'polya' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/polya-rna/polyarna_naiveB.csv'\u001B[0m\n", + "\u001B[32m2024-12-09 16:23:55\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m274\u001B[0m - \u001B[32m\u001B[1mWrote gene count matrix for 'total' RNA at '/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001B[0m\n", + "\u001B[32m2024-12-09 16:23:55\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m451\u001B[0m - \u001B[1mFetching gene info (this may take 1-5 minutes)\u001B[0m\n", + "\u001B[32m2024-12-09 16:24:13\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mcomo.rnaseq_preprocess\u001B[0m:\u001B[36m488\u001B[0m - \u001B[32m\u001B[1mGene Info file written at '/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info.csv'\u001B[0m\n" ] } ], "source": [ - "from pathlib import Path\n", - "\n", - "from como.rnaseq_preprocess import rnaseq_preprocess\n", - "from como.types import RNAPrepMethod\n", - "\n", - "context_names = [\"naiveB\"]\n", - "output_gene_info_filepaths = [Path(f\"data/results/{context}/gene_info.csv\") for context in context_names]\n", - "como_context_dirs = [Path(f\"data/COMO_input/{context}\") for context in context_names]\n", - "output_trna_filepaths = [Path(f\"data/results/{context}/total-rna/totalrna_{context}.csv\") for context in context_names]\n", - "output_polya_filepaths = [Path(f\"data/results/{context}/polya-rna/polyarna_{context}.csv\") for context in context_names]\n", - "\n", - "\n", "for i in range(len(context_names)):\n", " await rnaseq_preprocess(\n", " context_name=context_names[i],\n", - " taxon=9606,\n", - " output_gene_info_filepath=output_gene_info_filepaths[i],\n", - " como_context_dir=como_context_dirs[i],\n", + " taxon=taxon_id,\n", + " output_gene_info_filepath=gene_info_filepath[i],\n", + " como_context_dir=como_context_dir[i],\n", " output_trna_config_filepath=Path(\"./data/config_sheets/trna_config.xlsx\"),\n", - " output_trna_count_matrix_filepath=output_trna_filepaths[i],\n", - " output_polya_config_filepath=Path(\"./data/config_sheets/mrna_config.xlsx\"),\n", - " output_polya_count_matrix_filepath=output_polya_filepaths[i],\n", + " output_trna_count_matrix_filepath=trna_matrix_filepath[i],\n", + " output_mrna_config_filepath=Path(\"./data/config_sheets/polya_config.xlsx\"),\n", + " output_mrna_count_matrix_filepath=polya_matrix_filepath[i],\n", " cache=True,\n", " log_level=\"INFO\",\n", " )" @@ -369,39 +380,93 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-09T22:13:42.060657Z", + "start_time": "2024-12-09T22:13:41.740347Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2024-12-09 16:18:43.958\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mcomo.rnaseq_gen\u001B[0m:\u001B[36mrnaseq_gen\u001B[0m:\u001B[36m805\u001B[0m - \u001B[34m\u001B[1mStarting 'naiveB'\u001B[0m\n", + "\u001B[32m2024-12-09 16:18:43.961\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mcomo.rnaseq_gen\u001B[0m:\u001B[36m_read_counts\u001B[0m:\u001B[36m175\u001B[0m - \u001B[34m\u001B[1mReading CSV file at 'data/results/naiveB/total-rna/totalrna_naiveB.csv'\u001B[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " entrez_gene_id expressed high\n", + "0 7105 0 0\n", + "1 64102 0 0\n", + "2 8813 0 0\n", + "3 57147 0 0\n", + "4 55732 0 0\n", + "... ... ... ...\n", + "34396 124901321 0 0\n", + "34397 124902403 0 0\n", + "34398 101929614 0 0\n", + "34399 107984888 0 0\n", + "34400 124900697 0 0\n", + "\n", + "[34401 rows x 3 columns]\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'ensembl_gene_id'", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", + "File \u001B[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3804\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m-> 3805\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3806\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n", + "File \u001B[0;32mindex.pyx:167\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n", + "File \u001B[0;32mindex.pyx:196\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n", + "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n", + "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n", + "\u001B[0;31mKeyError\u001B[0m: 'ensembl_gene_id'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[4], line 11\u001B[0m\n\u001B[1;32m 8\u001B[0m cutoff \u001B[38;5;241m=\u001B[39m \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m3\u001B[39m\n\u001B[1;32m 10\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i, context \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(context_names):\n\u001B[0;32m---> 11\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m rnaseq_gen( \u001B[38;5;66;03m# noqa\u001B[39;00m\n\u001B[1;32m 12\u001B[0m context_name\u001B[38;5;241m=\u001B[39mcontext,\n\u001B[1;32m 13\u001B[0m input_rnaseq_filepath\u001B[38;5;241m=\u001B[39mtrna_matrix_filepath[i],\n\u001B[1;32m 14\u001B[0m input_gene_info_filepath\u001B[38;5;241m=\u001B[39mgene_info_filepath[i],\n\u001B[1;32m 15\u001B[0m output_rnaseq_filepath\u001B[38;5;241m=\u001B[39mtrna_matrix_filepath[i],\n\u001B[1;32m 16\u001B[0m prep\u001B[38;5;241m=\u001B[39mRNAPrepMethod\u001B[38;5;241m.\u001B[39mTOTAL,\n\u001B[1;32m 17\u001B[0m taxon\u001B[38;5;241m=\u001B[39mtaxon_id,\n\u001B[1;32m 18\u001B[0m input_metadata_filepath\u001B[38;5;241m=\u001B[39mPath(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m./data/config_sheets/trna_config.xlsx\u001B[39m\u001B[38;5;124m\"\u001B[39m),\n\u001B[1;32m 19\u001B[0m replicate_ratio\u001B[38;5;241m=\u001B[39mreplicate_ratio,\n\u001B[1;32m 20\u001B[0m high_replicate_ratio\u001B[38;5;241m=\u001B[39mhigh_confidence_replicate_ratio,\n\u001B[1;32m 21\u001B[0m batch_ratio\u001B[38;5;241m=\u001B[39mbatch_ratio,\n\u001B[1;32m 22\u001B[0m high_batch_ratio\u001B[38;5;241m=\u001B[39mhigh_confidence_batch_ratio,\n\u001B[1;32m 23\u001B[0m technique\u001B[38;5;241m=\u001B[39mtechnique,\n\u001B[1;32m 24\u001B[0m cutoff\u001B[38;5;241m=\u001B[39mcutoff\n\u001B[1;32m 25\u001B[0m )\n", + "File \u001B[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:808\u001B[0m, in \u001B[0;36mrnaseq_gen\u001B[0;34m(context_name, input_rnaseq_filepath, input_gene_info_filepath, output_rnaseq_filepath, prep, taxon, input_metadata_filepath, input_metadata_df, replicate_ratio, high_replicate_ratio, batch_ratio, high_batch_ratio, technique, cutoff)\u001B[0m\n\u001B[1;32m 805\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mStarting \u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontext_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 806\u001B[0m output_rnaseq_filepath\u001B[38;5;241m.\u001B[39mparent\u001B[38;5;241m.\u001B[39mmkdir(parents\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m, exist_ok\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[0;32m--> 808\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m _save_rnaseq_tests(\n\u001B[1;32m 809\u001B[0m context_name\u001B[38;5;241m=\u001B[39mcontext_name,\n\u001B[1;32m 810\u001B[0m rnaseq_matrix\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mawait\u001B[39;00m _read_counts(input_rnaseq_filepath),\n\u001B[1;32m 811\u001B[0m metadata_df\u001B[38;5;241m=\u001B[39minput_metadata_df \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28;01mawait\u001B[39;00m _create_metadata_df(input_metadata_filepath),\n\u001B[1;32m 812\u001B[0m gene_info_df\u001B[38;5;241m=\u001B[39mpd\u001B[38;5;241m.\u001B[39mread_csv(input_gene_info_filepath),\n\u001B[1;32m 813\u001B[0m output_filepath\u001B[38;5;241m=\u001B[39moutput_rnaseq_filepath,\n\u001B[1;32m 814\u001B[0m prep\u001B[38;5;241m=\u001B[39mprep,\n\u001B[1;32m 815\u001B[0m taxon\u001B[38;5;241m=\u001B[39mtaxon,\n\u001B[1;32m 816\u001B[0m replicate_ratio\u001B[38;5;241m=\u001B[39mreplicate_ratio,\n\u001B[1;32m 817\u001B[0m batch_ratio\u001B[38;5;241m=\u001B[39mbatch_ratio,\n\u001B[1;32m 818\u001B[0m high_replicate_ratio\u001B[38;5;241m=\u001B[39mhigh_replicate_ratio,\n\u001B[1;32m 819\u001B[0m high_batch_ratio\u001B[38;5;241m=\u001B[39mhigh_batch_ratio,\n\u001B[1;32m 820\u001B[0m technique\u001B[38;5;241m=\u001B[39mtechnique,\n\u001B[1;32m 821\u001B[0m cut_off\u001B[38;5;241m=\u001B[39mcutoff,\n\u001B[1;32m 822\u001B[0m )\n", + "File \u001B[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:671\u001B[0m, in \u001B[0;36m_save_rnaseq_tests\u001B[0;34m(context_name, rnaseq_matrix, metadata_df, gene_info_df, output_filepath, prep, taxon, replicate_ratio, batch_ratio, high_replicate_ratio, high_batch_ratio, technique, cut_off)\u001B[0m\n\u001B[1;32m 662\u001B[0m \u001B[38;5;250m\u001B[39m\u001B[38;5;124;03m\"\"\"Save the results of the RNA-Seq tests to a CSV file.\"\"\"\u001B[39;00m\n\u001B[1;32m 663\u001B[0m filtering_options \u001B[38;5;241m=\u001B[39m _FilteringOptions(\n\u001B[1;32m 664\u001B[0m replicate_ratio\u001B[38;5;241m=\u001B[39mreplicate_ratio,\n\u001B[1;32m 665\u001B[0m batch_ratio\u001B[38;5;241m=\u001B[39mbatch_ratio,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 668\u001B[0m high_batch_ratio\u001B[38;5;241m=\u001B[39mhigh_batch_ratio,\n\u001B[1;32m 669\u001B[0m )\n\u001B[0;32m--> 671\u001B[0m read_counts_results: _ReadMatrixResults \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mawait\u001B[39;00m _build_matrix_results(\n\u001B[1;32m 672\u001B[0m matrix\u001B[38;5;241m=\u001B[39mrnaseq_matrix,\n\u001B[1;32m 673\u001B[0m gene_info\u001B[38;5;241m=\u001B[39mgene_info_df,\n\u001B[1;32m 674\u001B[0m metadata_df\u001B[38;5;241m=\u001B[39mmetadata_df,\n\u001B[1;32m 675\u001B[0m taxon\u001B[38;5;241m=\u001B[39mtaxon,\n\u001B[1;32m 676\u001B[0m )\n\u001B[1;32m 677\u001B[0m metrics \u001B[38;5;241m=\u001B[39m read_counts_results\u001B[38;5;241m.\u001B[39mmetrics\n\u001B[1;32m 678\u001B[0m entrez_gene_ids \u001B[38;5;241m=\u001B[39m read_counts_results\u001B[38;5;241m.\u001B[39mentrez_gene_ids\n", + "File \u001B[0;32m~/Projects/COMO/main/como/rnaseq_gen.py:201\u001B[0m, in \u001B[0;36m_build_matrix_results\u001B[0;34m(matrix, gene_info, metadata_df, taxon)\u001B[0m\n\u001B[1;32m 199\u001B[0m gene_info \u001B[38;5;241m=\u001B[39m gene_info_migrations(gene_info)\n\u001B[1;32m 200\u001B[0m \u001B[38;5;28mprint\u001B[39m(matrix)\n\u001B[0;32m--> 201\u001B[0m conversion \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mawait\u001B[39;00m ensembl_to_gene_id_and_symbol(ids\u001B[38;5;241m=\u001B[39m\u001B[43mmatrix\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mensembl_gene_id\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[38;5;241m.\u001B[39mtolist(), taxon\u001B[38;5;241m=\u001B[39mtaxon)\n\u001B[1;32m 202\u001B[0m matrix \u001B[38;5;241m=\u001B[39m matrix\u001B[38;5;241m.\u001B[39mmerge(conversion, on\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mensembl_gene_id\u001B[39m\u001B[38;5;124m\"\u001B[39m, how\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mleft\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 204\u001B[0m \u001B[38;5;66;03m# Only include Entrez and Ensembl Gene IDs that are present in `gene_info`\u001B[39;00m\n", + "File \u001B[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 4100\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[1;32m 4101\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[0;32m-> 4102\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 4103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[1;32m 4104\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n", + "File \u001B[0;32m~/Projects/COMO/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3807\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[1;32m 3808\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[1;32m 3809\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[1;32m 3810\u001B[0m ):\n\u001B[1;32m 3811\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[0;32m-> 3812\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[1;32m 3813\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[1;32m 3814\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[1;32m 3815\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[1;32m 3816\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[1;32m 3817\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n", + "\u001B[0;31mKeyError\u001B[0m: 'ensembl_gene_id'" + ] + } + ], "source": [ - "# step 2.2 RNA-seq Analysis for Total RNA-seq library preparation\n", + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", "\n", - "trnaseq_config_file = \"trnaseq_data_inputs_auto.xlsx\"\n", - "rep_ratio = 0.75\n", - "group_ratio = 0.75\n", - "rep_ratio_h = 1.0\n", - "group_ratio_h = 1.0\n", - "technique = \"zFPKM\"\n", - "minimum_cutoff = -3\n", - "taxon_id = \"human\"\n", - "\n", - "# fmt: off\n", - "cmd = \" \".join(\n", - " [\n", - " \"python3\", \"como/rnaseq_gen.py\",\n", - " \"--config-file\", trnaseq_config_file,\n", - " \"--replicate-ratio\", str(rep_ratio),\n", - " \"--batch-ratio\", str(group_ratio),\n", - " \"--high-replicate-ratio\", str(rep_ratio_h),\n", - " \"--high-batch-ratio\", str(group_ratio_h),\n", - " \"--minimum-cutoff\", str(minimum_cutoff),\n", - " \"--filt-technique\", f\"{technique}\",\n", - " \"--library-prep\", \"total\",\n", - " \"--taxon-id\", taxon_id\n", - " ]\n", - ")\n", - "# fmt: on\n", - "\n", - "!{cmd}" + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.zfpkm\n", + "cutoff = -3\n", + "\n", + "for i, context in enumerate(context_names):\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=trna_matrix_filepath[i],\n", + " input_gene_info_filepath=gene_info_filepath[i],\n", + " output_rnaseq_filepath=trna_matrix_filepath[i],\n", + " prep=RNAPrepMethod.TOTAL,\n", + " taxon=taxon_id,\n", + " input_metadata_filepath=Path(\"./data/config_sheets/trna_config.xlsx\"),\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" ] }, { @@ -429,33 +494,31 @@ "metadata": {}, "outputs": [], "source": [ - "mrnaseq_config_file = \"mrnaseq_data_inputs_auto.xlsx\"\n", - "rep_ratio = 0.75\n", - "group_ratio = 0.75\n", - "rep_ratio_h = 1.0\n", - "group_ratio_h = 1.0\n", - "technique = \"zfpkm\"\n", - "minimum_cutoff = -3\n", - "taxon_id = \"human\"\n", - "\n", - "# fmt: off\n", - "cmd = \" \".join(\n", - " [\n", - " \"python3\", \"como/rnaseq_gen.py\",\n", - " \"--config-file\", mrnaseq_config_file,\n", - " \"--replicate-ratio\", str(rep_ratio),\n", - " \"--batch-ratio\", str(group_ratio),\n", - " \"--high-replicate-ratio\", str(rep_ratio_h),\n", - " \"--high-batch-ratio\", str(group_ratio_h),\n", - " \"--minimum-cutoff\", str(minimum_cutoff),\n", - " \"--filt-technique\", f\"{technique}\",\n", - " \"--library-prep\", \"mrna\",\n", - " \"--taxon-id\", taxon_id\n", - " ]\n", - ")\n", - "# fmt: on\n", + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", "\n", - "!{cmd}" + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.zfpkm\n", + "cutoff = -3\n", + "\n", + "for i, context in enumerate(context_names):\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=polya_matrix_filepath[i],\n", + " input_gene_info_filepath=gene_info_filepath[i],\n", + " output_rnaseq_filepath=polya_matrix_filepath[i],\n", + " prep=RNAPrepMethod.MRNA,\n", + " taxon=taxon_id,\n", + " input_metadata_filepath=Path(\"./data/config_sheets/mrna_config.xlsx\"),\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" ] }, { @@ -483,33 +546,31 @@ "metadata": {}, "outputs": [], "source": [ - "scrnaseq_config_file = \"scrnaseq_data_inputs_auto.xlsx\"\n", - "rep_ratio = 0.75\n", - "group_ratio = 0.75\n", - "rep_ratio_h = 1.0\n", - "group_ratio_h = 1.0\n", - "quantile = 50\n", - "minimum_cutoff = -3\n", - "taxon_id = \"human\"\n", + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", "\n", - "# fmt: off\n", - "cmd = \" \".join(\n", - " [\n", - " \"python3\", \"como/rnaseq_gen.py\",\n", - " \"--config-file\", scrnaseq_config_file,\n", - " \"--replicate-ratio\", str(rep_ratio),\n", - " \"--batch-ratio\", str(group_ratio),\n", - " \"--high-replicate-ratio\", str(rep_ratio_h),\n", - " \"--high-batch-ratio\", str(group_ratio_h),\n", - " \"--minimum-cutoff\", str(minimum_cutoff),\n", - " \"--filt-technique\", \"umi\",\n", - " \"--library-prep\", \"scrna\",\n", - " \"--taxon-id\", taxon_id\n", - " ]\n", - ")\n", - "# fmt: on\n", - "\n", - "!{cmd}" + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.umi\n", + "cutoff = -3\n", + "\n", + "for i, context in enumerate(context_names):\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=scrna_matrix_filepath[i],\n", + " input_gene_info_filepath=gene_info_filepath[i],\n", + " output_rnaseq_filepath=scrna_matrix_filepath[i],\n", + " prep=RNAPrepMethod.SCRNA,\n", + " taxon=taxon_id,\n", + " input_metadata_filepath=Path(\"./data/config_sheets/scrna_config.xlsx\"),\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" ] }, { @@ -683,8 +744,8 @@ " [\n", " \"python3\", \"como/merge_xomics.py\",\n", " \"--merge-zfpkm-distribution\",\n", - " \"--total-rnaseq-config-file\", trnaseq_config_file,\n", - " \"--mrnaseq-config-file\", mrnaseq_config_file,\n", + " \"--total-rnaseq-config-file\", trnaseq_config_file, # noqa: F821\n", + " \"--mrnaseq-config-file\", mrnaseq_config_file, # noqa: F821\n", " # \"--scrnaseq-config-file\", scrnaseq_config_file, # If using single-cell data, uncomment the start of this line\n", " # \"--proteomics-config-file\", proteomics_config_file, # If using proteomics data, uncomment the start of this line\n", " \"--requirement-adjust\", requirement_adjust,\n", diff --git a/main/como/__init__.py b/main/como/__init__.py index 05100d09..c6ee72e2 100644 --- a/main/como/__init__.py +++ b/main/como/__init__.py @@ -4,7 +4,7 @@ from como.utils import stringlist_to_list __all__ = ["stringlist_to_list", "Config"] -__version__ = "1.10.0" +__version__ = "1.11.1" def return_placeholder_data() -> pd.DataFrame: diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 373dab92..f6ac8b91 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -308,9 +308,16 @@ def _build_with_imat( expr_vector: npt.NDArray, expr_thesh: tuple[float, float], force_gene_ids: Sequence[int], + solver: str, ) -> (cobra.Model, pd.DataFrame): expr_vector = np.array(expr_vector) - properties = IMATProperties(exp_vector=expr_vector, exp_thresholds=expr_thesh, core=force_gene_ids, epsilon=0.01) + properties = IMATProperties( + exp_vector=expr_vector, + exp_thresholds=expr_thesh, + core=force_gene_ids, + epsilon=0.01, + solver=solver.upper(), + ) algorithm = IMAT(s_matrix, np.array(lb), np.array(ub), properties) context_rxns: npt.NDArray = algorithm.run() fluxes: pd.Series = algorithm.sol.to_series() @@ -507,7 +514,14 @@ def _build_model( # noqa: C901 elif recon_algorithm == Algorithm.IMAT: context_model_cobra: cobra.Model context_model_cobra, flux_df = _build_with_imat( - reference_model, s_matrix, lb, ub, expr_vector, exp_thresh, idx_force + reference_model, + s_matrix, + lb, + ub, + expr_vector, + exp_thresh, + idx_force, + solver=solver, ) imat_reactions = flux_df.rxn model_reactions = [reaction.id for reaction in context_model_cobra.reactions] @@ -550,6 +564,7 @@ def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: df = _create_df(path) for column in df.columns: if column not in [ + "boundary", "reaction", "abbreviation", "compartment", @@ -561,31 +576,25 @@ def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: f"'Minimum Reaction Rate', and 'Maximum Reaction Rate'. Found: {column}" ) - reactions: list[str] = [] + reactions: list[str] = [""] * len(df) boundary_type: list[str] = df["reaction"].tolist() reaction_abbreviation: list[str] = df["abbreviation"].tolist() reaction_compartment: list[str] = df["compartment"].tolist() - lower_bounds = df["minimum reaction rate"].tolist() - upper_bounds = df["maximum reaction rate"].tolist() + lower_bound = df["minimum reaction rate"].tolist() + upper_bound = df["maximum reaction rate"].tolist() + boundary_map = {"exchange": "EX", "demand": "DM", "sink": "SK"} for i in range(len(boundary_type)): - current_type: str = boundary_type[i] - temp_reaction: str = "" - - match current_type.lower(): - case "exchange": - temp_reaction += "EX_" - case "demand": - temp_reaction += "DM_" - case "sink": - temp_reaction += "SK_" + boundary: str = boundary_type[i].lower() + if boundary not in boundary_map: + raise ValueError(f"Boundary reaction type must be 'Exchange', 'Demand', or 'Sink'. Found: {boundary[i]}") shorthand_compartment = Compartments.get(reaction_compartment[i]) - temp_reaction += f"{reaction_abbreviation[i]}[{shorthand_compartment}]" - reactions.append(temp_reaction) + reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]" + return _BoundaryReactions( reactions=reactions, - lower_bounds=lower_bounds, - upper_bounds=upper_bounds, + lower_bounds=lower_bound, + upper_bounds=upper_bound, ) @@ -631,10 +640,10 @@ def create_context_specific_model( # noqa: C901 raise ValueError(f"Output file type {output_type} not recognized. Must be one of: 'xml', 'mat', 'json'") if algorithm not in Algorithm: - raise ValueError(f"Algorithm {algorithm} not supported. Please use one of: GIMME, FASTCORE, or IMAT") + raise ValueError(f"Algorithm {algorithm} not supported. Use one of {', '.join(a.value for a in Algorithm)}") if solver not in Solver: - raise ValueError(f"Solver '{solver}' not supported. Use 'GLPK' or 'GUROBI'") + raise ValueError(f"Solver '{solver}' not supported. Use one of {', '.join(s.value for s in Solver)}") if boundary_rxns_filepath: boundary_reactions = _collect_boundary_reactions(boundary_rxns_filepath) @@ -667,7 +676,7 @@ def create_context_specific_model( # noqa: C901 bound_ub=boundary_reactions.upper_bounds, exclude_rxns=exclude_rxns, force_rxns=force_rxns, - solver=solver.value, + solver=solver.value.lower(), low_thresh=low_threshold, high_thresh=high_threshold, ) diff --git a/main/como/rnaseq.py b/main/como/rnaseq.py deleted file mode 100644 index cf45f39c..00000000 --- a/main/como/rnaseq.py +++ /dev/null @@ -1,764 +0,0 @@ -from __future__ import annotations - -import gc -import math -import multiprocessing -import time -from collections import namedtuple -from dataclasses import dataclass, field -from enum import Enum -from functools import partial -from multiprocessing.pool import Pool -from pathlib import Path -from typing import Callable, NamedTuple - -import numpy as np -import numpy.typing as npt -import pandas as pd -import plotly.graph_objs as go -import scanpy as sc -import sklearn -import sklearn.neighbors -from fast_bioservices import Taxon -from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol -from loguru import logger -from pandas import DataFrame -from plotly.subplots import make_subplots -from scipy.signal import find_peaks -from sklearn.neighbors import KernelDensity - -from como.migrations import gene_info_migrations -from como.types import RNAPrepMethod -from como.utils import convert_gene_data - - -class _FilteringOptions(NamedTuple): - replicate_ratio: float - batch_ratio: float - cut_off: float - high_replicate_ratio: float - high_batch_ratio: float - - -class FilteringTechnique(Enum): - """RNA sequencing filtering capabilities.""" - - cpm = "cpm" - zfpkm = "zfpkm" - tpm = "quantile" - umi = "umi" - - @staticmethod - def from_string(value: str) -> FilteringTechnique: - """Create a filtering technique object from a string.""" - match value.lower(): - case "cpm": - return FilteringTechnique.cpm - case "zfpkm": - return FilteringTechnique.zfpkm - case "quantile": - return FilteringTechnique.tpm - case "umi": - return FilteringTechnique.umi - case _: - possible_values = [t.value for t in FilteringTechnique] - raise ValueError(f"Filtering technique must be one of {possible_values}; got: {value}") - - -class LayoutMethod(Enum): - """RNA sequencing layout method.""" - - paired_end = "paired-end" - single_end = "single-end" - - -@dataclass -class _StudyMetrics: - study: str - num_samples: int - count_matrix: pd.DataFrame - fragment_lengths: npt.NDArray[np.float32] - sample_names: list[str] - layout: list[LayoutMethod] - entrez_gene_ids: list[str] - gene_sizes: npt.NDArray[np.float32] - __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) - __z_score_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) - __high_confidence_entrez_gene_ids: list[str] = field(default=list) - - def __post_init__(self): - for layout in self.layout: - if layout not in LayoutMethod: - raise ValueError(f"Layout must be 'paired-end' or 'single-end'; got: {layout}") - - @property - def normalization_matrix(self) -> pd.DataFrame: - return self.__normalization_matrix - - @normalization_matrix.setter - def normalization_matrix(self, value: pd.DataFrame) -> None: - self.__normalization_matrix = value - - @property - def z_score_matrix(self) -> pd.DataFrame: - return self.__z_score_matrix - - @z_score_matrix.setter - def z_score_matrix(self, value: pd.DataFrame) -> None: - self.__z_score_matrix = value - - @property - def high_confidence_entrez_gene_ids(self) -> list[str]: - return self.__high_confidence_entrez_gene_ids - - @high_confidence_entrez_gene_ids.setter - def high_confidence_entrez_gene_ids(self, values: list[str]) -> None: - self.__high_confidence_entrez_gene_ids = values - - -Density = namedtuple("Density", ["x", "y"]) - - -class _ZFPKMResult(NamedTuple): - zfpkm: pd.Series - density: Density - mu: float - std_dev: float - max_fpkm: float - - -class _ReadMatrixResults(NamedTuple): - metrics: dict[str, _StudyMetrics] - entrez_gene_ids: list[str] - - -NamedMetrics = dict[str, _StudyMetrics] - - -def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]: - """Return a function that filters rows of an array based on the sum of elements being greater than or equal to A at least k times. - - This code is based on the `kOverA` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA - - :param k: The minimum number of times the sum of elements must be greater than or equal to A. - :param a: The value to compare the sum of elements to. - :return: A function that accepts a NumPy array to perform the actual filtering - """ # noqa: E501 - - def filter_func(row: npt.NDArray) -> bool: - return np.sum(row >= a) >= k - - return filter_func - - -def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDArray], bool]) -> npt.NDArray: - """Apply a filter function to the rows of the data and return the filtered array. - - This code is based on the `genefilter` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/genefilter - - :param data: The data to filter - :param filter_func: THe function to filter the data by - :return: A NumPy array of the filtered data. - """ - match type(data): - case pd.DataFrame: - return data.apply(filter_func, axis=1).values - case npt.NDArray: - return np.apply_along_axis(filter_func, axis=1, arr=data) - case _: - raise ValueError("Unsupported data type. Must be a Pandas DataFrame or a NumPy array.") - - -async def _read_counts_matrix( - *, - context_name: str, - counts_matrix_filepath: Path, - config_filepath: Path, - gene_info_filepath: Path, - taxon_id: Taxon, -) -> _ReadMatrixResults: - """Read the counts matrix and returns the results. - - :param context_name: The context name being processed. Usually a cell type, but can be any string - :param counts_matrix_filepath: The file path to the gene count matrix - :param config_filepath: The file path to the Excel configuration file - :param gene_info_filepath: The file path to gene information generated by `rnaseq_preprocess.py` - :param taxon_id: The NCBI Taxon ID - :return: A dataclass `ReadMatrixResults` - """ - logger.trace(f"Reading config_filepath at '{config_filepath}'") - config_df: pd.DataFrame = pd.read_excel(config_filepath, sheet_name=context_name, header=0) - gene_info: pd.DataFrame = pd.read_csv(gene_info_filepath) - gene_info = gene_info[gene_info["ensembl_gene_id"] != "-"].reset_index(drop=True) - gene_info = gene_info_migrations(gene_info) - - match counts_matrix_filepath.suffix: - case ".csv": - logger.debug(f"Reading CSV file at '{counts_matrix_filepath}'") - counts_matrix: pd.DataFrame = pd.read_csv(counts_matrix_filepath, header=0) - if "ensembl_gene_id" not in counts_matrix.columns: - raise ValueError( - f"Counts matrix must contain a column named 'ensembl_gene_id'. " - f"Ensure the file '{counts_matrix_filepath}' contains this column." - ) - conversion = await ensembl_to_gene_id_and_symbol( - ids=counts_matrix["ensembl_gene_id"].tolist(), taxon=taxon_id - ) - counts_matrix = counts_matrix.merge(conversion, on="ensembl_gene_id", how="left") - - case ".h5ad": - logger.debug(f"Reading h5ad file at '{counts_matrix_filepath}'") - adata: sc.AnnData = sc.read_h5ad(counts_matrix_filepath) - counts_matrix: pd.DataFrame = adata.to_df().T # Make sample names the columns and gene data the index - - # Coherce the incoming gene data (i.e., Gene Symbols) into Entrez and Ensembl Gene IDs - conversion = await convert_gene_data(counts_matrix.index.tolist(), taxon_id) - counts_matrix.index.name = conversion.index.name - counts_matrix = counts_matrix.merge(conversion, left_index=True, right_index=True) - counts_matrix = counts_matrix[counts_matrix["entrez_gene_id"] != "-"] - counts_matrix.reset_index(inplace=True) - - # explicit garbage collection because this function runs for a little while - del adata - del conversion - gc.collect() - - case _: - raise ValueError( - f"Unknown file extension '{counts_matrix_filepath.suffix}'. Valid options are '.csv' or '.h5ad'." - ) - - if counts_matrix.empty: - raise ValueError( - f"Counts matrix is empty. Ensure the file contains data. " - f"Attempted to process file '{counts_matrix_filepath}'" - ) - - # Only include Entrez and Ensembl Gene IDs that are present in `gene_info` - counts_matrix["entrez_gene_id"] = counts_matrix["entrez_gene_id"].str.split("//") - counts_matrix = counts_matrix.explode("entrez_gene_id") - counts_matrix = counts_matrix.replace(to_replace="-", value=pd.NA).dropna() - counts_matrix["entrez_gene_id"] = counts_matrix["entrez_gene_id"].astype(int) - - gene_info = gene_info.replace(to_replace="-", value=pd.NA).dropna() - gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) - - counts_matrix = counts_matrix.merge( - gene_info[["entrez_gene_id", "ensembl_gene_id"]], - on=["entrez_gene_id", "ensembl_gene_id"], - how="inner", - ) - gene_info = gene_info.merge( - counts_matrix[["entrez_gene_id", "ensembl_gene_id"]], - on=["entrez_gene_id", "ensembl_gene_id"], - how="inner", - ) - - entrez_gene_ids: list[str] = gene_info["entrez_gene_id"].tolist() - metrics: NamedMetrics = {} - for study in config_df["study"].unique().tolist(): - study_sample_names = config_df[config_df["study"] == study]["sample_name"].tolist() - layouts = config_df[config_df["study"] == study]["layout"].tolist() - metrics[study] = _StudyMetrics( - count_matrix=counts_matrix[counts_matrix.columns.intersection(study_sample_names)], - fragment_lengths=config_df[config_df["study"] == study]["fragment_length"].values, - sample_names=study_sample_names, - layout=[LayoutMethod(layout) for layout in layouts], - num_samples=len(study_sample_names), - entrez_gene_ids=entrez_gene_ids, - gene_sizes=np.array(gene_info["size"].values).astype(np.float32), - study=study, - ) - metrics[study].fragment_lengths[np.isnan(metrics[study].fragment_lengths)] = 0 - metrics[study].count_matrix.index = pd.Index(entrez_gene_ids, name="entrez_gene_id") - - return _ReadMatrixResults(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].tolist()) - - -def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the Transcripts Per Million (TPM) for each sample in the metrics dictionary.""" - for sample in metrics: - count_matrix = metrics[sample].count_matrix - - gene_sizes = metrics[sample].gene_sizes - - tpm_matrix = pd.DataFrame(data=None, index=count_matrix.index, columns=count_matrix.columns) - for i in range(len(count_matrix.columns)): - values: pd.Series = count_matrix.iloc[:, i] + 1 # Add 1 to prevent division by 0 - rate = np.log(values.tolist()) - np.log(gene_sizes) - denominator = np.log(np.sum(np.exp(rate))) - tpm_value = np.exp(rate - denominator + np.log(1e6)) - tpm_matrix.iloc[:, i] = tpm_value - metrics[sample].normalization_matrix = tpm_matrix - - return metrics - - -def calculate_fpkm(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each sample in the metrics dictionary.""" # noqa: E501 - matrix_values = [] - for study in metrics: - for sample in range(metrics[study].num_samples): - layout = metrics[study].layout[sample] - count_matrix: npt.NDArray = metrics[study].count_matrix.iloc[:, sample].values - gene_size = metrics[study].gene_sizes - - count_matrix = count_matrix.astype(np.float32) - gene_size = gene_size.astype(np.float32) - - match layout: - case LayoutMethod.paired_end: # FPKM - mean_fragment_lengths = metrics[study].fragment_lengths[sample] - # Ensure non-negative value - effective_length = [max(0, size - (mean_fragment_lengths + 1)) for size in gene_size] - n = count_matrix.sum() - fpkm = ((count_matrix + 1) * 1e9) / (np.array(effective_length) * n) - matrix_values.append(fpkm) - case LayoutMethod.single_end: # RPKM - # Add a pseudocount before log to ensure log(0) does not happen - rate = np.log(count_matrix + 1) - np.log(gene_size) - exp_rate = np.exp(rate - np.log(np.sum(count_matrix)) + np.log(1e9)) - matrix_values.append(exp_rate) - case _: - raise ValueError("Invalid normalization method specified") - - fpkm_matrix = pd.DataFrame(matrix_values).T # Transpose is needed because values were appended as rows - fpkm_matrix = fpkm_matrix[~pd.isna(fpkm_matrix)] - metrics[study].normalization_matrix = fpkm_matrix - - metrics[study].normalization_matrix.columns = metrics[study].count_matrix.columns - - return metrics - - -def _zfpkm_calculation(col: pd.Series, kernel: KernelDensity, peak_parameters: tuple[float, float]) -> _ZFPKMResult: - """Log2 Transformations. - - Stabilize the variance in the data to make the distribution more symmetric; this is helpful for Gaussian fitting - - Kernel Density Estimation (kde) - - Non-parametric method to estimate the probability density function (PDF) of a random variable - - Estimates the distribution of log2-transformed FPKM values - - Bandwidth parameter controls the smoothness of the density estimate - - KDE Explanation - - A way to smooth a histogram to get a better idea of the underlying distribution of the data - - Given a set of data points, we want to understand how they are distributed. - Histograms can be useful, but are sensitive to bin size and number - - The KDE places a "kernel" - a small symmetric function (i.e., Gaussian curve) - at each data point - - The "kernel" acts as a weight, giving more weight to points closer to the center of the kernel, - and less weight to points farther away - - Kernel functions are summed along each point on the x-axis - - A smooth curve is created that represents the estimated density of the data - - Peak Finding - - Identifies that are above a certain height and separated by a minimum distance - - Represent potential local maxima in the distribution - - Peak Selection - - The peak with the highest x-value (from log2-FPKM) is chosen as the mean (mu) - of the "inactive" gene distribution - - The peak representing unexpressed or inactive genes should be at a lower FPKM - value compared to the peak representing expressed genes - - Standard Deviation Estimation - - The mean of log2-FPKM values are greater than the calculated mu - - Standard deviation is estimated based on the assumption that the right tail of the distribution - This represents expressed genes) can be approximated by a half-normal distribution - - zFPKM Transformation - - Centers disbribution around 0 and scales it by the standard deviation. - This makes it easier to compare gene expression across different samples - - Represents the number of standard deviations away from the mean of the "inactive" gene distribution - - Higher zFPKM values indicate higher expression levels relative to the "inactive" peak - - A zFPKM value of 0 represents the mean of the "inactive" distribution - - Research shows that a zFPKM value of -3 or greater can be used as - a threshold for calling a gene as "expressed" - : https://doi.org/10.1186/1471-2164-14-778 - """ - col_log2: npt.NDArray = np.log2(col + 1) - col_log2 = np.nan_to_num(col_log2, nan=0) - refit: KernelDensity = kernel.fit(col_log2.reshape(-1, 1)) # type: ignore - - # kde: KernelDensity = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(col_log2.reshape(-1, 1)) - x_range = np.linspace(col_log2.min(), col_log2.max(), 1000) - density = np.exp(refit.score_samples(x_range.reshape(-1, 1))) - peaks, _ = find_peaks(density, height=peak_parameters[0], distance=peak_parameters[1]) - peak_positions = x_range[peaks] - - mu = 0 - max_fpkm = 0 - stddev = 1 - - if len(peaks) != 0: - mu = peak_positions.max() - max_fpkm = density[peaks[np.argmax(peak_positions)]] - u = col_log2[col_log2 > mu].mean() - stddev = (u - mu) * np.sqrt(np.pi / 2) - zfpkm = pd.Series((col_log2 - mu) / stddev, dtype=np.float32, name=col.name) - - return _ZFPKMResult(zfpkm=zfpkm, density=Density(x_range, density), mu=mu, std_dev=stddev, max_fpkm=max_fpkm) - - -def zfpkm_transform( - fpkm_df: pd.DataFrame, - bandwidth: int = 0.5, - peak_parameters: tuple[float, float] = (0.02, 1.0), - update_every_percent: float = 0.1, -) -> tuple[dict[str, _ZFPKMResult], DataFrame]: - """Perform zFPKM calculation/transformation.""" - if update_every_percent > 1: - logger.warning( - f"update_every_percent should be a decimal value between 0 and 1; got: {update_every_percent} - " - f"will convert to percentage" - ) - update_every_percent /= 100 - - total = len(fpkm_df.columns) - update_per_step: int = int(np.ceil(total * update_every_percent)) - cores = multiprocessing.cpu_count() - 2 - logger.debug(f"Processing {total:,} samples through zFPKM transform using {cores} cores") - logger.debug( - f"Will update every {update_per_step:,} steps as this is approximately " - f"{update_every_percent:.1%} of {total:,}" - ) - - with Pool(processes=cores) as pool: - kernel = KernelDensity(kernel="gaussian", bandwidth=bandwidth) - chunksize = int(math.ceil(len(fpkm_df.columns) / (4 * cores))) - partial_func = partial(_zfpkm_calculation, kernel=kernel, peak_parameters=peak_parameters) - chunk_time = time.time() - start_time = time.time() - - log_padding = len(str(f"{total:,}")) - zfpkm_df = pd.DataFrame(data=0, index=fpkm_df.index, columns=fpkm_df.columns) - results: dict[str, _ZFPKMResult] = {} - result: _ZFPKMResult - for i, result in enumerate( - pool.imap( - partial_func, - (fpkm_df[col] for col in fpkm_df.columns), - chunksize=chunksize, - ) - ): - key = str(result.zfpkm.name) - results[key] = result - zfpkm_df[key] = result.zfpkm - - # show updates every X% and at the end, but skip on first iteration - if i != 0 and (i % update_per_step == 0 or i == total): - current_time = time.time() - chunk = current_time - chunk_time - total_time = current_time - start_time - formatted = f"{i:,}" - logger.debug( - f"Processed {formatted:>{log_padding}} of {total:,} - " - f"chunk took {chunk:.1f} seconds - " - f"running for {total_time:.1f} seconds" - ) - chunk_time = current_time - return results, zfpkm_df - - -def zfpkm_plot(results, *, plot_xfloor: int = -4, subplot_titles: bool = True): - """Plot the log2(FPKM) density and fitted Gaussian for each sample. - - :param results: A dictionary of intermediate results from zfpkm_transform. - :param: subplot_titles: Whether to display facet titles (sample names). - :param plot_xfloor: Lower limit for the x-axis. - :param subplot_titles: Whether to display facet titles (sample names). - """ - mega_df = pd.DataFrame(columns=["sample_name", "log2fpkm", "fpkm_density", "fitted_density_scaled"]) - for name, result in results.items(): - stddev = result.std_dev - x = np.array(result.density.x) - y = np.array(result.density.y) - - fitted = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi)) - max_fpkm = y.max() - max_fitted = fitted.max() - scale_fitted = fitted * (max_fpkm / max_fitted) - - df = pd.DataFrame( - { - "sample_name": [name] * len(x), - "log2fpkm": x, - "fpkm_density": y, - "fitted_density_scaled": scale_fitted, - } - ) - mega_df = pd.concat([mega_df, df], ignore_index=True) - - mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density") - subplot_titles = list(results.keys()) if subplot_titles else None - fig = make_subplots( - rows=len(results), - cols=1, - subplot_titles=subplot_titles, - vertical_spacing=min(0.05, (1 / (len(results) - 1))), - ) - - for i, (name, group) in enumerate(mega_df.groupby("sample_name"), start=1): - fig.add_trace( - trace=go.Scatter(x=group["log2fpkm"], y=group["density"], mode="lines", name=name, legendgroup=name), - row=i, - col=1, - ) - fig.update_xaxes(title_text="log2(FPKM)", range=[plot_xfloor, max(group["log2fpkm"].tolist())], row=i, col=1) - fig.update_yaxes(title_text="density [scaled]", row=i, col=1) - fig.update_layout(legend_tracegroupgap=0) - - fig.update_layout(height=600 * len(results), width=1000, title_text="zFPKM Plots", showlegend=True) - fig.write_image("zfpkm_plot.png") - - -def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: - """Calculate the z-score for each sample in the metrics dictionary.""" - for sample in metrics: - log_matrix = np.log(metrics[sample].normalization_matrix) - z_matrix = pd.DataFrame( - data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names - ) - metrics[sample].z_score_matrix = z_matrix - return metrics - - -def cpm_filter( - *, - metrics: NamedMetrics, - filtering_options: _FilteringOptions, - output_csv_filepath: Path, -) -> NamedMetrics: - """Apply Counts Per Million (CPM) filtering to the count matrix for a given sample.""" - n_exp = filtering_options.replicate_ratio - n_top = filtering_options.high_replicate_ratio - cut_off = filtering_options.cut_off - - metric: _StudyMetrics - for metric in metrics.values(): - counts: pd.DataFrame = metric.count_matrix - entrez_ids: list[str] = metric.entrez_gene_ids - library_size: pd.DataFrame = counts.sum(axis=1) - - # For library_sizes equal to 0, add 1 to prevent divide by 0 - # This will not impact the final counts per million calculation because the original counts are still 0 - # thus, (0 / 1) * 1_000_000 = 0 - library_size[library_size == 0] = 1 - - output_csv_filepath.parent.mkdir(parents=True, exist_ok=True) - counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000 - counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids)) - logger.debug(f"Writing CPM matrix to {output_csv_filepath}") - counts_per_million.to_csv(output_csv_filepath, index=False) - - # TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason. - # Most likely due to multiplying by 1_000_000, not exactly sure why - - min_samples = round(n_exp * len(counts.columns)) # noqa: F841 - top_samples = round(n_top * len(counts.columns)) # noqa: F841 - test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) - for i in range(len(counts_per_million.columns)): - cutoff = ( - 10e6 / (np.median(np.sum(counts[:, i]))) - if cut_off == "default" - else 1e6 * cut_off / np.median(np.sum(counts[:, i])) - ) - test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) - - return metrics - - -def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions) -> NamedMetrics: - """Apply quantile-based filtering to the TPM matrix for a given sample.""" - # TODO: Write the TPM matrix to disk - - n_exp = filtering_options.replicate_ratio - n_top = filtering_options.high_replicate_ratio - cut_off = filtering_options.cut_off - metrics = calculate_tpm(metrics) - - sample: str - metric: _StudyMetrics - for sample, metric in metrics.items(): - entrez_ids = metric.entrez_gene_ids - gene_size = metric.gene_sizes - tpm_matrix: pd.DataFrame = metric.normalization_matrix - - min_samples = round(n_exp * len(tpm_matrix.columns)) - top_samples = round(n_top * len(tpm_matrix.columns)) - - tpm_quantile = tpm_matrix[tpm_matrix > 0] - quantile_cutoff = np.quantile( - a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0 - ) # Compute quantile across columns - boolean_expression = pd.DataFrame( - data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns - ).astype(int) - - min_func = k_over_a(min_samples, 0.9) - top_func = k_over_a(top_samples, 0.9) - - min_genes: npt.NDArray[bool] = genefilter(boolean_expression, min_func) - top_genes: npt.NDArray[bool] = genefilter(boolean_expression, top_func) - - # Only keep `entrez_gene_ids` that pass `min_genes` - metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes) if keep] - metric.gene_sizes = [gene for gene, keep in zip(gene_size, min_genes) if keep] - metric.count_matrix = metric.count_matrix.iloc[min_genes, :] - metric.normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :] - - keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes) if keep] - metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes) if keep] - - metrics = calculate_z_score(metrics) - - return metrics - - -def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: - """Apply zFPKM filtering to the FPKM matrix for a given sample.""" - min_sample_expression = filtering_options.replicate_ratio - high_confidence_sample_expression = filtering_options.high_replicate_ratio - cut_off = filtering_options.cut_off - - if calcualte_fpkm: - metrics = calculate_fpkm(metrics) - - metric: _StudyMetrics - for metric in metrics.values(): - # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead - matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix - matrix = matrix[matrix.sum(axis=1) > 0] - - minimums = matrix == 0 - results, zfpkm_df = zfpkm_transform(matrix) - zfpkm_df[minimums] = -4 - zfpkm_plot(results) - - # determine which genes are expressed - min_samples = round(min_sample_expression * len(zfpkm_df.columns)) - min_func = k_over_a(min_samples, cut_off) - min_genes: npt.NDArray[bool] = genefilter(zfpkm_df, min_func) - metric.entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, min_genes) if keep] - - # determine which genes are confidently expressed - top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) - top_func = k_over_a(top_samples, cut_off) - top_genes: npt.NDArray[bool] = genefilter(zfpkm_df, top_func) - metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, top_genes) if keep] - - return metrics - - -def filter_counts( - *, - metrics: NamedMetrics, - technique: FilteringTechnique, - filtering_options: _FilteringOptions, - cpm_output_filepath: Path | None = None, -) -> NamedMetrics: - """Filter the count matrix based on the specified technique.""" - match technique: - case FilteringTechnique.cpm: - if cpm_output_filepath is None: - raise ValueError("CPM output filepath must be provided") - return cpm_filter( - metrics=metrics, - filtering_options=filtering_options, - output_csv_filepath=cpm_output_filepath, - ) - - case FilteringTechnique.tpm: - return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) - - case FilteringTechnique.zfpkm: - return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True) - - case FilteringTechnique.umi: - return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False) - - case _: - raise ValueError(f"Technique must be one of {FilteringTechnique}") - - -async def save_rnaseq_tests( - context_name: str, - counts_matrix_filepath: Path, - config_filepath: Path, - gene_info_filepath: Path, - output_filepath: Path, - prep: RNAPrepMethod, - taxon_id: Taxon, - replicate_ratio: float, - batch_ratio: float, - high_replicate_ratio: float, - high_batch_ratio: float, - technique: FilteringTechnique, - cut_off: int | float, -): - """Save the results of the RNA-Seq tests to a CSV file.""" - filtering_options = _FilteringOptions( - replicate_ratio=replicate_ratio, - batch_ratio=batch_ratio, - cut_off=cut_off, - high_replicate_ratio=high_replicate_ratio, - high_batch_ratio=high_batch_ratio, - ) - - if prep == RNAPrepMethod.SCRNA: - technique = FilteringTechnique.umi - logger.warning( - "Single cell filtration does not normalize and assumes " - "gene counts are counted with Unique Molecular Identifiers (UMIs). " - "Setting filtering technique to UMI now." - ) - - read_counts_results: _ReadMatrixResults = await _read_counts_matrix( - context_name=context_name, - counts_matrix_filepath=counts_matrix_filepath, - config_filepath=config_filepath, - gene_info_filepath=gene_info_filepath, - taxon_id=taxon_id, - ) - metrics = read_counts_results.metrics - entrez_gene_ids = read_counts_results.entrez_gene_ids - - metrics = filter_counts( - metrics=metrics, - technique=technique, - filtering_options=filtering_options, - ) - - expressed_genes: list[str] = [] - top_genes: list[str] = [] - for metric in metrics.values(): - expressed_genes.extend(metric.entrez_gene_ids) - top_genes.extend(metric.high_confidence_entrez_gene_ids) - - expression_frequency = pd.Series(expressed_genes).value_counts() - expression_df = pd.DataFrame( - {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values} - ) - expression_df["prop"] = expression_df["frequency"] / len(metrics) - expression_df = expression_df[expression_df["prop"] >= filtering_options.batch_ratio] - - top_frequency = pd.Series(top_genes).value_counts() - top_df = pd.DataFrame({"entrez_gene_id": top_frequency.index, "frequency": top_frequency.values}) - top_df["prop"] = top_df["frequency"] / len(metrics) - top_df = top_df[top_df["prop"] >= filtering_options.high_batch_ratio] - - boolean_matrix = pd.DataFrame(data={"entrez_gene_id": entrez_gene_ids, "expressed": 0, "high": 0}) - for gene in entrez_gene_ids: - if gene in expression_df["entrez_gene_id"]: - boolean_matrix.loc[gene, "expressed"] = 1 - if gene in top_df["entrez_gene_id"]: - boolean_matrix.loc[gene, "high"] = 1 - - expressed_count = len(boolean_matrix[boolean_matrix["expressed"] == 1]) - high_confidence_count = len(boolean_matrix[boolean_matrix["high"] == 1]) - - boolean_matrix.to_csv(output_filepath, index=False) - logger.info( - f"{context_name} - Found {expressed_count} expressed genes, " - f"{high_confidence_count} of which are confidently expressed" - ) - logger.success(f"Wrote boolean matrix to {output_filepath}") diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 47aeb8ee..364bd4c3 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -1,130 +1,723 @@ from __future__ import annotations -import argparse -import asyncio -from dataclasses import dataclass +import math +import multiprocessing +import time +from collections import namedtuple +from dataclasses import dataclass, field +from enum import Enum +from functools import partial +from multiprocessing.pool import Pool +from pathlib import Path +from typing import Callable, NamedTuple +import numpy as np +import numpy.typing as npt import pandas as pd -from fast_bioservices import Taxon +import plotly.graph_objs as go +import scanpy as sc +import sklearn +import sklearn.neighbors +from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol from loguru import logger +from pandas import DataFrame +from plotly.subplots import make_subplots +from scipy.signal import find_peaks +from sklearn.neighbors import KernelDensity -from como import Config -from como.custom_types import RNAPrepMethod -from como.rnaseq import FilteringTechnique, save_rnaseq_tests +from como.migrations import gene_info_migrations +from como.project import Config +from como.types import FilteringTechnique, RNAPrepMethod -@dataclass -class _Arguments: - config_file: str +class _FilteringOptions(NamedTuple): replicate_ratio: float batch_ratio: float + cut_off: float high_replicate_ratio: float high_batch_ratio: float - filtering_technique: FilteringTechnique - minimum_cutoff: int | str - library_prep: RNAPrepMethod - taxon: Taxon + + +class LayoutMethod(Enum): + """RNA sequencing layout method.""" + + paired_end = "paired-end" + single_end = "single-end" + + +@dataclass +class _StudyMetrics: + study: str + num_samples: int + count_matrix: pd.DataFrame + fragment_lengths: npt.NDArray[np.float32] + sample_names: list[str] + layout: list[LayoutMethod] + entrez_gene_ids: list[str] + gene_sizes: npt.NDArray[np.float32] + __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __z_score_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __high_confidence_entrez_gene_ids: list[str] = field(default=list) def __post_init__(self): - self.library_prep = RNAPrepMethod.from_string(str(self.library_prep)) - self.filtering_technique = FilteringTechnique.from_string(str(self.filtering_technique)) + for layout in self.layout: + if layout not in LayoutMethod: + raise ValueError(f"Layout must be 'paired-end' or 'single-end'; got: {layout}") - if self.minimum_cutoff is None: - if self.filtering_technique == FilteringTechnique.tpm: - self.minimum_cutoff = 25 - elif self.filtering_technique == FilteringTechnique.cpm: - self.minimum_cutoff = "default" - elif self.filtering_technique == FilteringTechnique.zfpkm: - self.minimum_cutoff = -3 + @property + def normalization_matrix(self) -> pd.DataFrame: + return self.__normalization_matrix + @normalization_matrix.setter + def normalization_matrix(self, value: pd.DataFrame) -> None: + self.__normalization_matrix = value -async def _handle_context_batch( - config_filename: str, - replicate_ratio: float, - batch_ratio: float, - replicate_ratio_high: float, - batch_ratio_high: float, - technique: FilteringTechnique, - cut_off: int | float | str, - prep: RNAPrepMethod, - taxon: Taxon, -) -> None: - """Iterate through each context type and create rnaseq expression file. + @property + def z_score_matrix(self) -> pd.DataFrame: + return self.__z_score_matrix - :param config_filename: The configuration filename to read - :param replicate_ratio: The percentage of replicates that a gene must - appear in for a gene to be marked as "active" in a batch/study - :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" - :param replicate_ratio_high: The percentage of replicates that a gene must - appear in for a gene to be marked "highly confident" in its expression in a batch/study - :param batch_ratio_high: The percentage of batches that a gene must - appear in for a gene to be marked "highly confident" in its expression - :param technique: The filtering technique to use - :param cut_off: The cutoff value to use for the provided filtering technique - :param prep: The library preparation method + @z_score_matrix.setter + def z_score_matrix(self, value: pd.DataFrame) -> None: + self.__z_score_matrix = value + + @property + def high_confidence_entrez_gene_ids(self) -> list[str]: + return self.__high_confidence_entrez_gene_ids + + @high_confidence_entrez_gene_ids.setter + def high_confidence_entrez_gene_ids(self, values: list[str]) -> None: + self.__high_confidence_entrez_gene_ids = values + + +class _ZFPKMResult(NamedTuple): + zfpkm: pd.Series + density: Density + mu: float + std_dev: float + max_fpkm: float + + +class _ReadMatrixResults(NamedTuple): + metrics: dict[str, _StudyMetrics] + entrez_gene_ids: list[str] + + +Density = namedtuple("Density", ["x", "y"]) +NamedMetrics = dict[str, _StudyMetrics] + + +def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]: + """Return a function that filters rows of an array based on the sum of elements being greater than or equal to A at least k times. + + This code is based on the `kOverA` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA + + :param k: The minimum number of times the sum of elements must be greater than or equal to A. + :param a: The value to compare the sum of elements to. + :return: A function that accepts a NumPy array to perform the actual filtering + """ # noqa: E501 + + def filter_func(row: npt.NDArray) -> bool: + return np.sum(row >= a) >= k + + return filter_func + + +def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDArray], bool]) -> npt.NDArray: + """Apply a filter function to the rows of the data and return the filtered array. + + This code is based on the `genefilter` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/genefilter + + :param data: The data to filter + :param filter_func: THe function to filter the data by + :return: A NumPy array of the filtered data. + """ + if not isinstance(data, (pd.DataFrame, npt.NDArray)): + raise TypeError("Unsupported data type. Must be a Pandas DataFrame or a NumPy array.") + + return ( + data.apply(filter_func, axis=1).values + if isinstance(data, pd.DataFrame) + else np.apply_along_axis(filter_func, axis=1, arr=data) + ) + + +async def _read_counts(path: Path) -> pd.DataFrame: + if path.suffix not in {".csv", ".h5ad"}: + raise ValueError(f"Unknown file extension '{path.suffix}'. Valid options are '.csv' or '.h5ad'.") + + matrix: pd.DataFrame + if path.suffix == ".csv": + logger.debug(f"Reading CSV file at '{path}'") + matrix = pd.read_csv(path, header=0) + elif path.suffix == ".h5ad": + logger.debug(f"Reading h5ad file at '{path}'") + # Make sample names the columns and gene data the index + matrix = sc.read_h5ad(path).to_df().T + + return matrix + + +async def _build_matrix_results( + *, + matrix: pd.DataFrame, + gene_info: pd.DataFrame, + metadata_df: pd.DataFrame, + taxon: int, +) -> _ReadMatrixResults: + """Read the counts matrix and returns the results. + + :param matrix: The gene counts matrix to process + :param metadata_df: The configuration dataframe related to the current context :param taxon: The NCBI Taxon ID - :return: None + :return: A dataclass `ReadMatrixResults` """ - config = Config() + gene_info = gene_info_migrations(gene_info) + conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon) + matrix = matrix.merge(conversion, on="ensembl_gene_id", how="left") - config_filepath = config.config_dir / config_filename - if not config_filepath.exists(): - raise FileNotFoundError(f"Unable to find '{config_filename}' at the path: '{config_filepath}'") - xl = pd.ExcelFile(config_filepath) - sheet_names = xl.sheet_names + # Only include Entrez and Ensembl Gene IDs that are present in `gene_info` + matrix["entrez_gene_id"] = matrix["entrez_gene_id"].str.split("//") + matrix = matrix.explode("entrez_gene_id") + matrix = matrix.replace(to_replace="-", value=pd.NA).dropna() + matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int) - logger.info(f"Reading config file: {config_filepath}") + gene_info = gene_info.replace(to_replace="-", value=pd.NA).dropna() + gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) - for context_name in sheet_names: - logger.debug(f"Starting '{context_name}'") + counts_matrix = matrix.merge( + gene_info[["entrez_gene_id", "ensembl_gene_id"]], + on=["entrez_gene_id", "ensembl_gene_id"], + how="inner", + ) + gene_info = gene_info.merge( + counts_matrix[["entrez_gene_id", "ensembl_gene_id"]], + on=["entrez_gene_id", "ensembl_gene_id"], + how="inner", + ) - rnaseq_input_filepath = ( - config.data_dir / "data_matrices" / context_name / f"gene_counts_matrix_{prep.value}_{context_name}" + entrez_gene_ids: list[str] = gene_info["entrez_gene_id"].tolist() + metrics: NamedMetrics = {} + for study in metadata_df["study"].unique().tolist(): + study_sample_names = metadata_df[metadata_df["study"] == study]["sample_name"].tolist() + layouts = metadata_df[metadata_df["study"] == study]["layout"].tolist() + metrics[study] = _StudyMetrics( + count_matrix=counts_matrix[counts_matrix.columns.intersection(study_sample_names)], + fragment_lengths=metadata_df[metadata_df["study"] == study]["fragment_length"].values, + sample_names=study_sample_names, + layout=[LayoutMethod(layout) for layout in layouts], + num_samples=len(study_sample_names), + entrez_gene_ids=entrez_gene_ids, + gene_sizes=np.array(gene_info["size"].values).astype(np.float32), + study=study, ) - if prep == RNAPrepMethod.SCRNA: - rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".h5ad") - elif prep in {RNAPrepMethod.TOTAL, RNAPrepMethod.MRNA}: - rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".csv") - - if not rnaseq_input_filepath.exists(): - logger.warning(f"Gene counts matrix not found at {rnaseq_input_filepath}, skipping...") - continue - - gene_info_filepath = config.data_dir / "gene_info.csv" - rnaseq_output_filepath = ( - config.result_dir / context_name / prep.value / f"rnaseq_{prep.value}_{context_name}.csv" + metrics[study].fragment_lengths[np.isnan(metrics[study].fragment_lengths)] = 0 + metrics[study].count_matrix.index = pd.Index(entrez_gene_ids, name="entrez_gene_id") + + return _ReadMatrixResults(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].tolist()) + + +def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the Transcripts Per Million (TPM) for each sample in the metrics dictionary.""" + for sample in metrics: + count_matrix = metrics[sample].count_matrix + + gene_sizes = metrics[sample].gene_sizes + + tpm_matrix = pd.DataFrame(data=None, index=count_matrix.index, columns=count_matrix.columns) + for i in range(len(count_matrix.columns)): + values: pd.Series = count_matrix.iloc[:, i] + 1 # Add 1 to prevent division by 0 + rate = np.log(values.tolist()) - np.log(gene_sizes) + denominator = np.log(np.sum(np.exp(rate))) + tpm_value = np.exp(rate - denominator + np.log(1e6)) + tpm_matrix.iloc[:, i] = tpm_value + metrics[sample].normalization_matrix = tpm_matrix + + return metrics + + +def calculate_fpkm(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each sample in the metrics dictionary.""" # noqa: E501 + matrix_values = [] + for study in metrics: + for sample in range(metrics[study].num_samples): + layout = metrics[study].layout[sample] + count_matrix: npt.NDArray = metrics[study].count_matrix.iloc[:, sample].values + gene_size = metrics[study].gene_sizes + + count_matrix = count_matrix.astype(np.float32) + gene_size = gene_size.astype(np.float32) + + match layout: + case LayoutMethod.paired_end: # FPKM + mean_fragment_lengths = metrics[study].fragment_lengths[sample] + # Ensure non-negative value + effective_length = [max(0, size - (mean_fragment_lengths + 1)) for size in gene_size] + n = count_matrix.sum() + fpkm = ((count_matrix + 1) * 1e9) / (np.array(effective_length) * n) + matrix_values.append(fpkm) + case LayoutMethod.single_end: # RPKM + # Add a pseudocount before log to ensure log(0) does not happen + rate = np.log(count_matrix + 1) - np.log(gene_size) + exp_rate = np.exp(rate - np.log(np.sum(count_matrix)) + np.log(1e9)) + matrix_values.append(exp_rate) + case _: + raise ValueError("Invalid normalization method specified") + + fpkm_matrix = pd.DataFrame(matrix_values).T # Transpose is needed because values were appended as rows + fpkm_matrix = fpkm_matrix[~pd.isna(fpkm_matrix)] + metrics[study].normalization_matrix = fpkm_matrix + + metrics[study].normalization_matrix.columns = metrics[study].count_matrix.columns + + return metrics + + +def _zfpkm_calculation(col: pd.Series, kernel: KernelDensity, peak_parameters: tuple[float, float]) -> _ZFPKMResult: + """Log2 Transformations. + + Stabilize the variance in the data to make the distribution more symmetric; this is helpful for Gaussian fitting + + Kernel Density Estimation (kde) + - Non-parametric method to estimate the probability density function (PDF) of a random variable + - Estimates the distribution of log2-transformed FPKM values + - Bandwidth parameter controls the smoothness of the density estimate + - KDE Explanation + - A way to smooth a histogram to get a better idea of the underlying distribution of the data + - Given a set of data points, we want to understand how they are distributed. + Histograms can be useful, but are sensitive to bin size and number + - The KDE places a "kernel" - a small symmetric function (i.e., Gaussian curve) - at each data point + - The "kernel" acts as a weight, giving more weight to points closer to the center of the kernel, + and less weight to points farther away + - Kernel functions are summed along each point on the x-axis + - A smooth curve is created that represents the estimated density of the data + + Peak Finding + - Identifies that are above a certain height and separated by a minimum distance + - Represent potential local maxima in the distribution + + Peak Selection + - The peak with the highest x-value (from log2-FPKM) is chosen as the mean (mu) + of the "inactive" gene distribution + - The peak representing unexpressed or inactive genes should be at a lower FPKM + value compared to the peak representing expressed genes + + Standard Deviation Estimation + - The mean of log2-FPKM values are greater than the calculated mu + - Standard deviation is estimated based on the assumption that the right tail of the distribution + This represents expressed genes) can be approximated by a half-normal distribution + + zFPKM Transformation + - Centers disbribution around 0 and scales it by the standard deviation. + This makes it easier to compare gene expression across different samples + - Represents the number of standard deviations away from the mean of the "inactive" gene distribution + - Higher zFPKM values indicate higher expression levels relative to the "inactive" peak + - A zFPKM value of 0 represents the mean of the "inactive" distribution + - Research shows that a zFPKM value of -3 or greater can be used as + a threshold for calling a gene as "expressed" + : https://doi.org/10.1186/1471-2164-14-778 + """ + col_log2: npt.NDArray = np.log2(col + 1) + col_log2 = np.nan_to_num(col_log2, nan=0) + refit: KernelDensity = kernel.fit(col_log2.reshape(-1, 1)) # type: ignore + + # kde: KernelDensity = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(col_log2.reshape(-1, 1)) + x_range = np.linspace(col_log2.min(), col_log2.max(), 1000) + density = np.exp(refit.score_samples(x_range.reshape(-1, 1))) + peaks, _ = find_peaks(density, height=peak_parameters[0], distance=peak_parameters[1]) + peak_positions = x_range[peaks] + + mu = 0 + max_fpkm = 0 + stddev = 1 + + if len(peaks) != 0: + mu = peak_positions.max() + max_fpkm = density[peaks[np.argmax(peak_positions)]] + u = col_log2[col_log2 > mu].mean() + stddev = (u - mu) * np.sqrt(np.pi / 2) + zfpkm = pd.Series((col_log2 - mu) / stddev, dtype=np.float32, name=col.name) + + return _ZFPKMResult(zfpkm=zfpkm, density=Density(x_range, density), mu=mu, std_dev=stddev, max_fpkm=max_fpkm) + + +def zfpkm_transform( + fpkm_df: pd.DataFrame, + bandwidth: int = 0.5, + peak_parameters: tuple[float, float] = (0.02, 1.0), + update_every_percent: float = 0.1, +) -> tuple[dict[str, _ZFPKMResult], DataFrame]: + """Perform zFPKM calculation/transformation.""" + if update_every_percent > 1: + logger.warning( + f"update_every_percent should be a decimal value between 0 and 1; got: {update_every_percent} - " + f"will convert to percentage" ) - rnaseq_output_filepath.parent.mkdir(parents=True, exist_ok=True) - - await save_rnaseq_tests( - context_name=context_name, - counts_matrix_filepath=rnaseq_input_filepath, - config_filepath=config_filepath, - output_filepath=rnaseq_output_filepath.as_posix(), - gene_info_filepath=gene_info_filepath, - prep=prep, - replicate_ratio=replicate_ratio, - batch_ratio=batch_ratio, - high_replicate_ratio=replicate_ratio_high, - high_batch_ratio=batch_ratio_high, - technique=technique, - cut_off=cut_off, - taxon_id=taxon, + update_every_percent /= 100 + + total = len(fpkm_df.columns) + update_per_step: int = int(np.ceil(total * update_every_percent)) + cores = min(multiprocessing.cpu_count() - 2, total) + logger.debug(f"Processing {total:,} samples through zFPKM transform using {cores} cores") + logger.debug( + f"Will update every {update_per_step:,} steps as this is approximately " + f"{update_every_percent:.1%} of {total:,}" + ) + + with Pool(processes=cores) as pool: + kernel = KernelDensity(kernel="gaussian", bandwidth=bandwidth) + chunksize = int(math.ceil(len(fpkm_df.columns) / (4 * cores))) + partial_func = partial(_zfpkm_calculation, kernel=kernel, peak_parameters=peak_parameters) + chunk_time = time.time() + start_time = time.time() + + log_padding = len(str(f"{total:,}")) + zfpkm_df = pd.DataFrame(data=0, index=fpkm_df.index, columns=fpkm_df.columns) + results: dict[str, _ZFPKMResult] = {} + result: _ZFPKMResult + for i, result in enumerate( + pool.imap( + partial_func, + (fpkm_df[col] for col in fpkm_df.columns), + chunksize=chunksize, + ) + ): + key = str(result.zfpkm.name) + results[key] = result + zfpkm_df[key] = result.zfpkm + + # show updates every X% and at the end, but skip on first iteration + if i != 0 and (i % update_per_step == 0 or i == total): + current_time = time.time() + chunk = current_time - chunk_time + total_time = current_time - start_time + formatted = f"{i:,}" + logger.debug( + f"Processed {formatted:>{log_padding}} of {total:,} - " + f"chunk took {chunk:.1f} seconds - " + f"running for {total_time:.1f} seconds" + ) + chunk_time = current_time + return results, zfpkm_df + + +def zfpkm_plot(results, *, plot_xfloor: int = -4, subplot_titles: bool = True): + """Plot the log2(FPKM) density and fitted Gaussian for each sample. + + :param results: A dictionary of intermediate results from zfpkm_transform. + :param: subplot_titles: Whether to display facet titles (sample names). + :param plot_xfloor: Lower limit for the x-axis. + :param subplot_titles: Whether to display facet titles (sample names). + """ + mega_df = pd.DataFrame(columns=["sample_name", "log2fpkm", "fpkm_density", "fitted_density_scaled"]) + for name, result in results.items(): + stddev = result.std_dev + x = np.array(result.density.x) + y = np.array(result.density.y) + + fitted = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi)) + max_fpkm = y.max() + max_fitted = fitted.max() + scale_fitted = fitted * (max_fpkm / max_fitted) + + df = pd.DataFrame( + { + "sample_name": [name] * len(x), + "log2fpkm": x, + "fpkm_density": y, + "fitted_density_scaled": scale_fitted, + } ) - logger.success(f"Results saved at '{rnaseq_output_filepath}'") + mega_df = pd.concat([mega_df, df], ignore_index=True) + + mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density") + subplot_titles = list(results.keys()) if subplot_titles else None + fig = make_subplots( + rows=len(results), + cols=1, + subplot_titles=subplot_titles, + vertical_spacing=min(0.05, (1 / (len(results) - 1))), + ) + + for i, (name, group) in enumerate(mega_df.groupby("sample_name"), start=1): + fig.add_trace( + trace=go.Scatter(x=group["log2fpkm"], y=group["density"], mode="lines", name=name, legendgroup=name), + row=i, + col=1, + ) + fig.update_xaxes(title_text="log2(FPKM)", range=[plot_xfloor, max(group["log2fpkm"].tolist())], row=i, col=1) + fig.update_yaxes(title_text="density [scaled]", row=i, col=1) + fig.update_layout(legend_tracegroupgap=0) + + fig.update_layout(height=600 * len(results), width=1000, title_text="zFPKM Plots", showlegend=True) + fig.write_image("zfpkm_plot.png") -async def rnaseq_gen( - # config_filepath: Path, - config_filename: str, +def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the z-score for each sample in the metrics dictionary.""" + for sample in metrics: + log_matrix = np.log(metrics[sample].normalization_matrix) + z_matrix = pd.DataFrame( + data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names + ) + metrics[sample].z_score_matrix = z_matrix + return metrics + + +def cpm_filter( + *, + context_name: str, + metrics: NamedMetrics, + filtering_options: _FilteringOptions, prep: RNAPrepMethod, - taxon_id: int | str | Taxon, +) -> NamedMetrics: + """Apply Counts Per Million (CPM) filtering to the count matrix for a given sample.""" + config = Config() + n_exp = filtering_options.replicate_ratio + n_top = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + + sample: str + metric: _StudyMetrics + for sample, metric in metrics.items(): + counts: pd.DataFrame = metric.count_matrix + entrez_ids: list[str] = metric.entrez_gene_ids + library_size: pd.DataFrame = counts.sum(axis=1) + + # For library_sizes equal to 0, add 1 to prevent divide by 0 + # This will not impact the final counts per million calculation because the original counts are still 0 + # thus, (0 / 1) * 1_000_000 = 0 + library_size[library_size == 0] = 1 + + output_filepath = config.result_dir / context_name / prep.value / f"CPM_Matrix_{prep.value}_{sample}.csv" + output_filepath.parent.mkdir(parents=True, exist_ok=True) + counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000 + counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids)) + logger.debug(f"Writing CPM matrix to {output_filepath}") + counts_per_million.to_csv(output_filepath, index=False) + + # TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason. + # Most likely due to multiplying by 1_000_000, not exactly sure why + + min_samples = round(n_exp * len(counts.columns)) # noqa: F841 + top_samples = round(n_top * len(counts.columns)) # noqa: F841 + test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) + for i in range(len(counts_per_million.columns)): + cutoff = ( + 10e6 / (np.median(np.sum(counts[:, i]))) + if cut_off == "default" + else (1e6 * cut_off) / np.median(np.sum(counts[:, i])) + ) + test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) + + return metrics + + +def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions) -> NamedMetrics: + """Apply quantile-based filtering to the TPM matrix for a given sample.""" + # TODO: Write the TPM matrix to disk + + n_exp = filtering_options.replicate_ratio + n_top = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + metrics = calculate_tpm(metrics) + + sample: str + metric: _StudyMetrics + for sample, metric in metrics.items(): + entrez_ids = metric.entrez_gene_ids + gene_size = metric.gene_sizes + tpm_matrix: pd.DataFrame = metric.normalization_matrix + + min_samples = round(n_exp * len(tpm_matrix.columns)) + top_samples = round(n_top * len(tpm_matrix.columns)) + + tpm_quantile = tpm_matrix[tpm_matrix > 0] + quantile_cutoff = np.quantile( + a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0 + ) # Compute quantile across columns + boolean_expression = pd.DataFrame( + data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns + ).astype(int) + + min_func = k_over_a(min_samples, 0.9) + top_func = k_over_a(top_samples, 0.9) + + min_genes: npt.NDArray[bool] = genefilter(boolean_expression, min_func) + top_genes: npt.NDArray[bool] = genefilter(boolean_expression, top_func) + + # Only keep `entrez_gene_ids` that pass `min_genes` + metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes) if keep] + metric.gene_sizes = [gene for gene, keep in zip(gene_size, min_genes) if keep] + metric.count_matrix = metric.count_matrix.iloc[min_genes, :] + metric.normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :] + + keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes) if keep] + metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes) if keep] + + metrics = calculate_z_score(metrics) + + return metrics + + +def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions, calcualte_fpkm: bool) -> NamedMetrics: + """Apply zFPKM filtering to the FPKM matrix for a given sample.""" + min_sample_expression = filtering_options.replicate_ratio + high_confidence_sample_expression = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + + if calcualte_fpkm: + metrics = calculate_fpkm(metrics) + + metric: _StudyMetrics + for metric in metrics.values(): + # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead + matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix + matrix = matrix[matrix.sum(axis=1) > 0] + + minimums = matrix == 0 + results, zfpkm_df = zfpkm_transform(matrix) + zfpkm_df[minimums] = -4 + zfpkm_plot(results) + + # determine which genes are expressed + min_samples = round(min_sample_expression * len(zfpkm_df.columns)) + min_func = k_over_a(min_samples, cut_off) + min_genes: npt.NDArray[bool] = genefilter(zfpkm_df, min_func) + metric.entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, min_genes) if keep] + + # determine which genes are confidently expressed + top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) + top_func = k_over_a(top_samples, cut_off) + top_genes: npt.NDArray[bool] = genefilter(zfpkm_df, top_func) + metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(metric.entrez_gene_ids, top_genes) if keep] + + return metrics + + +def filter_counts( + *, + context_name: str, + metrics: NamedMetrics, + technique: FilteringTechnique, + filtering_options: _FilteringOptions, + prep: RNAPrepMethod, +) -> NamedMetrics: + """Filter the count matrix based on the specified technique.""" + match technique: + case FilteringTechnique.cpm: + return cpm_filter( + context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep + ) + case FilteringTechnique.tpm: + return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) + case FilteringTechnique.zfpkm: + return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True) + case FilteringTechnique.umi: + return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False) + case _: + raise ValueError(f"Technique must be one of {FilteringTechnique}") + + +async def _save_rnaseq_tests( + context_name: str, + rnaseq_matrix: pd.DataFrame, + metadata_df: pd.DataFrame, + gene_info_df: pd.DataFrame, + output_filepath: Path, + prep: RNAPrepMethod, + taxon: int, + replicate_ratio: float, + batch_ratio: float, + high_replicate_ratio: float, + high_batch_ratio: float, + technique: FilteringTechnique, + cut_off: int | float, +): + """Save the results of the RNA-Seq tests to a CSV file.""" + filtering_options = _FilteringOptions( + replicate_ratio=replicate_ratio, + batch_ratio=batch_ratio, + cut_off=float(cut_off), + high_replicate_ratio=high_replicate_ratio, + high_batch_ratio=high_batch_ratio, + ) + + read_counts_results: _ReadMatrixResults = await _build_matrix_results( + matrix=rnaseq_matrix, + gene_info=gene_info_df, + metadata_df=metadata_df, + taxon=taxon, + ) + metrics = read_counts_results.metrics + entrez_gene_ids = read_counts_results.entrez_gene_ids + + metrics = filter_counts( + context_name=context_name, + metrics=metrics, + technique=technique, + filtering_options=filtering_options, + prep=prep, + ) + + expressed_genes: list[str] = [] + top_genes: list[str] = [] + for metric in metrics.values(): + expressed_genes.extend(metric.entrez_gene_ids) + top_genes.extend(metric.high_confidence_entrez_gene_ids) + + expression_frequency = pd.Series(expressed_genes).value_counts() + expression_df = pd.DataFrame( + {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values} + ) + expression_df["prop"] = expression_df["frequency"] / len(metrics) + expression_df = expression_df[expression_df["prop"] >= filtering_options.batch_ratio] + + top_frequency = pd.Series(top_genes).value_counts() + top_df = pd.DataFrame({"entrez_gene_id": top_frequency.index, "frequency": top_frequency.values}) + top_df["prop"] = top_df["frequency"] / len(metrics) + top_df = top_df[top_df["prop"] >= filtering_options.high_batch_ratio] + + boolean_matrix = pd.DataFrame(data={"entrez_gene_id": entrez_gene_ids, "expressed": 0, "high": 0}) + for gene in entrez_gene_ids: + if gene in expression_df["entrez_gene_id"]: + boolean_matrix.loc[gene, "expressed"] = 1 + if gene in top_df["entrez_gene_id"]: + boolean_matrix.loc[gene, "high"] = 1 + + expressed_count = len(boolean_matrix[boolean_matrix["expressed"] == 1]) + high_confidence_count = len(boolean_matrix[boolean_matrix["high"] == 1]) + + boolean_matrix.to_csv(output_filepath, index=False) + logger.info( + f"{context_name} - Found {expressed_count} expressed and {high_confidence_count} confidently expressed genes" + ) + logger.success(f"Wrote boolean matrix to {output_filepath}") + + +async def _create_metadata_df(path: Path) -> pd.DataFrame: + if path.suffix not in {".xls", ".xlsx"}: + raise ValueError( + f"Expected an excel file with extension of '.xlsx' or '.xls', got '{path.suffix}'. " + f"Attempted to process: {path}" + ) + return pd.read_excel(path) + + +async def rnaseq_gen( # noqa: C901, allow complex function + context_name: str, + input_rnaseq_filepath: Path, + input_gene_info_filepath: Path, + output_rnaseq_filepath: Path, + prep: RNAPrepMethod, + taxon: int, + input_metadata_filepath: Path | None = None, + input_metadata_df: pd.DataFrame | None = None, replicate_ratio: float = 0.5, high_replicate_ratio: float = 1.0, batch_ratio: float = 0.5, high_batch_ratio: float = 1.0, technique: FilteringTechnique | str = FilteringTechnique.tpm, - cut_off: int | float | None = None, + cutoff: int | float | None = None, ) -> None: """Generate a list of active and high-confidence genes from a gene count matrix. @@ -132,9 +725,14 @@ async def rnaseq_gen( then study/batch numbers are checked for consensus according to batch ratios. The zFPKM method is outlined here: https://pubmed.ncbi.nlm.nih.gov/24215113/ - :param config_filename: The configuration filename to read + :param context_name: The name of the context being processed + :param input_rnaseq_filepath: The filepath to the gene count matrix + :param input_gene_info_filepath: The filepath to the gene info file + :param output_rnaseq_filepath: The filepath to write the output gene count matrix :param prep: The preparation method - :param taxon_id: The NCBI Taxon ID + :param taxon: The NCBI Taxon ID + :param input_metadata_filepath: The filepath to the metadata file + :param input_metadata_df: The metadata dataframe :param replicate_ratio: The percentage of replicates that a gene must appear in for a gene to be marked as "active" in a batch/study :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" @@ -143,168 +741,61 @@ async def rnaseq_gen( :param high_batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked "highly confident" in its expression :param technique: The filtering technique to use - :param cut_off: The cutoff value to use for the provided filtering technique + :param cutoff: The cutoff value to use for the provided filtering technique :return: None """ - if isinstance(technique, str): - technique = FilteringTechnique(technique.lower()) - if isinstance(taxon_id, (str, int)): - taxon_id = Taxon.from_string(str(taxon_id)) + if not input_metadata_df and not input_metadata_filepath: + raise ValueError("At least one of input_metadata_filepath or input_metadata_df must be provided") + + technique = ( + FilteringTechnique.from_string(str(technique.lower())) if isinstance(technique, (str, int)) else technique + ) match technique: case FilteringTechnique.tpm: - cut_off = 25 if cut_off is None else cut_off - if cut_off < 1 or cut_off > 100: + cutoff = cutoff or 25 + if cutoff < 1 or cutoff > 100: raise ValueError("Quantile must be between 1 - 100") case FilteringTechnique.cpm: - if cut_off is not None and cut_off < 0: + if cutoff and cutoff < 0: raise ValueError("Cutoff must be greater than 0") - elif cut_off is None: - cut_off = "default" + elif cutoff: + cutoff = "default" case FilteringTechnique.zfpkm: - cut_off = "default" if cut_off is None else cut_off + cutoff = cutoff or -3 case FilteringTechnique.umi: pass case _: raise ValueError(f"Technique must be one of {FilteringTechnique}") - await _handle_context_batch( - config_filename=config_filename, + if not input_rnaseq_filepath.exists(): + raise FileNotFoundError(f"Input RNA-seq file not found! Searching for: '{input_rnaseq_filepath}'") + + if prep == RNAPrepMethod.SCRNA: + technique = FilteringTechnique.umi + logger.warning( + "Single cell filtration does not normalize and assumes " + "gene counts are counted with Unique Molecular Identifiers (UMIs). " + "Setting filtering technique to UMI now." + ) + + logger.debug(f"Starting '{context_name}'") + output_rnaseq_filepath.parent.mkdir(parents=True, exist_ok=True) + + await _save_rnaseq_tests( + context_name=context_name, + rnaseq_matrix=await _read_counts(input_rnaseq_filepath), + metadata_df=input_metadata_df or await _create_metadata_df(input_metadata_filepath), + gene_info_df=pd.read_csv(input_gene_info_filepath), + output_filepath=output_rnaseq_filepath, + prep=prep, + taxon=taxon, replicate_ratio=replicate_ratio, - replicate_ratio_high=high_replicate_ratio, batch_ratio=batch_ratio, - batch_ratio_high=high_batch_ratio, + high_replicate_ratio=high_replicate_ratio, + high_batch_ratio=high_batch_ratio, technique=technique, - cut_off=cut_off, - prep=prep, - taxon=taxon_id, - ) - - -def _parse_args() -> _Arguments: - parser = argparse.ArgumentParser( - prog="rnaseq_gen.py", - description="Generate a list of active and high-confidence genes from a counts matrix using a user defined " - "at normalization-technique at /work/data/results//rnaseq_.csv: " - "https://github.com/HelikarLab/FastqToGeneCounts", - epilog="For additional help, please post questions/issues in the MADRID GitHub repo at " - "https://github.com/HelikarLab/MADRID or email babessell@gmail.com", - ) - parser.add_argument( - "-c", - "--config-file", - type=str, - required=True, - dest="config_file", - help="Name of config .xlsx file in the /work/data/config_files/. Can be generated using " - "rnaseq_preprocess.py or manually created and imported into the Juypterlab", - ) - parser.add_argument( - "-r", - "--replicate-ratio", - type=float, - required=False, - default=0.5, - dest="replicate_ratio", - help="Ratio of replicates required for a gene to be active within that study/batch group " - "Example: 0.7 means that for a gene to be active, at least 70% of replicates in a group " - "must pass the cutoff after normalization", - ) - parser.add_argument( - "-g", - "--batch-ratio", - type=float, - required=False, - default=0.5, - dest="batch_ratio", - help="Ratio of groups (studies or batches) required for a gene to be active " - "Example: 0.7 means that for a gene to be active, at least 70% of groups in a study must " - "have passed the replicate ratio test", - ) - parser.add_argument( - "-rh", - "--high-replicate-ratio", - type=float, - required=False, - default=1.0, - dest="high_replicate_ratio", - help="Ratio of replicates required for a gene to be considered high-confidence. " - "High-confidence genes ignore consensus with other data-sources, such as proteomics. " - "Example: 0.9 means that for a gene to be high-confidence, " - "at least 90% of replicates in a group must pass the cutoff after normalization", - ) - parser.add_argument( - "-gh", - "--high-batch-ratio", - type=float, - required=False, - default=1.0, - dest="high_batch_ratio", - help="Ratio of studies/batches required for a gene to be considered high-confidence within that group. " - "High-confidence genes ignore consensus with other data-sources, like proteomics. " - "Example: 0.9 means that for a gene to be high-confidence, " - "at least 90% of groups in a study must have passed the replicate ratio test", - ) - parser.add_argument( - "--taxon", - "--taxon-id", - type=str, - required=True, - dest="taxon", - help="The NCBI Taxonomy ID that is being proessed. '9606' for humans, '10090' for mice.", - ) - parser.add_argument( - "-t", - "--filt-technique", - type=str, - required=False, - default="quantile", - dest="filtering_technique", - help="Technique to normalize and filter counts with. " - "Either 'zfpkm', 'quantile', or 'cpm'. More info about each method is discussed in pipeline.ipynb.", - ) - parser.add_argument( - "--minimum-cutoff", - type=int, - required=False, - default=None, - dest="minimum_cutoff", - help="The minimum cutoff used for the filtration technique. " - "If the filtering technique is zFPKM, the default is -3. " - "If the filtering technique is quantile-tpm, the default is 25. " - "If the filtering technique is flat-cpm, the default is determined dynamically. " - "If the filtering technique is quantile, the default is 25.", - ) - parser.add_argument( - "-p", - "--library-prep", - required=True, - choices=["total", "mrna", "scrna"], - dest="library_prep", - help="Library preparation method. " - "Will separate samples into groups to only compare similarly prepared libraries. " - "For example, mRNA, total-rna, scRNA, etc", - ) - args = parser.parse_args() - args.filtering_technique = args.filtering_technique.lower() - args.taxon = Taxon.from_int(int(args.taxon)) if str(args.taxon).isdigit() else Taxon.from_string(str(args.taxon)) # type: ignore - return _Arguments(**vars(args)) - - -if __name__ == "__main__": - args = _parse_args() - asyncio.run( - rnaseq_gen( - config_filename=args.config_file, - replicate_ratio=args.replicate_ratio, - batch_ratio=args.batch_ratio, - high_replicate_ratio=args.high_replicate_ratio, - high_batch_ratio=args.high_batch_ratio, - technique=args.filtering_technique, - cut_off=args.minimum_cutoff, - prep=args.library_prep, - taxon_id=args.taxon, - ) + cut_off=cutoff, ) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index ae8bcdea..02702b38 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -138,16 +138,18 @@ def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]: # For each study, collect gene count files, fragment files, insert size files, layouts, and strandedness information study_metrics: list[_StudyMetrics] = [] for gene_dir, strand_dir in zip(gene_counts_directories, strandedness_directories): - if gene_dir.stem != strand_dir.stem: - raise ValueError( - f"Gene directory name of '{gene_dir.stem}' does not match stranded directory name of '{strand_dir.stem}'" # noqa: E501 - ) + count_files = list(gene_dir.glob("*.tab")) + strand_files = list(strand_dir.glob("*.txt")) + if len(count_files) == 0: + raise ValueError(f"No count files found for study '{gene_dir.stem}'.") + if len(strand_files) == 0: + raise ValueError(f"No strandedness files found for study '{gene_dir.stem}'.") study_metrics.append( _StudyMetrics( study_name=gene_dir.stem, - count_files=list(gene_dir.glob("*.tab")), - strand_files=list(strand_dir.glob("*.txt")), + count_files=count_files, + strand_files=strand_files, ) ) return study_metrics @@ -262,6 +264,7 @@ async def _write_counts_matrix( counts: list[pd.DataFrame] = await asyncio.gather( *[_create_sample_counts_matrix(metric) for metric in study_metrics] ) + final_matrix = pd.DataFrame() for count in counts: final_matrix = count if final_matrix.empty else pd.merge(final_matrix, count, on="ensembl_gene_id", how="outer") @@ -275,13 +278,28 @@ async def _write_counts_matrix( return final_matrix -async def _create_config_df(context_name: str, /, como_input_dir: Path) -> pd.DataFrame: # noqa: C901 +async def _create_config_df( # noqa: C901 + context_name: str, + /, + como_context_dir: Path, + gene_count_dirname: str = "geneCounts", + layout_dirname: str = "layouts", + strandedness_dirname: str = "strandedness", + fragment_sizes_dirname: str = "fragmentSizes", + prep_method_dirname: str = "prepMethods", +) -> pd.DataFrame: """Create configuration sheet. The configuration file created is based on the gene counts matrix. If using zFPKM normalization technique, mean fragment lengths will be fetched """ - gene_counts_files = list(Path(como_input_dir, context_name, "geneCounts").rglob("*.tab")) + gene_counts_dir = como_context_dir / gene_count_dirname + layout_dir = como_context_dir / layout_dirname + strandedness_dir = como_context_dir / strandedness_dirname + fragment_sizes_dir = como_context_dir / fragment_sizes_dirname + prep_method_dir = como_context_dir / prep_method_dirname + + gene_counts_files = list(gene_counts_dir.rglob("*.tab")) sample_names: list[str] = [] fragment_lengths: list[int | float] = [] layouts: list[str] = [] @@ -289,52 +307,51 @@ async def _create_config_df(context_name: str, /, como_input_dir: Path) -> pd.Da groups: list[str] = [] preparation_method: list[str] = [] + if len(gene_counts_files) == 0: + raise FileNotFoundError(f"No gene count files found in '{gene_counts_dir}'.") + for gene_count_filename in sorted(gene_counts_files): - try: - # Match S___R___r___ - # \d{1,3} matches 1-3 digits - # (?:r\d{1,3})? matches an option "r" followed by three digits - label = re.findall(r"S\d{1,3}R\d{1,3}(?:r\d{1,3})?", gene_count_filename.as_posix())[0] - - except IndexError as e: - raise IndexError( + # Match S___R___r___ + # \d{1,3} matches 1-3 digits + # (?:r\d{1,3})? optionally matches a "r" followed by three digits + label = re.findall(r"S\d{1,3}R\d{1,3}(?:r\d{1,3})?", gene_count_filename.as_posix())[0] + if not label: + raise ValueError( f"\n\nFilename of '{gene_count_filename}' is not valid. " f"Should be 'contextName_SXRYrZ.tab', where X is the study/batch number, Y is the replicate number, " f"and Z is the run number." "\n\nIf not a multi-run sample, exclude 'rZ' from the filename." - ) from e + ) study_number = re.findall(r"S\d{1,3}", label)[0] rep_number = re.findall(r"R\d{1,3}", label)[0] - run = re.findall(r"r\d{1,3}", label) + run_number = re.findall(r"r\d{1,3}", label) multi_flag = 0 - if len(run) > 0: - if run[0] != "r1": + if len(run_number) > 0: + if run_number[0] != "r1": continue - else: - label_glob = study_number + rep_number + "r*" - runs = [run for run in gene_counts_files if re.search(label_glob, run.as_posix())] - multi_flag = 1 - frag_files = [] - - for r in runs: - r_label = re.findall(r"r\d{1,3}", r.as_posix())[0] - R_label = re.findall(r"R\d{1,3}", r.as_posix())[0] # noqa: N806 - frag_filename = "".join([context_name, "_", study_number, R_label, r_label, "_fragment_size.txt"]) - frag_files.append(como_input_dir / context_name / "fragmentSizes" / study_number / frag_filename) - - context_path = como_input_dir / context_name - layout_files: list[Path] = list((context_path / "layouts").rglob(f"{context_name}_{label}_layout.txt")) - strand_files: list[Path] = list((context_path / "strandedness").rglob(f"{context_name}_{label}_strandedness.txt")) # fmt: skip # noqa: E501 - frag_files: list[Path] = list((context_path / "fragmentSizes").rglob(f"{context_name}_{label}_fragment_size.txt")) # fmt: skip # noqa: E501 - prep_files: list[Path] = list((context_path / "prepMethods").rglob(f"{context_name}_{label}_prep_method.txt")) + label_glob = f"{study_number}{rep_number}r*" # S__R__r* + runs = [run for run in gene_counts_files if re.search(label_glob, run.as_posix())] + multi_flag = 1 + frag_files = [] + + for run in runs: + run_number = re.findall(r"R\d{1,3}", run.as_posix())[0] + replicate = re.findall(r"r\d{1,3}", run.as_posix())[0] + frag_filename = "".join([context_name, "_", study_number, run_number, replicate, "_fragment_size.txt"]) + frag_files.append(como_context_dir / fragment_sizes_dirname / study_number / frag_filename) + + layout_files: list[Path] = list(layout_dir.rglob(f"{context_name}_{label}_layout.txt")) + strand_files: list[Path] = list(strandedness_dir.rglob(f"{context_name}_{label}_strandedness.txt")) + frag_files: list[Path] = list(fragment_sizes_dir.rglob(f"{context_name}_{label}_fragment_size.txt")) + prep_files: list[Path] = list(prep_method_dir.rglob(f"{context_name}_{label}_prep_method.txt")) layout = "UNKNOWN" if len(layout_files) == 0: logger.warning( f"No layout file found for {label}, writing as 'UNKNOWN', " - f"this should be defined by user if using zFPKM or rnaseq_gen.py will not run" + f"this should be defined if you are using zFPKM or downstream 'rnaseq_gen.py' will not run" ) elif len(layout_files) == 1: with layout_files[0].open("r") as file: @@ -363,7 +380,7 @@ async def _create_config_df(context_name: str, /, como_input_dir: Path) -> pd.Da prep = "total" if len(prep_files) == 0: - logger.warning(f"No prep file found for {label}, assuming 'total' as in Total RNA library preparation") + logger.warning(f"No prep file found for {label}, assuming 'total', as in 'Total RNA' library preparation") elif len(prep_files) == 1: with prep_files[0].open("r") as file: prep = file.read().strip().lower() @@ -376,10 +393,10 @@ async def _create_config_df(context_name: str, /, como_input_dir: Path) -> pd.Da ) mean_fragment_size = 100 - if len(frag_files) == 0: + if len(frag_files) == 0 and prep != RNAPrepMethod.TOTAL.value: logger.warning( f"No fragment file found for {label}, using '100'. " - f"This must be defined by the user in order to use zFPKM normalization" + "You should define this if you are going to use downstream zFPKM normalization" ) elif len(frag_files) == 1: if layout == "single-end": @@ -448,11 +465,11 @@ async def read_counts(file: Path) -> list[str]: ) return conversion["entrez_gene_id"].tolist() - logger.info("Fetching gene info (this may take 1-5 minutes)") + logger.info( + "Fetching gene info (this may take 1-5 minutes depending on the number of genes and your internet connection)" + ) genes = set(chain.from_iterable(await asyncio.gather(*[read_counts(f) for f in counts_matrix_filepaths]))) - - mygene = MyGene(cache=cache) - gene_data = await mygene.query(items=list(genes), taxon=taxon, scopes="entrezgene") + gene_data = await MyGene(cache=cache).query(items=list(genes), taxon=taxon, scopes="entrezgene") gene_info: pd.DataFrame = pd.DataFrame( data=None, columns=pd.Index(data=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "start_position", "end_position"]), @@ -495,7 +512,7 @@ async def _create_matrix_file( output_counts_matrix_filepath: Path, rna: type_rna, ) -> None: - config_df = await _create_config_df(context_name, como_input_dir=como_context_dir) + config_df = await _create_config_df(context_name, como_context_dir=como_context_dir) await _write_counts_matrix( config_df=config_df, como_context_dir=como_context_dir, @@ -523,7 +540,7 @@ async def _process( if output_trna_config_filepath: rna_types.append(("total", output_trna_config_filepath, output_trna_matrix_filepath)) if output_mrna_config_filepath: - rna_types.append(("polya", output_mrna_config_filepath, output_mrna_matrix_filepath)) + rna_types.append(("mrna", output_mrna_config_filepath, output_mrna_matrix_filepath)) # if provided, iterate through como-input specific directories tasks = [] @@ -563,9 +580,9 @@ async def rnaseq_preprocess( input_matrix_filepath: Path | list[Path] | None = None, preparation_method: RNAPrepMethod | list[RNAPrepMethod] | None = None, output_trna_config_filepath: Path | None = None, - output_polya_config_filepath: Path | None = None, + output_mrna_config_filepath: Path | None = None, output_trna_count_matrix_filepath: Path | None = None, - output_polya_count_matrix_filepath: Path | None = None, + output_mrna_count_matrix_filepath: Path | None = None, cache: bool = True, log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", log_location: str | TextIOWrapper = sys.stderr, @@ -579,9 +596,9 @@ async def rnaseq_preprocess( :param taxon: The NCBI taxonomy ID :param output_gene_info_filepath: Path to the output gene information CSV file :param output_trna_config_filepath: Path to the output tRNA config file (if in "create" mode) - :param output_polya_config_filepath: Path to the output mRNA config file (if in "create" mode) + :param output_mrna_config_filepath: Path to the output mRNA config file (if in "create" mode) :param output_trna_count_matrix_filepath: The path to write total RNA count matrices - :param output_polya_count_matrix_filepath: The path to write messenger RNA count matrices + :param output_mrna_count_matrix_filepath: The path to write messenger RNA count matrices :param como_context_dir: If in "create" mode, the input path(s) to the COMO_input directory of the current context i.e., the directory containing "fragmentSizes", "geneCounts", "insertSizeMetrics", etc. directories :param input_matrix_filepath: If in "provide" mode, the path(s) to the count matrices to be processed @@ -604,18 +621,18 @@ async def rnaseq_preprocess( output_trna_config_filepath = ( output_trna_config_filepath.resolve() if output_trna_config_filepath else output_trna_config_filepath ) - output_polya_config_filepath = ( - output_polya_config_filepath.resolve() if output_polya_config_filepath else output_polya_config_filepath + output_mrna_config_filepath = ( + output_mrna_config_filepath.resolve() if output_mrna_config_filepath else output_mrna_config_filepath ) output_trna_count_matrix_filepath = ( output_trna_count_matrix_filepath.resolve() if output_trna_count_matrix_filepath else output_trna_count_matrix_filepath ) - output_polya_count_matrix_filepath = ( - output_polya_count_matrix_filepath.resolve() - if output_polya_count_matrix_filepath - else output_polya_count_matrix_filepath + output_mrna_count_matrix_filepath = ( + output_mrna_count_matrix_filepath.resolve() + if output_mrna_count_matrix_filepath + else output_mrna_count_matrix_filepath ) input_matrix_filepath = _listify(input_matrix_filepath) @@ -633,8 +650,8 @@ async def rnaseq_preprocess( input_matrix_filepath=input_matrix_filepath, output_gene_info_filepath=output_gene_info_filepath, output_trna_config_filepath=output_trna_config_filepath, - output_mrna_config_filepath=output_polya_config_filepath, + output_mrna_config_filepath=output_mrna_config_filepath, output_trna_matrix_filepath=output_trna_count_matrix_filepath, - output_mrna_matrix_filepath=output_polya_count_matrix_filepath, + output_mrna_matrix_filepath=output_mrna_count_matrix_filepath, cache=cache, ) diff --git a/main/como/types.py b/main/como/types.py index ebe44ecf..916d3a25 100644 --- a/main/como/types.py +++ b/main/como/types.py @@ -46,5 +46,30 @@ def from_string(value: str) -> RNAPrepMethod: raise ValueError(f"Filtering technique must be one of {possible_values}; got: {value}") +class FilteringTechnique(Enum): + """RNA sequencing filtering capabilities.""" + + cpm = "cpm" + zfpkm = "zfpkm" + tpm = "quantile" + umi = "umi" + + @staticmethod + def from_string(value: str) -> FilteringTechnique: + """Create a filtering technique object from a string.""" + match value.lower(): + case "cpm": + return FilteringTechnique.cpm + case "zfpkm": + return FilteringTechnique.zfpkm + case "quantile": + return FilteringTechnique.tpm + case "umi": + return FilteringTechnique.umi + case _: + possible_values = [t.value for t in FilteringTechnique] + raise ValueError(f"Got a filtering technique of '{value}'; should be one of: {possible_values}") + + type_path = str | Path -type_rna = Literal["total", "polya"] +type_rna = Literal["total", "mrna"] diff --git a/main/data/boundary_rxns/naiveB_boundary_rxns.csv b/main/data/boundary_rxns/naiveB_boundary_rxns.csv index 7130c9be..0b08556e 100644 --- a/main/data/boundary_rxns/naiveB_boundary_rxns.csv +++ b/main/data/boundary_rxns/naiveB_boundary_rxns.csv @@ -1,4 +1,4 @@ -Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate +Boundary,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate Exchange,glc_D,Extracellular,-100,1000 Exchange,fe2,Extracellular,-1000,1000 Exchange,gal,Extracellular,0,1000 diff --git a/main/data/boundary_rxns/smB_boundary_rxns.csv b/main/data/boundary_rxns/smB_boundary_rxns.csv index 9f0f9385..0b08556e 100644 --- a/main/data/boundary_rxns/smB_boundary_rxns.csv +++ b/main/data/boundary_rxns/smB_boundary_rxns.csv @@ -1,4 +1,4 @@ -Reaction,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate +Boundary,Abbreviation,Compartment,Minimum Reaction Rate,Maximum Reaction Rate Exchange,glc_D,Extracellular,-100,1000 Exchange,fe2,Extracellular,-1000,1000 Exchange,gal,Extracellular,0,1000 @@ -67,4 +67,4 @@ Exchange,orn,Extracellular,-1,1000 Exchange,orn_D,Extracellular,-1,1000 Exchange,so4,Extracellular,-1000,1000 Exchange,ribflv,Extracellular,-1,1000 -Exchange,Lcystin,Extracellular,-1,1000 +Exchange,Lcystin,Extracellular,-1,1000 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 85a1308f..fabd6c69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires-python = ">=3.10,<3.13" dependencies = [ "aiofiles>=24.1.0", "aioftp>=0.23.1", - "cobamp", + "cobamp@git+https://github.com/JoshLoecker/cobamp@master", "cobra>=0.28.0", "fast-bioservices>=0.3.9", "gurobipy>=11.0", @@ -20,7 +20,7 @@ dependencies = [ "scikit-learn>=1.5.2", "scipy>=1.7.3", "setuptools<60.0", - "troppo", + "troppo@git+https://github.com/JoshLoecker/troppo@master", ] [project.optional-dependencies] @@ -57,14 +57,9 @@ dev-dependencies = [ "cz-conventional-gitmoji>=0.6.1", ] -[tool.uv.sources] -troppo = { git = "https://github.com/JoshLoecker/troppo", rev = "update_dependencies" } -cobamp = { git = "https://github.com/JoshLoecker/cobamp", rev = "update_packages" } -pydantic-settings = { git = "https://github.com/pydantic/pydantic-settings" } - [tool.commitizen] name = "cz_conventional_commits" tag_format = "$version" version_scheme = "semver2" version_provider = "pep621" -update_changelog_on_bump = true +update_changelog_on_bump = true \ No newline at end of file