getwilds
diff --git a/modules/ww-ena/README.md b/modules/ww-ena/README.md
Lines changed: 6 additions & 5 deletions
diff --git a/modules/ww-ena/ww-ena.wdl b/modules/ww-ena/ww-ena.wdl
Lines changed: 55 additions & 42 deletions
diff --git a/modules/ww-salmon/README.md b/modules/ww-salmon/README.md
Lines changed: 3 additions & 2 deletions
diff --git a/modules/ww-salmon/testrun.wdl b/modules/ww-salmon/testrun.wdl
Lines changed: 27 additions & 32 deletions
diff --git a/modules/ww-salmon/ww-salmon.wdl b/modules/ww-salmon/ww-salmon.wdl
Lines changed: 13 additions & 6 deletions
diff --git a/modules/ww-star/README.md b/modules/ww-star/README.md
Lines changed: 19 additions & 6 deletions
@@ -66,17 +66,18 @@ Downloads sequencing data files from ENA using a search query. Allows filtering
 
 ### `extract_fastq_pairs`
 
-Extracts R1 and R2 FASTQ files from downloaded ENA files for downstream paired-end processing. This task identifies all paired-end FASTQ files by common naming patterns, creates standardized outputs, and automatically extracts the accession ID from each filename. Supports multiple accessions in a single download.
+Extracts FASTQ files from downloaded ENA files for downstream processing. Supports both paired-end and single-end data. This task identifies FASTQ files by common naming patterns, creates standardized outputs, automatically extracts the accession ID from each filename, and detects whether each sample is paired-end or single-end. Supports multiple accessions in a single download.
 
 **Inputs:**
 - `downloaded_files` (Array[File]): Array of files downloaded from ENA (typically from `download_files` task)
 
 **Outputs:**
-- `r1_files` (Array[File]): Array of Read 1 FASTQ files, parallel with `r2_files` and `accessions`
-- `r2_files` (Array[File]): Array of Read 2 FASTQ files, parallel with `r1_files` and `accessions`
-- `accessions` (Array[String]): Array of ENA accession IDs extracted from filenames, parallel with `r1_files` and `r2_files`
+- `r1_files` (Array[File]): Array of Read 1 FASTQ files, parallel with `r2_files`, `accessions`, and `is_paired_end_list`
+- `r2_files` (Array[File]): Array of Read 2 FASTQ files (empty placeholder for single-end samples), parallel with `r1_files`, `accessions`, and `is_paired_end_list`
+- `accessions` (Array[String]): Array of ENA accession IDs extracted from filenames, parallel with `r1_files`, `r2_files`, and `is_paired_end_list`
+- `is_paired_end_list` (Array[String]): Array of strings ("true"/"false") indicating whether each sample is paired-end, parallel with `r1_files`, `r2_files`, and `accessions`
 
-**Usage Note:** This task is designed for FASTQ workflows requiring separate R1/R2 files. It searches for common paired-end naming patterns including `_1.fastq.gz`/`_2.fastq.gz`, `_R1.fastq.gz`/`_R2.fastq.gz`, and their uncompressed equivalents. The accession ID is automatically extracted from each filename (e.g., `ERR000001_1.fastq.gz` → `ERR000001`). The output arrays are parallel, meaning `r1_files[i]`, `r2_files[i]`, and `accessions[i]` all correspond to the same sample. If you're downloading other file formats (BAM, analysis files), you don't need this task.
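+**Usage Note:** This task is designed for FASTQ workflows, both paired-end and single-end. It searches for common paired-end naming patterns including `_1.fastq.gz`/`_2.fastq.gz`, `_R1.fastq.gz`/`_R2.fastq.gz`, and their uncompressed equivalents. An R1 file with no matching R2 is treated as single-end, and if no R1-style names are found at all, every FASTQ file in the download is treated as a single-end sample. The accession ID is automatically extracted from each filename (e.g., `ERR000001_1.fastq.gz` → `ERR000001`, or `ERR000001.fastq.gz` → `ERR000001` for single-end). The output arrays are parallel, meaning `r1_files[i]`, `r2_files[i]`, `accessions[i]`, and `is_paired_end_list[i]` all correspond to the same sample; for single-end samples `r2_files[i]` is an empty placeholder file and `is_paired_end_list[i]` is `"false"`. If you're downloading other file formats (BAM, analysis files), you don't need this task.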
 
 ## Usage as a Module
 
 
@@ -177,12 +177,13 @@ task extract_fastq_pairs {
   meta {
     author: "Taylor Firman"
     email: "tfirman@fredhutch.org"
-    description: "Extract R1 and R2 FASTQ file pairs from ENA downloads for downstream processing."
+    description: "Extract FASTQ files from ENA downloads for downstream processing. Supports both paired-end and single-end data."
     url: "https://raw.githubusercontent.com/getwilds/wilds-wdl-library/refs/heads/main/modules/ww-ena/ww-ena.wdl"
     outputs: {
-        r1_files: "Array of Read 1 FASTQ files, parallel with r2_files and accessions",
-        r2_files: "Array of Read 2 FASTQ files, parallel with r1_files and accessions",
-        accessions: "Array of ENA accession IDs extracted from filenames, parallel with r1_files and r2_files"
+        r1_files: "Array of Read 1 FASTQ files, parallel with r2_files, accessions, and is_paired_end_list",
+        r2_files: "Array of Read 2 FASTQ files (empty file for single-end samples), parallel with r1_files and accessions",
+        accessions: "Array of ENA accession IDs extracted from filenames, parallel with r1_files and r2_files",
+        is_paired_end_list: "Array of 'true'/'false' strings indicating whether each sample is paired-end, parallel with r1_files, r2_files, and accessions"
     }
   }
 
@@ -208,55 +209,67 @@ task extract_fastq_pairs {
     # Look for patterns: *_1.fastq.gz, *_R1.fastq.gz, *_1.fq.gz, *_R1.fq.gz, etc.
     R1_FILES=$(ls ~{sep=' ' downloaded_files} | grep -E "(_1\.fastq|_R1\.fastq|_1\.fq|_R1\.fq)" | sort || echo "")
 
-    if [ -z "$R1_FILES" ]; then
-      echo "ERROR: Could not identify any R1 FASTQ files"
-      echo "Looking for files matching pattern *_1.fastq.gz or *_R1.fastq.gz"
-      echo "Available files:"
-      ls -lh ~{sep=' ' downloaded_files}
-      exit 1
-    fi
-
     # Initialize output files
     > r1_files.txt
     > r2_files.txt
     > accessions.txt
+    > is_paired_end.txt
+
+    if [ -n "$R1_FILES" ]; then
+      # R1-style names found: process each R1 file, pairing it with its R2 when one exists (otherwise single-end)
+      echo "$R1_FILES" | while read R1_FILE; do
+        echo "Processing R1: $R1_FILE"
+
+        # Extract accession ID from filename (everything before _1 or _R1)
+        BASENAME=$(basename "$R1_FILE")
+        ACCESSION=$(echo "$BASENAME" | sed -E 's/(_1\.fastq|_R1\.fastq|_1\.fq|_R1\.fq).*//')
+        echo "Extracted accession: $ACCESSION"
+
+        # Find matching R2 file
+        R2_FILE=$(ls ~{sep=' ' downloaded_files} | grep -E "^.*${ACCESSION}(_2\.fastq|_R2\.fastq|_2\.fq|_R2\.fq)" | head -1 || echo "")
+
+        if [ -z "$R2_FILE" ]; then
+          echo "WARNING: No matching R2 file for accession: $ACCESSION, treating as single-end"
+          cp "$R1_FILE" "fastq_pairs/${ACCESSION}_r1.fastq.gz"
+          touch "fastq_pairs/${ACCESSION}_r2.fastq.gz"
+          echo "false" >> is_paired_end.txt
+        else
+          echo "Matched R2: $R2_FILE"
+          cp "$R1_FILE" "fastq_pairs/${ACCESSION}_r1.fastq.gz"
+          cp "$R2_FILE" "fastq_pairs/${ACCESSION}_r2.fastq.gz"
+          echo "true" >> is_paired_end.txt
+        fi
+
+        echo "fastq_pairs/${ACCESSION}_r1.fastq.gz" >> r1_files.txt
+        echo "fastq_pairs/${ACCESSION}_r2.fastq.gz" >> r2_files.txt
+        echo "$ACCESSION" >> accessions.txt
+      done
+    else
+      # Single-end: no R1 pattern found, treat all FASTQ files as single-end reads
+      echo "No paired-end naming pattern found, treating files as single-end"
+      for FASTQ_FILE in $(ls ~{sep=' ' downloaded_files} | grep -E "\.(fastq|fq)" | sort); do
+        BASENAME=$(basename "$FASTQ_FILE")
+        ACCESSION=$(echo "$BASENAME" | sed -E 's/\.(fastq|fq).*//')
+        echo "Processing single-end: $ACCESSION"
+
+        cp "$FASTQ_FILE" "fastq_pairs/${ACCESSION}_r1.fastq.gz"
+        touch "fastq_pairs/${ACCESSION}_r2.fastq.gz"
+
+        echo "fastq_pairs/${ACCESSION}_r1.fastq.gz" >> r1_files.txt
+        echo "fastq_pairs/${ACCESSION}_r2.fastq.gz" >> r2_files.txt
+        echo "$ACCESSION" >> accessions.txt
+        echo "false" >> is_paired_end.txt
+      done
+    fi
 
-    # Process each R1 file and find its matching R2
-    echo "$R1_FILES" | while read R1_FILE; do
-      echo "Processing R1: $R1_FILE"
-
-      # Extract accession ID from filename (everything before _1 or _R1)
-      BASENAME=$(basename "$R1_FILE")
-      ACCESSION=$(echo "$BASENAME" | sed -E 's/(_1\.fastq|_R1\.fastq|_1\.fq|_R1\.fq).*//')
-      echo "Extracted accession: $ACCESSION"
-
-      # Find matching R2 file
-      R2_FILE=$(ls ~{sep=' ' downloaded_files} | grep -E "^.*${ACCESSION}(_2\.fastq|_R2\.fastq|_2\.fq|_R2\.fq)" | head -1 || echo "")
-
-      if [ -z "$R2_FILE" ]; then
-        echo "ERROR: Could not find matching R2 file for accession: $ACCESSION"
-        exit 1
-      fi
-
-      echo "Matched R2: $R2_FILE"
-
-      # Copy files with accession-prefixed names to maintain uniqueness
-      cp "$R1_FILE" "fastq_pairs/${ACCESSION}_r1.fastq.gz"
-      cp "$R2_FILE" "fastq_pairs/${ACCESSION}_r2.fastq.gz"
-
-      # Append to output lists
-      echo "fastq_pairs/${ACCESSION}_r1.fastq.gz" >> r1_files.txt
-      echo "fastq_pairs/${ACCESSION}_r2.fastq.gz" >> r2_files.txt
-      echo "$ACCESSION" >> accessions.txt
-    done
-
-    echo "Successfully processed $(wc -l < accessions.txt) FASTQ pairs"
+    echo "Successfully processed $(wc -l < accessions.txt) FASTQ sample(s)"
   >>>
 
   output {
     Array[File] r1_files = read_lines("r1_files.txt")
     Array[File] r2_files = read_lines("r2_files.txt")
     Array[String] accessions = read_lines("accessions.txt")
+    Array[String] is_paired_end_list = read_lines("is_paired_end.txt")
   }
 
   runtime {
 
@@ -32,13 +32,13 @@ Builds Salmon index from a reference transcriptome FASTA file.
 - `salmon_index` (File): Compressed tarball containing the Salmon index for quantification
 
 ### `quantify`
-Quantifies transcript expression from paired-end RNA-seq reads using Salmon.
+Quantifies transcript expression from RNA-seq reads using Salmon. Supports both paired-end and single-end data.
 
 **Inputs:**
 - `salmon_index_dir` (File): Compressed tarball containing Salmon index from `build_index`
 - `sample_name` (String): Sample name identifier for output files
 - `fastq_r1` (File): FASTQ file for read 1
-- `fastq_r2` (File): FASTQ file for read 2
+- `fastq_r2` (File?, optional): FASTQ file for read 2 (omit for single-end data)
 - `cpu_cores` (Int): Number of CPU cores (default: 8)
 - `memory_gb` (Int): Memory allocation in GB (default: 16)
 
@@ -182,6 +182,7 @@ The module supports flexible resource configuration:
 
 ## Features
 
+- **Single-end and paired-end support**: Works with both single-end and paired-end sequencing data
 - **Multi-sample support**: Process multiple samples in parallel using scatter-gather patterns
 - **Result aggregation**: Automatically merge quantification results across samples into TPM and count matrices
 - **Module integration**: Seamlessly combines with ww-sra, ww-testdata, and ww-deseq2 modules
 
@@ -1,15 +1,11 @@
 version 1.0
 
 # Import module in question as well as the testdata module for automatic demo functionality
-import "https://raw.githubusercontent.com/getwilds/wilds-wdl-library/refs/heads/main/modules/ww-salmon/ww-salmon.wdl" as ww_salmon
-import "https://raw.githubusercontent.com/getwilds/wilds-wdl-library/refs/heads/main/modules/ww-testdata/ww-testdata.wdl" as ww_testdata
-
-# Define data structure for sample inputs
-struct SalmonSample {
-    String name
-    File r1_fastq
-    File r2_fastq
-}
+# import "https://raw.githubusercontent.com/getwilds/wilds-wdl-library/refs/heads/main/modules/ww-salmon/ww-salmon.wdl" as ww_salmon
+# import "https://raw.githubusercontent.com/getwilds/wilds-wdl-library/refs/heads/main/modules/ww-testdata/ww-testdata.wdl" as ww_testdata
+import "ww-salmon.wdl" as ww_salmon
+import "../ww-testdata/ww-testdata.wdl" as ww_testdata
+
 
 #### TEST WORKFLOW DEFINITION ####
 # Define test workflow to demonstrate module functionality
@@ -26,45 +22,44 @@ workflow salmon_example {
       memory_gb = 8
   }
 
-  # Create samples array using test data
-  Array[SalmonSample] final_samples = [
-    {
-      "name": "demo_sample",
-      "r1_fastq": download_demo_data.r1_fastq,
-      "r2_fastq": download_demo_data.r2_fastq
-    }
-  ]
-
-  # Quantify each sample
-  scatter (sample in final_samples) {
-    call ww_salmon.quantify { input:
-        salmon_index_dir = build_index.salmon_index,
-        sample_name = sample.name,
-        fastq_r1 = sample.r1_fastq,
-        fastq_r2 = sample.r2_fastq,
-        cpu_cores = 2,
-        memory_gb = 8
-    }
+  # Paired-end quantification
+  call ww_salmon.quantify as quantify_paired { input:
+      salmon_index_dir = build_index.salmon_index,
+      sample_name = "demo_paired",
+      fastq_r1 = download_demo_data.r1_fastq,
+      fastq_r2 = download_demo_data.r2_fastq,
+      cpu_cores = 2,
+      memory_gb = 8
+  }
+
+  # Single-end quantification (using R1 only)
+  call ww_salmon.quantify as quantify_single { input:
+      salmon_index_dir = build_index.salmon_index,
+      sample_name = "demo_single",
+      fastq_r1 = download_demo_data.r1_fastq,
+      cpu_cores = 2,
+      memory_gb = 8
   }
 
   # Merge results
   call ww_salmon.merge_results { input:
-      salmon_quant_dirs = quantify.salmon_quant_dir,
-      sample_names = quantify.output_sample_name,
+      salmon_quant_dirs = [quantify_paired.salmon_quant_dir, quantify_single.salmon_quant_dir],
+      sample_names = [quantify_paired.output_sample_name, quantify_single.output_sample_name],
       cpu_cores = 1,
       memory_gb = 4
   }
 
   # Validate outputs
   call validate_outputs { input:
-      salmon_quant_dirs = quantify.salmon_quant_dir,
+      salmon_quant_dirs = [quantify_paired.salmon_quant_dir, quantify_single.salmon_quant_dir],
       tpm_matrix = merge_results.tpm_matrix,
       counts_matrix = merge_results.counts_matrix
   }
 
   output {
     File salmon_index_tar = build_index.salmon_index
-    Array[File] salmon_quant_dirs = quantify.salmon_quant_dir
+    File salmon_quant_paired = quantify_paired.salmon_quant_dir
+    File salmon_quant_single = quantify_single.salmon_quant_dir
     File merged_tpm_matrix = merge_results.tpm_matrix
     File merged_counts_matrix = merge_results.counts_matrix
     File sample_list = merge_results.sample_list
 
@@ -66,7 +66,7 @@ task quantify {
   meta {
     author: "WILDS Team"
     email: "wilds@fredhutch.org"
-    description: "Quantify transcript expression from paired-end RNA-seq reads using Salmon"
+    description: "Quantify transcript expression from RNA-seq reads using Salmon. Supports both paired-end and single-end data."
     url: "https://raw.githubusercontent.com/getwilds/wilds-wdl-library/refs/heads/main/modules/ww-salmon/ww-salmon.wdl"
     outputs: {
         salmon_quant_dir: "Compressed tarball containing Salmon quantification results including abundance estimates",
@@ -78,7 +78,7 @@ task quantify {
     salmon_index_dir: "Compressed tarball containing Salmon genome index"
     sample_name: "Name identifier for the sample"
     fastq_r1: "FASTQ file for read 1"
-    fastq_r2: "FASTQ file for read 2"
+    fastq_r2: "Optional FASTQ file for read 2 (omit for single-end data)"
     cpu_cores: "Number of CPU cores allocated for the task"
     memory_gb: "Memory allocated for the task in GB"
   }
@@ -87,7 +87,7 @@ task quantify {
     File salmon_index_dir
     String sample_name
     File fastq_r1
-    File fastq_r2
+    File? fastq_r2
     Int cpu_cores = 8
     Int memory_gb = 16
   }
@@ -99,12 +99,19 @@ task quantify {
     mkdir -p salmon_index
     tar -xzf ~{salmon_index_dir} -C ./
 
-    # Paired-end quantification with best practice parameters
+    # Build read arguments based on paired-end or single-end data
+    R2_FILE="~{if defined(fastq_r2) then select_first([fastq_r2]) else ""}"
+    if [ -n "$R2_FILE" ]; then
+      READ_ARGS="-1 ~{fastq_r1} -2 $R2_FILE"
+    else
+      READ_ARGS="-r ~{fastq_r1}"
+    fi
+
+    # Quantification with best practice parameters
     salmon quant \
         -i salmon_index \
         --libType A \
-        -1 ~{fastq_r1} \
-        -2 ~{fastq_r2} \
+        $READ_ARGS \
         -o ~{sample_name}_quant \
         -p ~{cpu_cores} \
         --validateMappings \
 
@@ -38,7 +38,7 @@ Performs RNA-seq alignment using STAR's two-pass methodology.
 **Inputs:**
 - `star_genome_tar` (File): STAR genome index from `build_index`
 - `r1` (File): R1 FASTQ file
-- `r2` (File): R2 FASTQ file
+- `r2` (File?, optional): R2 FASTQ file (omit for single-end data)
 - `name` (String): Sample name for output files
 - `sjdb_overhang` (Int): Length of genomic sequence around junctions (default: 100)
 - `memory_gb` (Int): Memory allocation in GB (default: 62)
@@ -71,13 +71,13 @@ workflow my_rna_seq_pipeline {
     File reference_fasta
     File reference_gtf
   }
-  
+
   call star_tasks.build_index {
     input:
       reference_fasta = reference_fasta,
       reference_gtf = reference_gtf
   }
-  
+
   scatter (sample in samples) {
     call star_tasks.align_two_pass {
       input:
@@ -87,14 +87,25 @@ workflow my_rna_seq_pipeline {
         name = sample.name
     }
   }
-  
+
   output {
     Array[File] aligned_bams = align_two_pass.bam
     Array[File] gene_counts = align_two_pass.gene_counts
   }
 }
 ```
 
+**Single-end data:**
+```wdl
+# Simply omit the r2 parameter for single-end samples
+call star_tasks.align_two_pass {
+  input:
+    star_genome_tar = build_index.star_index_tar,
+    r1 = my_single_end_fastq,
+    name = "my_sample"
+}
+```
+
 ### Advanced Usage Examples
 
 **Custom splice junction parameters:**
@@ -143,8 +154,9 @@ The test workflow (`star_example`) automatically:
 1. Downloads reference genome data using `ww-testdata`
 2. Downloads demonstration FASTQ data using `ww-testdata`
 3. Builds STAR genome index
-4. Performs RNA-seq alignment using STAR two-pass methodology
-5. Validates all outputs
+4. Performs paired-end RNA-seq alignment using STAR two-pass methodology
+5. Performs single-end RNA-seq alignment using STAR two-pass methodology
+6. Validates all outputs (both paired-end and single-end)
 
 ## Configuration Guidelines
 
@@ -177,6 +189,7 @@ The module supports flexible resource configuration:
 ## Features
 
 - **Two-pass methodology**: Optimal splice junction detection using STAR's two-pass approach
+- **Single-end and paired-end support**: Works with both single-end and paired-end sequencing data
 - **Comprehensive outputs**: BAM files, gene counts, splice junctions, and detailed logs
 - **Multi-sample support**: Process multiple samples in parallel
 - **Module integration**: Seamlessly combines with ww-sra and ww-testdata