EIT-Pathogena · JeremyWesthead · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -13,7 +13,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-22.04, macos-14]
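+        # NOTE(review): ubuntu-22.04 is dropped from the matrix here (was: os: [ubuntu-22.04, macos-14]) — confirm this is temporary and restore it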
+        os: [macos-14]
         python-version: ['3.10']
     name: Python ${{ matrix.python-version }} (${{ matrix.os }})
     steps:
@@ -28,9 +29,9 @@ jobs:
         shell: bash -el {0}
         run: |
           if [ "${{ matrix.os }}" == "ubuntu-22.04" ]; then
-              conda create -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }}
+              conda create -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }} rust
           elif [ "${{ matrix.os }}" == "macos-14" ]; then
-              conda create --platform osx-64 -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }}
+              conda create --platform osx-64 -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }} rust
           fi
           conda activate pathogena-test
           pip install '.[dev]'

diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
   "pydantic>=2.6.2,<3",
   "tenacity==8.2.3",
   "click>=8.1.7",
+  "fastq-validation==1.0.5",
 ]
 
 [project.scripts]

diff --git a/src/pathogena/models.py b/src/pathogena/models.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 from typing import Any, Literal, Optional
 
+import fastq_validation
 from pydantic import BaseModel, Field, model_validator
 
 from pathogena import util
@@ -168,30 +169,47 @@ def validate_reads_from_fastq(self) -> None:
         """
         reads = self.get_read_paths()
         logging.info("Performing FastQ checks and gathering total reads")
-        valid_lines_per_read = 4
-        self.reads_in = 0
-        for read in reads:
-            logging.info(f"Calculating read count in: {read}")
-            if read.suffix == ".gz":
-                line_count = util.reads_lines_from_gzip(file_path=read)
+        try:
+            if self.is_illumina():
+                stats1, stats2 = fastq_validation.check_illumina(*reads)
+                self.reads_in = stats1.num_reads + stats2.num_reads
+                if not stats1.is_illumina():
+                    raise ValueError(
+                        f"FastQ file {reads[0]} doesn't appear to be Illumina! "
+                        f"Mean read length {stats1.mean_read_length} bp, "
+                        f"Percentage of reads the same length {round(stats1.percent_same_length * 100, 2)}%"
+                    )
+                if not stats2.is_illumina():
+                    raise ValueError(
+                        f"FastQ file {reads[1]} doesn't appear to be Illumina! "
+                        f"Mean read length {stats2.mean_read_length} bp, "
+                        f"Percentage of reads the same length {round(stats2.percent_same_length * 100, 2)}%"
+                    )
             else:
-                line_count = util.reads_lines_from_fastq(file_path=read)
-            if line_count % valid_lines_per_read != 0:
-                raise ValueError(
-                    f"FASTQ file {read.name} does not have a multiple of 4 lines"
-                )
-            self.reads_in += line_count / valid_lines_per_read
+                stats = fastq_validation.check_ont(*reads)
+                self.reads_in = stats.num_reads
+                if not stats.is_ont():
+                    raise ValueError(
+                        f"FastQ file {reads[0]} doesn't appear to be ONT! "
+                        f"Mean read length {stats.mean_read_length} bp, "
+                        f"Percentage of reads the same length {round(stats.percent_same_length * 100, 2)}%"
+                    )
+        except Exception as e:
+            logging.info(e)
+            raise ValueError(
+                f"Invalid FastQ file(s) for sample {self.sample_name}. Please check the file(s) and try again."
+            ) from e
         logging.info(f"{self.reads_in} reads in FASTQ file")
 
-    def get_read_paths(self) -> list[Path]:
+    def get_read_paths(self) -> list[str]:
         """Get the paths of the read files.
 
         Returns:
-            list[Path]: A list of paths to the read files.
+            list[str]: POSIX-style string paths of the read files (reads_1, plus reads_2 for Illumina).
         """
-        reads = [self.reads_1_resolved_path]
+        reads = [self.reads_1_resolved_path.as_posix()]
         if self.is_illumina():
-            reads.append(self.reads_2_resolved_path)
+            reads.append(self.reads_2_resolved_path.as_posix())
         return reads
 
     def is_ont(self) -> bool:

diff --git a/src/pathogena/util.py b/src/pathogena/util.py
@@ -394,22 +394,6 @@ def display_cli_version() -> None:
     logging.info(f"EIT Pathogena client version {pathogena.__version__}")
 
 
-def command_exists(command: str) -> bool:
-    """Check if a command exists in the system.
-
-    Args:
-        command (str): The command to check.
-
-    Returns:
-        bool: True if the command exists, False otherwise.
-    """
-    try:
-        result = subprocess.run(["type", command], capture_output=True)
-    except FileNotFoundError:  # Catch Python parsing related errors
-        return False
-    return result.returncode == 0
-
-
 def gzip_file(input_file: Path, output_file: str) -> Path:
     """Gzip a file and save it with a new name.
 
@@ -431,58 +415,6 @@ def gzip_file(input_file: Path, output_file: str) -> Path:
     return Path(output_file)
 
 
-def reads_lines_from_gzip(file_path: Path) -> int:
-    """Count the number of lines in a gzipped file.
-
-    Args:
-        file_path (Path): The path to the gzipped file.
-
-    Returns:
-        int: The number of lines in the file.
-    """
-    line_count = 0
-    # gunzip offers a ~4x faster speed when opening GZip files, use it if we can.
-    if command_exists("gunzip"):
-        logging.debug("Reading lines using gunzip")
-        result = subprocess.run(
-            ["gunzip", "-c", file_path.as_posix()], stdout=subprocess.PIPE, text=True
-        )
-        line_count = result.stdout.count("\n")
-    if line_count == 0:  # gunzip didn't work, try the long method
-        logging.debug("Using gunzip failed, using Python's gzip implementation")
-        try:
-            with gzip.open(file_path, "r") as contents:
-                line_count = sum(1 for _ in contents)
-        except gzip.BadGzipFile as e:
-            logging.error(f"Failed to open the Gzip file: {e}")
-    return line_count
-
-
-def reads_lines_from_fastq(file_path: Path) -> int:
-    """Count the number of lines in a FASTQ file.
-
-    Args:
-        file_path (Path): The path to the FASTQ file.
-
-    Returns:
-        int: The number of lines in the file.
-    """
-    try:
-        with open(file_path) as contents:
-            line_count = sum(1 for _ in contents)
-        return line_count
-    except PermissionError:
-        logging.error(
-            f"You do not have permission to access this file {file_path.name}."
-        )
-    except OSError as e:
-        logging.error(f"An OS error occurred trying to open {file_path.name}: {e}")
-    except Exception as e:
-        logging.error(
-            f"An unexpected error occurred trying to open {file_path.name}: {e}"
-        )
-
-
 def find_duplicate_entries(inputs: list[str]) -> list[str]:
     """Return a list of items that appear more than once in the input list.
 

diff --git a/tests/test_util.py b/tests/test_util.py
@@ -3,27 +3,6 @@
 from pathogena import util
 
 
-def test_reads_lines_from_gzip() -> None:
-    """Test that the `reads_lines_from_gzip` function correctly reads the expected number of lines from a gzip file."""
-    expected_lines = 4
-    file_path = Path(__file__).parent / "data" / "reads" / "tuberculosis_1_1.fastq.gz"
-    lines = util.reads_lines_from_gzip(file_path=file_path)
-    assert lines == expected_lines
-
-
-def test_reads_lines_from_fastq() -> None:
-    """Test that the `reads_lines_from_fastq` function correctly reads the expected number of lines from a fastq file."""
-    expected_lines = 4
-    file_path = Path(__file__).parent / "data" / "reads" / "tuberculosis_1_1.fastq"
-    lines = util.reads_lines_from_fastq(file_path=file_path)
-    assert lines == expected_lines
-
-
-def test_fail_command_exists() -> None:
-    """Test that the `command_exists` function correctly identifies a non-existent command."""
-    assert not util.command_exists("notarealcommandtest")
-
-
 def test_find_duplicate_entries() -> None:
     """Test that the `find_duplicate_entries` function correctly identifies duplicate entries in a list."""
     data = ["foo", "foo", "bar", "bar", "baz"]