Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-22.04, macos-14]
# os: [ubuntu-22.04, macos-14]
os: [macos-14]
python-version: ['3.10']
name: Python ${{ matrix.python-version }} (${{ matrix.os }})
steps:
Expand All @@ -28,9 +29,9 @@ jobs:
shell: bash -el {0}
run: |
if [ "${{ matrix.os }}" == "ubuntu-22.04" ]; then
conda create -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }}
conda create -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }} rust
elif [ "${{ matrix.os }}" == "macos-14" ]; then
conda create --platform osx-64 -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }}
conda create --platform osx-64 -y -n pathogena-test -c conda-forge -c bioconda hostile==1.1.0 python==${{ matrix.python-version }} rust
fi
conda activate pathogena-test
pip install '.[dev]'
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dependencies = [
"pydantic>=2.6.2,<3",
"tenacity==8.2.3",
"click>=8.1.7",
"fastq-validation==1.0.5",
]

[project.scripts]
Expand Down
50 changes: 34 additions & 16 deletions src/pathogena/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path
from typing import Any, Literal, Optional

import fastq_validation
from pydantic import BaseModel, Field, model_validator

from pathogena import util
Expand Down Expand Up @@ -168,30 +169,47 @@ def validate_reads_from_fastq(self) -> None:
"""
reads = self.get_read_paths()
logging.info("Performing FastQ checks and gathering total reads")
valid_lines_per_read = 4
self.reads_in = 0
for read in reads:
logging.info(f"Calculating read count in: {read}")
if read.suffix == ".gz":
line_count = util.reads_lines_from_gzip(file_path=read)
try:
if self.is_illumina():
stats1, stats2 = fastq_validation.check_illumina(*reads)
self.reads_in = stats1.num_reads + stats2.num_reads
if not stats1.is_illumina():
raise ValueError(
f"FastQ file {reads[0]} doesn't appear to be Illumina! "
f"Mean read length {stats1.mean_read_length} bp, "
f"Percentage of reads the same length {round(stats1.percent_same_length * 100, 2)}%"
)
if not stats2.is_illumina():
raise ValueError(
f"FastQ file {reads[1]} doesn't appear to be Illumina! "
f"Mean read length {stats2.mean_read_length} bp, "
f"Percentage of reads the same length {round(stats2.percent_same_length * 100, 2)}%"
)
else:
line_count = util.reads_lines_from_fastq(file_path=read)
if line_count % valid_lines_per_read != 0:
raise ValueError(
f"FASTQ file {read.name} does not have a multiple of 4 lines"
)
self.reads_in += line_count / valid_lines_per_read
stats = fastq_validation.check_ont(*reads)
self.reads_in = stats.num_reads
if not stats.is_ont():
raise ValueError(
f"FastQ file {reads[0]} doesn't appear to be ONT! "
f"Mean read length {stats.mean_read_length} bp, "
f"Percentage of reads the same length {round(stats.percent_same_length * 100, 2)}%"
)
except Exception as e:
logging.info(e)
raise ValueError(
f"Invalid FastQ file(s) for sample {self.sample_name}. Please check the file(s) and try again."
) from e
logging.info(f"{self.reads_in} reads in FASTQ file")

def get_read_paths(self) -> list[Path]:
def get_read_paths(self) -> list[str]:
"""Get the paths of the read files.

Returns:
list[Path]: A list of paths to the read files.
list[str]: A list of paths to the read files.
"""
reads = [self.reads_1_resolved_path]
reads = [self.reads_1_resolved_path.as_posix()]
if self.is_illumina():
reads.append(self.reads_2_resolved_path)
reads.append(self.reads_2_resolved_path.as_posix())
return reads

def is_ont(self) -> bool:
Expand Down
68 changes: 0 additions & 68 deletions src/pathogena/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,22 +394,6 @@ def display_cli_version() -> None:
logging.info(f"EIT Pathogena client version {pathogena.__version__}")


def command_exists(command: str) -> bool:
    """Check if a command exists in the system.

    The lookup searches the directories on ``PATH`` for an executable with
    the given name. Shell builtins and aliases are not reported, because
    they cannot be launched as a subprocess.

    Args:
        command (str): The command to check.

    Returns:
        bool: True if the command exists, False otherwise.
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not be present in this file.
    import shutil

    # The previous implementation ran `type <command>` via subprocess.run
    # without a shell. `type` is a shell builtin rather than an executable,
    # so on most Linux systems the call raised FileNotFoundError and every
    # command was reported as missing. shutil.which performs the PATH lookup
    # directly and portably, without spawning a process.
    return shutil.which(command) is not None


def gzip_file(input_file: Path, output_file: str) -> Path:
"""Gzip a file and save it with a new name.

Expand All @@ -431,58 +415,6 @@ def gzip_file(input_file: Path, output_file: str) -> Path:
return Path(output_file)


def _count_lines_in_stream(stream, chunk_size: int = 1 << 20) -> int:
    """Count lines in a binary stream without loading it all into memory.

    A final line that is not terminated by a newline is still counted, which
    matches the result of iterating over the stream line by line.

    Args:
        stream: A binary file-like object exposing ``read(size)``.
        chunk_size (int): Number of bytes to request per read.

    Returns:
        int: The number of lines in the stream.
    """
    newline_count = 0
    last_byte = b""
    while chunk := stream.read(chunk_size):
        newline_count += chunk.count(b"\n")
        last_byte = chunk[-1:]
    # Data that does not end in a newline still has one more (unterminated) line.
    if last_byte and last_byte != b"\n":
        newline_count += 1
    return newline_count


def reads_lines_from_gzip(file_path: Path) -> int:
    """Count the number of lines in a gzipped file.

    The external ``gunzip`` binary is preferred when it is available on
    ``PATH``; if it is missing, exits with an error, or yields no lines,
    Python's ``gzip`` module is used instead. Both paths stream the
    decompressed bytes in chunks and count lines the same way.

    Args:
        file_path (Path): The path to the gzipped file.

    Returns:
        int: The number of lines in the file. 0 is returned when the file is
            empty or is not a valid gzip file (the latter is logged as an error).
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not be present in this file.
    import shutil

    line_count = 0
    # gunzip offers a ~4x faster speed when opening GZip files, use it if we can.
    gunzip_path = shutil.which("gunzip")
    if gunzip_path is not None:
        logging.debug("Reading lines using gunzip")
        try:
            # Stream stdout as bytes rather than capturing the whole
            # decompressed file as text, which could exhaust memory on large
            # inputs and fail on bytes that do not decode.
            with subprocess.Popen(
                [gunzip_path, "-c", file_path.as_posix()], stdout=subprocess.PIPE
            ) as process:
                line_count = _count_lines_in_stream(process.stdout)
            if process.returncode != 0:
                # A failed gunzip may have emitted partial output; discard it.
                line_count = 0
        except OSError as e:
            logging.debug(f"Could not run gunzip: {e}")
            line_count = 0
    if line_count == 0:  # gunzip didn't work, try the long method
        logging.debug("Using gunzip failed, using Python's gzip implementation")
        try:
            with gzip.open(file_path, "rb") as contents:
                line_count = _count_lines_in_stream(contents)
        except gzip.BadGzipFile as e:
            logging.error(f"Failed to open the Gzip file: {e}")
    return line_count


def reads_lines_from_fastq(file_path: Path) -> int:
    """Count the number of lines in a FASTQ file.

    Args:
        file_path (Path): The path to the FASTQ file.

    Returns:
        int: The number of lines in the file.

    Raises:
        PermissionError: If the file cannot be read due to permissions.
        OSError: If the file cannot be opened or read for another OS-level reason.
        Exception: Any other error raised while reading (for example a
            decoding error); it is logged and then re-raised.
    """
    # Each handler logs and then re-raises. Previously the error was swallowed
    # and the function fell through to an implicit `return None`, which broke
    # the declared `int` return type and surfaced later as an unrelated
    # TypeError when the caller did arithmetic on the result.
    try:
        with open(file_path) as contents:
            return sum(1 for _ in contents)
    except PermissionError:
        logging.error(
            f"You do not have permission to access this file {file_path.name}."
        )
        raise
    except OSError as e:
        logging.error(f"An OS error occurred trying to open {file_path.name}: {e}")
        raise
    except Exception as e:
        logging.error(
            f"An unexpected error occurred trying to open {file_path.name}: {e}"
        )
        raise


def find_duplicate_entries(inputs: list[str]) -> list[str]:
"""Return a list of items that appear more than once in the input list.

Expand Down
21 changes: 0 additions & 21 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,6 @@
from pathogena import util


def test_reads_lines_from_gzip() -> None:
    """Check that `reads_lines_from_gzip` reports four lines for the gzipped FASTQ fixture."""
    fixture = Path(__file__).parent.joinpath(
        "data", "reads", "tuberculosis_1_1.fastq.gz"
    )
    counted = util.reads_lines_from_gzip(file_path=fixture)
    # The fixture holds a single FASTQ record, i.e. four lines.
    assert counted == 4


def test_reads_lines_from_fastq() -> None:
    """Check that `reads_lines_from_fastq` reports four lines for the plain FASTQ fixture."""
    fixture = Path(__file__).parent.joinpath(
        "data", "reads", "tuberculosis_1_1.fastq"
    )
    counted = util.reads_lines_from_fastq(file_path=fixture)
    # The fixture holds a single FASTQ record, i.e. four lines.
    assert counted == 4


def test_fail_command_exists() -> None:
    """Check that `command_exists` reports a made-up command name as missing."""
    found = util.command_exists("notarealcommandtest")
    assert not found


def test_find_duplicate_entries() -> None:
"""Test that the `find_duplicate_entries` function correctly identifies duplicate entries in a list."""
data = ["foo", "foo", "bar", "bar", "baz"]
Expand Down
Loading