# needletail/needletail.pyi -- onecodex/needletail @ d0175541789c9edbbdd207acff90dac6a4f18827
# Type stubs for the needletail Python bindings.

from pathlib import Path
from typing import Iterator, Optional, Union

class NeedletailError(Exception):
    """
    Error signalling invalid fastx data.

    Raised by `parse_fastx_file` or `parse_fastx_string` when the data being
    parsed is not valid.
    """

    ...

class FastxReader(Iterator[Record]):
    """
    Iterator over sequence records.

    Yields
    ------
    Record
        One `Record` object per sequence record.

    See also
    --------
    parse_fastx_file:
        Parses sequence records from a FASTA/FASTQ file.
    parse_fastx_string:
        Parses sequence records from a FASTA/FASTQ string.
    Record:
        The class representing a single FASTA/FASTQ sequence record.
    """

    ...

class Record:
    """
    A single biological sequence record (FASTA or FASTQ).

    Parameters
    ----------
    id : str
        Identifier of the sequence record.
    seq : str
        The sequence, as a string.

    Attributes
    ----------
    id : str
        Identifier of the sequence record: every character (whitespace
        included) that follows the leading '>' in a FASTA file, or the
        leading '@' in a FASTQ file.
    seq : str
        The sequence, as a string.
    qual : str, optional
        Quality scores of the sequence, as a string. `None` when the object
        represents a FASTA record.
    name : str
        The part of `id` that precedes its first whitespace character.
    description : str, optional
        The part of `id` that follows its first whitespace character, or
        `None` when `id` contains no whitespace.

    Methods
    -------
    is_fasta
        Tell whether the object represents a FASTA record.
    is_fastq
        Tell whether the object represents a FASTQ record.
    normalize(iupac)
        Normalize the sequence held in the `seq` attribute.
    """

    id: str
    seq: str
    name: str
    description: Optional[str]
    qual: Optional[str]

    def is_fasta(self) -> bool:
        """
        Tell whether the object represents a FASTA record.

        Returns
        -------
        bool
            `True` when the record carries no quality information, `False`
            otherwise.
        """
        ...

    def is_fastq(self) -> bool:
        """
        Tell whether the object represents a FASTQ record.

        Returns
        -------
        bool
            `True` when the record carries quality information, `False`
            otherwise.
        """
        ...

    def normalize(self, iupac: bool) -> None:
        """
        Normalize the sequence held in the `seq` attribute of the object.

        Parameters
        ----------
        iupac : bool
            NOTE(review): not described in this stub; presumably has the same
            meaning as the `iupac` argument of `normalize_seq` -- confirm.

        See also
        --------
        normalize_seq: The function that normalizes nucleotide sequence strings.

        Notes
        -----
        Intended for nucleotide sequences only. Applied to a protein
        sequence, it will incorrectly treat amino acid characters as
        nucleotides.
        """
        ...

def parse_fastx_file(path: Union[str, Path]) -> FastxReader:
    """
    Parse a FASTA/FASTQ file, returning an iterator over its sequence
    records.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the FASTA/FASTQ file.

    Returns
    -------
    FastxReader
        An iterator yielding one `Record` object per sequence in the input
        file.

    Raises
    ------
    NeedletailError
        When reading or parsing the input file fails.

    See also
    --------
    parse_fastx_string:
        The equivalent function for a FASTA/FASTQ string.
    FastxReader:
        The iterator class whose instances yield `Record` objects.
    """
    ...

def parse_fastx_string(fastx_string: str) -> FastxReader:
    """
    Returns an iterator that parses a FASTA/FASTQ string and yields sequence
    records.

    Parameters
    ----------
    fastx_string : str
        A string containing FASTA/FASTQ-formatted sequence records.

    Returns
    -------
    FastxReader
        A `FastxReader` iterator that yields `Record` objects representing
        sequences from the input string.

    Raises
    ------
    NeedletailError
        If an error occurs while parsing the input string.

    See also
    --------
    parse_fastx_file:
        A function to parse sequence records from a FASTA/FASTQ file.
    FastxReader:
        A class with instances that are iterators that yield `Record` objects.
    """
    ...

def normalize_seq(seq: str, iupac: bool) -> str:
    """
    Normalize a nucleotide sequence string.

    The normalization consists of:

    - Upper-casing lowercase characters.
    - Dropping whitespace and newline characters.
    - Turning 'U' into 'T'.
    - Turning '.' and '~' into '-'.
    - Turning every character outside 'ACGTN-' into 'N'. When `iupac` is
      `True`, characters representing nucleotide ambiguity are exempt from
      this replacement.

    Parameters
    ----------
    seq : str
        The nucleotide sequence, as a string.
    iupac : bool
        When `True`, the nucleotide ambiguity characters ('B', 'D', 'H', 'V',
        'R', 'Y', 'S', 'W', 'K', and 'M', and their lowercase forms) are not
        converted to 'N'. Lowercase characters are still upper-cased.
        NOTE(review): earlier documentation described a default of `False`,
        but this signature declares no default -- confirm against the
        compiled extension.

    Returns
    -------
    str
        The normalized sequence string.

    Notes
    -----
    Intended for nucleotide sequences only. Applied to a protein sequence,
    it will incorrectly treat amino acid characters as nucleotides.
    """
    ...

def reverse_complement(seq: str) -> str:
    """
    Return the reverse complement of a nucleotide sequence.

    Parameters
    ----------
    seq : str
        The nucleotide sequence, as a string.

    Returns
    -------
    str
        The reverse complement of `seq`.

    Notes
    -----
    Intended for nucleotide sequences only. Applied to a protein sequence,
    this function will incorrectly treat amino acid characters as
    nucleotides.
    """
    ...

def decode_phred(qual: str, base_64: bool) -> tuple[int, ...]:
    """
    Decode Phred quality strings to quality scores.

    Parameters
    ----------
    qual : str
        A string representing Phred-encoded quality strings.
    base_64 : bool
        If `True`, the Phred+64 encoding is used, otherwise the Phred+33
        encoding is used.
        NOTE(review): earlier documentation described a default of `False`,
        but this signature declares no default -- confirm against the
        compiled extension.

    Returns
    -------
    tuple of int
        A tuple of integers representing quality scores derived from the
        probability of a base-calling error using a logarithmic
        transformation.
    """
    # The return annotation is variadic (`tuple[int, ...]`); `tuple[int]`
    # would declare a tuple of exactly one element.
    ...