Skip to content

Commit e1918ce

Browse files
authored
Kmers.jl compatibility (#282)
Add various small improvements to BioSequences to make it compatible with the upcoming Kmers.jl release. All of these changes are improvements to internals. * Improve showerror for EncodeError * Improve test in has_interface * Add generic BioSequence methods for bits_per_symbol, firstbitindex and lastbitindex * Add some more reversebits methods for different bits per symbol sizes
1 parent 9a3f893 commit e1918ce

File tree

9 files changed

+57
-13
lines changed

9 files changed

+57
-13
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "BioSequences"
22
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
33
authors = ["Sabrina Jaye Ward <sabrinajward@protonmail.com>", "Jakob Nissen <jakobnybonissen@gmail.com>"]
4-
version = "3.4.0"
4+
version = "3.4.1"
55

66
[deps]
77
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"

docs/src/construction.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ If the input cannot be encoded by any of the built-in alphabets, an error is thr
317317

318318
```jldoctest
319319
julia> bioseq("0!(CC!;#&&%")
320-
ERROR: cannot encode 0x30 in AminoAcidAlphabet
320+
ERROR: cannot encode 0x30 (Char '0') in AminoAcidAlphabet
321321
[...]
322322
```
323323

src/alphabet.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,15 @@ end
135135
EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val)
136136

137137
# Print a message of the form `cannot encode <value> in <A>` to `io`.
#
# How `<value>` is rendered depends on `err.val`:
# - an integer in `0:0x7f` is shown with `repr`, followed by the character it maps to,
#   e.g. `0x30 (Char '0')`;
# - a string or character is shown with `repr`;
# - anything else is shown with `string`.
function Base.showerror(io::IO, err::EncodeError{A}) where {A}
    val = err.val
    # The lower bound matters for signed integers: `Char` of a negative value throws,
    # which would replace the error being reported with an unrelated conversion error.
    described = if val isa Integer && 0 <= val < 0x80
        string(repr(val), " (Char '", Char(val), "')")
    elseif val isa Union{AbstractString, AbstractChar}
        repr(val)
    else
        string(val)
    end
    # Pass the pieces separately so `print` writes them without building a temporary.
    print(io, "cannot encode ", described, " in ", A)
    return nothing
end
140148

141149
"""

src/biosequence/biosequence.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ function has_interface(
7474
isempty(syms) && error("Vector syms must not be empty")
7575
first(syms) isa eltype(T) || error("Vector is of wrong element type")
7676
seq = T((i for i in syms))
77-
length(seq) > 0 || return false
77+
length(seq) == length(syms) || return false
7878
eachindex(seq) === Base.OneTo(length(seq)) || return false
7979
E = encoded_data_eltype(T)
8080
e = extract_encoded_element(seq, 1)
@@ -101,13 +101,14 @@ Base.nextind(::BioSequence, i::Integer) = Int(i) + 1
101101
Base.prevind(::BioSequence, i::Integer) = Int(i) - 1
102102
Base.size(x::BioSequence) = (length(x),)
103103
Base.eltype(::Type{<:BioSequence{A}}) where {A <: Alphabet} = eltype(A)
104-
Base.eltype(x::BioSequence) = eltype(typeof(x))
105104
Alphabet(::Type{<:BioSequence{A}}) where {A <: Alphabet} = A()
106105
Alphabet(x::BioSequence) = Alphabet(typeof(x))
107106
Base.isempty(x::BioSequence) = iszero(length(x))
108107
Base.empty(::Type{T}) where {T <: BioSequence} = T(eltype(T)[])
109108
Base.empty(x::BioSequence) = empty(typeof(x))
110109
BitsPerSymbol(x::BioSequence) = BitsPerSymbol(Alphabet(typeof(x)))
110+
# Type-level method: forwards to `bits_per_symbol` of the alphabet obtained from `T`.
function bits_per_symbol(::Type{T}) where {T <: BioSequence}
    return bits_per_symbol(Alphabet(T))
end
111+
# Instance-level method: forwards to the method that takes the type of `x`.
function bits_per_symbol(x::BioSequence)
    return bits_per_symbol(typeof(x))
end
111112
Base.hash(s::BioSequence, x::UInt) = foldl((a, b) -> hash(b, a), s, init=x)
112113

113114
function Base.similar(seq::BioSequence, len::Integer=length(seq))

src/biosequence/indexing.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
(i % UInt) - 1 < (lastindex(seq) % UInt) ? (@inbounds seq[i], i + 1) : nothing
1212
end
1313

14+
# Result of `bitindex` for the last index of `x`.
function lastbitindex(x::BioSequence)
    return bitindex(x, lastindex(x))
end
15+
# Result of `bitindex` for the first index of `x`.
function firstbitindex(x::BioSequence)
    return bitindex(x, firstindex(x))
end
16+
1417
## Bounds checking
1518
function Base.checkbounds(x::BioSequence, i::Integer)
1619
firstindex(x) ≤ i ≤ lastindex(x) || throw(BoundsError(x, i))

src/bit-manipulation/bit-manipulation.jl

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,34 @@
1-
@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: Base.BitUnsigned
1+
# Upper bound for the type parameter of the 2-, 4- and 8-bit `reversebits` methods:
# the fixed-width unsigned integer types from 8 to 128 bits.
const BitUnsigned = Union{UInt8, UInt16, UInt32, UInt64, UInt128}
2+
3+
# Exchange the two 2-bit fields inside every 4-bit group of `x`, then pass the result
# on to the `BitsPerSymbol{4}` method, which performs the remaining reordering.
@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: BitUnsigned
    # Selects the low 2 bits of every 4-bit group; truncated to the width of `T`.
    low_pairs = 0x33333333333333333333333333333333 % T
    moved_down = (x >> 2) & low_pairs
    moved_up = (x & low_pairs) << 2
    return reversebits(moved_down | moved_up, BitsPerSymbol{4}())
end
68

7-
@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: Base.BitUnsigned
9+
# Exchange the two 4-bit halves of every byte of `x`, then pass the result on to the
# `BitsPerSymbol{8}` method, which performs the remaining reordering.
@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: BitUnsigned
    # Selects the low 4 bits of every byte; truncated to the width of `T`.
    low_nibbles = 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F % T
    moved_down = (x >> 4) & low_nibbles
    moved_up = (x & low_nibbles) << 4
    return reversebits(moved_down | moved_up, BitsPerSymbol{8}())
end
14+
15+
# With 8 bits per symbol every symbol is one byte, so this is a plain `bswap`.
@inline function reversebits(x::T, ::BitsPerSymbol{8}) where T <: BitUnsigned
    return bswap(x)
end
16+
17+
# A `UInt16` is a single 16-bit field, so `x` is returned unchanged.
@inline function reversebits(x::UInt16, ::BitsPerSymbol{16})
    return x
end
18+
# Exchange the two 16-bit halves of every 32-bit group of `x`, then pass the result on
# to the `BitsPerSymbol{32}` method, which performs the remaining reordering.
@inline function reversebits(x::T, ::BitsPerSymbol{16}) where T <: Union{UInt32, UInt64}
    # Selects the low 16 bits of every 32-bit group; truncated to the width of `T`.
    low_halves = 0x0000FFFF0000FFFF0000FFFF0000FFFF % T
    moved_down = (x >> 16) & low_halves
    moved_up = (x & low_halves) << 16
    return reversebits(moved_down | moved_up, BitsPerSymbol{32}())
end
23+
24+
# A `UInt32` is a single 32-bit field, so `x` is returned unchanged.
@inline function reversebits(x::UInt32, ::BitsPerSymbol{32})
    return x
end
25+
# Exchange the two 32-bit halves of `x`, then pass the result on to the
# `BitsPerSymbol{64}` method.
@inline function reversebits(x::T, ::BitsPerSymbol{32}) where T <: Union{UInt64}
    # Selects the low 32 bits of every 64-bit group. The literal must have exactly 32
    # hex digits so that the pattern is aligned on 64-bit boundaries before it is
    # truncated to the width of `T`; a shorter literal shifts the upper group.
    mask = 0x00000000FFFFFFFF00000000FFFFFFFF % T
    x = ((x >> 32) & mask) | ((x & mask) << 32)
    return reversebits(x, BitsPerSymbol{64}())
end
1230

13-
reversebits(x::T, ::BitsPerSymbol{8}) where T <: Base.BitUnsigned = bswap(x)
31+
# A `UInt64` is a single 64-bit field, so `x` is returned unchanged.
@inline function reversebits(x::UInt64, ::BitsPerSymbol{64})
    return x
end
1432

1533
@inline function complement_bitpar(x::Unsigned, ::T) where {T<:NucleicAcidAlphabet{2}}
1634
return ~x

src/longsequences/constructors.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ julia> bioseq("UAUGCUGUAGG")
131131
UAUGCUGUAGG
132132
133133
julia> bioseq("PKMW#3>>0;kL")
134-
ERROR: cannot encode 0x23 in AminoAcidAlphabet
134+
ERROR: cannot encode 0x23 (Char '#') in AminoAcidAlphabet
135135
[...]
136136
```
137137
"""

src/longsequences/indexing.jl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,6 @@
1010
bitindex(N, encoded_data_eltype(typeof(x)), i)
1111
end
1212

13-
firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s) % UInt)
14-
lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s) % UInt)
15-
1613
@inline function extract_encoded_element(x::SeqOrView, i::Integer)
1714
bi = bitindex(x, i % UInt)
1815
extract_encoded_element(bi, x.data)

test/biosequences/misc.jl

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,4 +193,21 @@ end
193193
@test ungap(seq) == seq
194194
cp = copy(seq)
195195
@test ungap!(seq) == cp
196+
end
197+
198+
# Ideally, we'd not test this internal function, but instead the code that
199+
# relies on this, but this is only called if you have custom alphabets, and
200+
# creating these alphabets of different sizes is a hassle
201+
@testset "bitreverse" begin
    reversebits = BioSequences.reversebits
    bps8 = BioSequences.BitsPerSymbol{8}()
    bps16 = BioSequences.BitsPerSymbol{16}()
    bps32 = BioSequences.BitsPerSymbol{32}()
    bps64 = BioSequences.BitsPerSymbol{64}()

    # 8 bits per symbol: every symbol is one byte, so the byte order is reversed.
    @test reversebits(0x01, bps8) === 0x01
    @test reversebits(0x0102, bps8) === 0x0201
    @test reversebits(0x01020304, bps8) === 0x04030201
    @test reversebits(0x0102030405060708, bps8) === 0x0807060504030201

    # 16 bits per symbol: 16-bit fields change places, bytes inside a field do not.
    @test reversebits(0x0102, bps16) === 0x0102
    @test reversebits(0x01020304, bps16) === 0x03040102
    @test reversebits(0x0102030405060708, bps16) === 0x0708050603040102

    # 32 bits per symbol.
    @test reversebits(0x01020304, bps32) === 0x01020304
    @test reversebits(0x0102030405060708, bps32) === 0x0506070801020304

    # 64 bits per symbol: a `UInt64` holds a single symbol.
    @test reversebits(0x0102030405060708, bps64) === 0x0102030405060708
end

0 commit comments

Comments
 (0)