Kmers.jl compatibility (#282)

jakobnissen · web-flow · commit e1918ce3ed8f · 2024-12-22T11:35:31.000+01:00
Add various small improvements to BioSequences to make it compatible with the
upcoming Kmers.jl release.
All of these changes are improved internals.

* Improve showerror for EncodeError
* Improve test in has_interface
* Add generic BioSequence methods for bits_per_symbol, firstbitindex and
  lastbitindex
* Add some more reversebits methods for different bits per symbol sizes
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioSequences"
 uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 authors = ["Sabrina Jaye Ward <sabrinajward@protonmail.com>", "Jakob Nissen <jakobnybonissen@gmail.com>"]
-version = "3.4.0"
+version = "3.4.1"
 
 [deps]
 BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
diff --git a/docs/src/construction.md b/docs/src/construction.md
@@ -317,7 +317,7 @@ If the input cannot be encoded by any of the built-in alphabets, an error is thr
 
 ```jldoctest
 julia> bioseq("0!(CC!;#&&%")
-ERROR: cannot encode 0x30 in AminoAcidAlphabet
+ERROR: cannot encode 0x30 (Char '0') in AminoAcidAlphabet
 [...]
 ```
 
diff --git a/src/alphabet.jl b/src/alphabet.jl
@@ -135,7 +135,15 @@ end
 EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val)
 
 function Base.showerror(io::IO, err::EncodeError{A}) where {A}
-    print(io, "cannot encode ", repr(err.val), " in ", A)
+    val = err.val
+    char_repr = if val isa Integer && val < 0x80
+        repr(val) * " (Char '" * Char(val) * "')"
+    elseif val isa Union{AbstractString, AbstractChar}
+        repr(val)
+    else
+        string(err.val)
+    end
+    print(io, "cannot encode " * char_repr * " in ", A)
 end
 
 """
diff --git a/src/biosequence/biosequence.jl b/src/biosequence/biosequence.jl
@@ -74,7 +74,7 @@ function has_interface(
         isempty(syms) && error("Vector syms must not be empty")
         first(syms) isa eltype(T) || error("Vector is of wrong element type")
         seq = T((i for i in syms))
-        length(seq) > 0 || return false
+        length(seq) == length(syms) || return false
         eachindex(seq) === Base.OneTo(length(seq)) || return false
         E = encoded_data_eltype(T)
         e = extract_encoded_element(seq, 1)
@@ -101,13 +101,14 @@ Base.nextind(::BioSequence, i::Integer) = Int(i) + 1
 Base.prevind(::BioSequence, i::Integer) = Int(i) - 1
 Base.size(x::BioSequence) = (length(x),)
 Base.eltype(::Type{<:BioSequence{A}}) where {A <: Alphabet} = eltype(A)
-Base.eltype(x::BioSequence) = eltype(typeof(x))
 Alphabet(::Type{<:BioSequence{A}}) where {A <: Alphabet} = A()
 Alphabet(x::BioSequence) = Alphabet(typeof(x))
 Base.isempty(x::BioSequence) = iszero(length(x))
 Base.empty(::Type{T}) where {T <: BioSequence} = T(eltype(T)[])
 Base.empty(x::BioSequence) = empty(typeof(x))
 BitsPerSymbol(x::BioSequence) = BitsPerSymbol(Alphabet(typeof(x)))
+bits_per_symbol(::Type{T}) where {T <: BioSequence} = bits_per_symbol(Alphabet(T))
+bits_per_symbol(x::BioSequence) = bits_per_symbol(typeof(x))
 Base.hash(s::BioSequence, x::UInt) = foldl((a, b) -> hash(b, a), s, init=x)
 
 function Base.similar(seq::BioSequence, len::Integer=length(seq))
diff --git a/src/biosequence/indexing.jl b/src/biosequence/indexing.jl
@@ -11,6 +11,9 @@
     (i % UInt) - 1 < (lastindex(seq) % UInt) ? (@inbounds seq[i], i + 1) : nothing
 end
 
+lastbitindex(x::BioSequence) = bitindex(x, lastindex(x))
+firstbitindex(x::BioSequence) = bitindex(x, firstindex(x))
+
 ## Bounds checking
 function Base.checkbounds(x::BioSequence, i::Integer)
     firstindex(x) ≤ i ≤ lastindex(x) || throw(BoundsError(x, i))
diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl
@@ -1,16 +1,34 @@
-@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: Base.BitUnsigned
+const BitUnsigned = Union{UInt8, UInt16, UInt32, UInt64, UInt128}
+
+@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: BitUnsigned
      mask = 0x33333333333333333333333333333333 % T
      x = ((x >> 2) & mask) | ((x & mask) << 2)
      return reversebits(x, BitsPerSymbol{4}())
 end
 
-@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: Base.BitUnsigned
+@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: BitUnsigned
      mask = 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F % T
      x = ((x >> 4) & mask) | ((x & mask) << 4)
-     return bswap(x)
+     return reversebits(x, BitsPerSymbol{8}())
+end
+
+@inline reversebits(x::T, ::BitsPerSymbol{8}) where T <: BitUnsigned = bswap(x)
+
+@inline reversebits(x::UInt16, ::BitsPerSymbol{16}) = x
+@inline function reversebits(x::T, ::BitsPerSymbol{16}) where T <: Union{UInt32, UInt64}
+    mask = 0x0000FFFF0000FFFF0000FFFF0000FFFF % T
+    x = ((x >> 16) & mask) | ((x & mask) << 16)
+    reversebits(x, BitsPerSymbol{32}())
+end
+
+@inline reversebits(x::UInt32, ::BitsPerSymbol{32}) = x
+@inline function reversebits(x::T, ::BitsPerSymbol{32}) where T <: Union{UInt64}
+    mask = 0x00000000FFFFFFF00000000FFFFFFFF % T
+    x = ((x >> 32) & mask) | ((x & mask) << 32)
+    reversebits(x, BitsPerSymbol{64}())
 end
 
-reversebits(x::T, ::BitsPerSymbol{8}) where T <: Base.BitUnsigned = bswap(x)
+@inline reversebits(x::UInt64, ::BitsPerSymbol{64}) = x
 
 @inline function complement_bitpar(x::Unsigned, ::T) where {T<:NucleicAcidAlphabet{2}}
     return ~x
diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -131,7 +131,7 @@ julia> bioseq("UAUGCUGUAGG")
 UAUGCUGUAGG
 
 julia> bioseq("PKMW#3>>0;kL")
-ERROR: cannot encode 0x23 in AminoAcidAlphabet
+ERROR: cannot encode 0x23 (Char '#') in AminoAcidAlphabet
 [...]
 ```
 """
diff --git a/src/longsequences/indexing.jl b/src/longsequences/indexing.jl
@@ -10,9 +10,6 @@
     bitindex(N, encoded_data_eltype(typeof(x)), i)
 end
 
-firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s) % UInt)
-lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s) % UInt)
-
 @inline function extract_encoded_element(x::SeqOrView, i::Integer)
     bi = bitindex(x, i % UInt)
     extract_encoded_element(bi, x.data)
diff --git a/test/biosequences/misc.jl b/test/biosequences/misc.jl
@@ -193,4 +193,21 @@ end
     @test ungap(seq) == seq
     cp = copy(seq)
     @test ungap!(seq) == cp
+end
+
+# Ideally, we'd not test this internal function, but instead the code that
+# relies on this, but this is only called if you have custom alphabets, and
+# creating these alphabets of different sizes is a hassle
+@testset "bitreverse" begin
+    bps8 = BioSequences.BitsPerSymbol{8}()
+    bps16 = BioSequences.BitsPerSymbol{16}()
+    bps32 = BioSequences.BitsPerSymbol{32}()
+    bps64 = BioSequences.BitsPerSymbol{64}()
+    reversebits = BioSequences.reversebits
+    @test reversebits(0x0102, bps16) === 0x0102
+    @test reversebits(0x01020304, bps16) === 0x03040102
+    @test reversebits(0x0102030405060708, bps16) === 0x0708050603040102
+    @test reversebits(0x01020304, bps32) === 0x01020304
+    @test reversebits(0x0102030405060708, bps32) === 0x0506070801020304
+    @test reversebits(0x0102030405060708, bps64) === 0x0102030405060708 
 end