Skip to content

Commit 4a03043

Browse files
authored
Add more optimised find methods for BioSequences (#331)
This adds SIMD optimised methods for `findprev` and `findnext` (and therefore, by extension `findfirst` and `findlast`) for the predicates `!isgap`, `iscertain`, `!iscertain`, `isambiguous` and `!isambiguous` for the default alphabets for LongSequence and LongSubSeq The purpose of these is to find e.g. regions of N's in a sequence, or find the first symbol in a long DNA sequence which is not a gap or an unambiguous symbol. Some benchmarks for AA - nucleotide benchmarks are even better: ```julia s = randaaseq(100_000) @Btime findfirst(isgap, s) @Btime findfirst(isambiguous, s) @Btime findfirst(!iscertain, s) for i in eachindex(s) s[i] = AA_B end @Btime findfirst(!isambiguous, s) @Btime findfirst(iscertain, s) ``` Before: 1.738 μs (0 allocations: 0 bytes) 39.195 μs (0 allocations: 0 bytes) 39.195 μs (0 allocations: 0 bytes) 39.205 μs (0 allocations: 0 bytes) 39.205 μs (0 allocations: 0 bytes) After: 1.894 μs (0 allocations: 0 bytes) 3.270 μs (0 allocations: 0 bytes) 5.500 μs (0 allocations: 0 bytes) 3.240 μs (0 allocations: 0 bytes) 4.530 μs (0 allocations: 0 bytes)
1 parent e1918ce commit 4a03043

File tree

7 files changed

+433
-94
lines changed

7 files changed

+433
-94
lines changed

src/BioSequences.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ include("biosequence/biosequence.jl")
219219
include("longsequences/longsequence.jl")
220220
include("longsequences/hash.jl")
221221
include("longsequences/randseq.jl")
222+
include("longsequences/find.jl")
222223

223224
include("counting.jl")
224225

src/biosequence/find.jl

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,24 @@
77
### This file is a part of BioJulia.
88
### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md
99

10+
Base.findfirst(f::Function, seq::BioSequence) = findnext(f, seq, firstindex(seq))
11+
Base.findlast(f::Function, seq::BioSequence) = findprev(f, seq, lastindex(seq))
12+
1013
function Base.findnext(f::Function, seq::BioSequence, start::Integer)
14+
start = Int(start)::Int
1115
start > lastindex(seq) && return nothing
12-
checkbounds(seq, start)
16+
@boundscheck (start < 1 && throw(BoundsError(seq, start)))
17+
@inline _findnext(f, seq, start)
18+
end
19+
20+
function Base.findprev(f::Function, seq::BioSequence, start::Integer)
21+
start = Int(start)::Int
22+
start < 1 && return nothing
23+
@boundscheck (start > lastindex(seq) && throw(BoundsError(seq, start)))
24+
@inline _findprev(f, seq, start)
25+
end
26+
27+
function _findnext(f::Function, seq::BioSequence, start::Int)
1328
@inbounds for i in start:lastindex(seq)
1429
if f(seq[i])
1530
return i
@@ -18,25 +33,31 @@ function Base.findnext(f::Function, seq::BioSequence, start::Integer)
1833
return nothing
1934
end
2035

21-
# No ambiguous sites can exist in a nucleic acid sequence using the two-bit alphabet.
22-
Base.findnext(::typeof(isambiguous), seq::BioSequence{<:NucleicAcidAlphabet{2}}, from::Integer) = nothing
23-
24-
function Base.findprev(f::Function, seq::BioSequence, start::Integer)
25-
start < firstindex(seq) && return nothing
26-
checkbounds(seq, start)
27-
for i in start:-1:firstindex(seq)
36+
function _findprev(f::Function, seq::BioSequence, start::Int)
37+
@inbounds for i in start:-1:firstindex(seq)
2838
if f(seq[i])
2939
return i
3040
end
3141
end
3242
return nothing
3343
end
3444

35-
Base.findfirst(f::Function, seq::BioSequence) = findnext(f, seq, firstindex(seq))
36-
Base.findlast(f::Function, seq::BioSequence) = findprev(f, seq, lastindex(seq))
45+
# For two-bit alphabet
46+
# N.B: The isgap and isambiguous have identical definitions but are separate methods to avoid
47+
# ambiguity errors.
48+
_findnext(::typeof(isambiguous), ::BioSequence{<:NucleicAcidAlphabet{2}}, ::Int) = nothing
49+
_findprev(::typeof(isambiguous), ::BioSequence{<:NucleicAcidAlphabet{2}}, ::Int) = nothing
50+
_findnext(::typeof(isgap), ::BioSequence{<:NucleicAcidAlphabet{2}}, ::Int) = nothing
51+
_findprev(::typeof(isgap), ::BioSequence{<:NucleicAcidAlphabet{2}}, ::Int) = nothing
3752

38-
# Finding specific symbols
53+
_findnext(::typeof(iscertain), seq::BioSequence{<:NucleicAcidAlphabet{2}}, from::Int) = from
54+
_findprev(::typeof(iscertain), seq::BioSequence{<:NucleicAcidAlphabet{2}}, from::Int) = from
55+
_findnext(::typeof(!isambiguous), seq::BioSequence{<:NucleicAcidAlphabet{2}}, from::Int) = from
56+
_findprev(::typeof(!isambiguous), seq::BioSequence{<:NucleicAcidAlphabet{2}}, from::Int) = from
57+
_findnext(::typeof(!isgap), seq::BioSequence{<:NucleicAcidAlphabet{2}}, from::Int) = from
58+
_findprev(::typeof(!isgap), seq::BioSequence{<:NucleicAcidAlphabet{2}}, from::Int) = from
3959

60+
# Finding specific symbols
4061
Base.findnext(x::DNA, seq::BioSequence{<:DNAAlphabet}, start::Integer) = findnext(isequal(x), seq, start)
4162
Base.findnext(x::RNA, seq::BioSequence{<:RNAAlphabet}, start::Integer) = findnext(isequal(x), seq, start)
4263
Base.findnext(x::AminoAcid, seq::BioSequence{AminoAcidAlphabet}, start::Integer) = findnext(isequal(x), seq, start)
@@ -51,4 +72,16 @@ Base.findfirst(x::AminoAcid, seq::BioSequence{AminoAcidAlphabet}) = findfirst(is
5172

5273
Base.findlast(x::DNA, seq::BioSequence{<:DNAAlphabet}) = findlast(isequal(x), seq)
5374
Base.findlast(x::RNA, seq::BioSequence{<:RNAAlphabet}) = findlast(isequal(x), seq)
54-
Base.findlast(x::AminoAcid, seq::BioSequence{AminoAcidAlphabet}) = findlast(isequal(x), seq)
75+
Base.findlast(x::AminoAcid, seq::BioSequence{AminoAcidAlphabet}) = findlast(isequal(x), seq)
76+
77+
# Finding gaps
78+
_findnext(::typeof(isgap), seq::BioSequence, i::Int) = _findnext(==(gap(eltype(seq))), seq, i)
79+
_findprev(::typeof(isgap), seq::BioSequence, i::Int) = _findprev(==(gap(eltype(seq))), seq, i)
80+
81+
function _findnext(::ComposedFunction{typeof(!), typeof(isgap)}, seq::BioSequence, i::Int)
82+
_findnext(!=(gap(eltype(seq))), seq, i)
83+
end
84+
85+
function _findprev(::ComposedFunction{typeof(!), typeof(isgap)}, seq::BioSequence, i::Int)
86+
_findprev(!=(gap(eltype(seq))), seq, i)
87+
end

src/counting.jl

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -259,12 +259,6 @@ _n_certain(seq::BioSequence) = count_naive(iscertain, seq)
259259

260260
_n_gc(seq::BioSequence) = count_naive(isGC, seq)
261261

262-
const FourBit = SeqOrView{<:NucleicAcidAlphabet{4}}
263-
const TwoBit = SeqOrView{<:NucleicAcidAlphabet{2}}
264-
265-
const TwoBitSeq = BioSequence{<:NucleicAcidAlphabet{2}}
266-
const FourBitSeq = BioSequence{<:NucleicAcidAlphabet{4}}
267-
268262
# Trivial overloads
269263
_n_certain(s::TwoBitSeq) = length(s)
270264
_n_ambiguous(s::TwoBitSeq) = 0

src/longsequences/find.jl

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# Zeros out all nibbles except the ones with 4bit A,C,G or T
2+
@inline function iscertain_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
3+
x = enumerate_nibbles(x)
4+
y = (x & 0x4444444444444444) >> 2
5+
y |= (x & 0x2222222222222222) >> 1
6+
return x & ~y & 0x1111111111111111
7+
end
8+
9+
# Zeros out all which is not a certain (normal or AA_Term)
10+
@inline function iscertain_kernel(::AminoAcidAlphabet, x::UInt64)
11+
# 1. Set normal to FF, others to 00
12+
y = simd_lt_byte(x, 0x16)
13+
14+
# 2. Set Term to FF, others to 00
15+
z = set_zero_encoding(BitsPerSymbol{8}(), x 0x1a1a1a1a1a1a1a1a) * 0xFF
16+
17+
# 3: OR them
18+
return y | z
19+
end
20+
21+
function _findnext(::typeof(iscertain), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
22+
f = @inline u -> iscertain_kernel(Alphabet(seq), u)
23+
return _findnext_nonzero(f, seq, i)
24+
end
25+
26+
function _findprev(::typeof(iscertain), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
27+
f = @inline u -> iscertain_kernel(Alphabet(seq), u)
28+
return _findprev_nonzero(f, seq, i)
29+
end
30+
31+
@inline function simd_lt_byte(x::UInt64, byte::UInt8)
32+
T = NTuple{8, VecElement{UInt8}}
33+
x = reinterpret(T, x)
34+
y = ntuple(i -> VecElement(byte), Val{8}())
35+
s = """
36+
%res = icmp ult <8 x i8> %0, %1
37+
%resb = sext <8 x i1> %res to <8 x i8>
38+
ret <8 x i8> %resb
39+
"""
40+
z = Core.Intrinsics.llvmcall(s, NTuple{8, VecElement{UInt8}}, Tuple{NTuple{8, VecElement{UInt8}}, NTuple{8, VecElement{UInt8}}}, x, y)
41+
return reinterpret(UInt64, z)
42+
end
43+
44+
@inline function sub_byte(x::UInt64, byte::UInt8)
45+
T = NTuple{8, VecElement{UInt8}}
46+
x = reinterpret(T, x)
47+
y = ntuple(i -> VecElement(byte), Val{8}())
48+
s = """
49+
%res = sub <8 x i8> %0, %1
50+
ret <8 x i8> %res
51+
"""
52+
z = Core.Intrinsics.llvmcall(s, NTuple{8, VecElement{UInt8}}, Tuple{NTuple{8, VecElement{UInt8}}, NTuple{8, VecElement{UInt8}}}, x, y)
53+
return reinterpret(UInt64, z)
54+
end
55+
56+
# Zeros out all nibbles encoding 4bit A,C,G or T
57+
@inline function uncertain_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
58+
return enumerate_nibbles(x) 0x1111111111111111
59+
end
60+
61+
# Zero out normal AAs and AA_Term
62+
@inline function uncertain_kernel(::AminoAcidAlphabet, x::UInt64)
63+
# Zero out normal AA, set rest to 0xFF
64+
y = ~simd_lt_byte(x, 0x16)
65+
66+
# Zero out 0x1a, set rest to other bitpatterns
67+
z = x 0x1a1a1a1a1a1a1a1a
68+
69+
return y & z
70+
end
71+
72+
function _findnext(
73+
::ComposedFunction{typeof(!), typeof(iscertain)},
74+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
75+
i::Int,
76+
)
77+
f = @inline u -> uncertain_kernel(Alphabet(seq), u)
78+
return _findnext_nonzero(f, seq, i)
79+
end
80+
81+
function _findprev(
82+
::ComposedFunction{typeof(!), typeof(iscertain)},
83+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
84+
i::Int,
85+
)
86+
f = @inline u -> uncertain_kernel(Alphabet(seq), u)
87+
return _findprev_nonzero(f, seq, i)
88+
end
89+
90+
# Zeros out A, C, G, T or Gap
91+
@inline function ambiguous_kernel(A::NucleicAcidAlphabet{4}, x::UInt64)
92+
# The y part makes every nibble 0xF, unless it's 0 to begin with
93+
y = x | (x >>> 2)
94+
y |= y >>> 1
95+
y &= 0x1111111111111111
96+
y *= 0x0F
97+
return uncertain_kernel(A, x) & y
98+
end
99+
100+
# Zero out all except ambiguous symbols (AA_B, AA_J, AA_Z, AA_X), 0x16:0x19
101+
@inline function ambiguous_kernel(::AminoAcidAlphabet, x::UInt64)
102+
return simd_lt_byte(sub_byte(x, 0x16), 0x04)
103+
end
104+
105+
function _findnext(::typeof(isambiguous), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
106+
f = @inline u -> ambiguous_kernel(Alphabet(seq), u)
107+
return _findnext_nonzero(f, seq, i)
108+
end
109+
110+
function _findprev(::typeof(isambiguous), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
111+
f = @inline u -> ambiguous_kernel(Alphabet(seq), u)
112+
return _findprev_nonzero(f, seq, i)
113+
end
114+
115+
# Zeros out all except A, C, G, T and Gap
116+
@inline function unambiguous_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
117+
y = enumerate_nibbles(x)
118+
y = (y >>> 1) | (y >>> 2)
119+
y &= 0x1111111111111111
120+
return y ⊻= 0x1111111111111111
121+
end
122+
123+
# Zero out the four ambiguous amino acids B, J, Z, X
124+
@inline function unambiguous_kernel(::AminoAcidAlphabet, x::UInt64)
125+
return ~simd_lt_byte(sub_byte(x, 0x16), 0x04)
126+
end
127+
128+
function _findnext(
129+
::ComposedFunction{typeof(!), typeof(isambiguous)},
130+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
131+
i::Int,
132+
)
133+
f = @inline u -> unambiguous_kernel(Alphabet(seq), u)
134+
return _findnext_nonzero(f, seq, i)
135+
end
136+
137+
function _findprev(
138+
::ComposedFunction{typeof(!), typeof(isambiguous)},
139+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
140+
i::Int,
141+
)
142+
f = @inline u -> unambiguous_kernel(Alphabet(seq), u)
143+
return _findprev_nonzero(f, seq, i)
144+
end
145+
146+
# For debug
147+
#=
148+
function make_all_nibbles()
149+
x = UInt64(0)
150+
for i in 0:15
151+
x = (x << 4) | i
152+
end
153+
x
154+
end
155+
=#

src/longsequences/longsequence.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,9 @@ include("copying.jl")
120120
include("stringliterals.jl")
121121
include("transformations.jl")
122122
include("operators.jl")
123+
124+
const FourBit = SeqOrView{<:NucleicAcidAlphabet{4}}
125+
const TwoBit = SeqOrView{<:NucleicAcidAlphabet{2}}
126+
127+
const TwoBitSeq = BioSequence{<:NucleicAcidAlphabet{2}}
128+
const FourBitSeq = BioSequence{<:NucleicAcidAlphabet{4}}

0 commit comments

Comments
 (0)