Skip to content

Commit f2c5119

Browse files
committed
AA and fourbit preds
1 parent 37f578e commit f2c5119

File tree

2 files changed

+86
-15
lines changed

2 files changed

+86
-15
lines changed

src/longsequences/find.jl

Lines changed: 67 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# Zeros out all nibbles except the ones with 4bit A,C,G or T
2-
function iscertain_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
2+
@inline function iscertain_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
33
x = enumerate_nibbles(x)
44
y = (x & 0x4444444444444444) >> 2
55
y |= (x & 0x2222222222222222) >> 1
66
return x & ~y & 0x1111111111111111
77
end
88

99
# Zeros out all which is not a certain (normal or AA_Term)
10-
function iscertain_kernel(::AminoAcidAlphabet, x::UInt64)
10+
@inline function iscertain_kernel(::AminoAcidAlphabet, x::UInt64)
1111
# 1. Set normal to FF, others to 00
1212
y = simd_lt_byte(x, 0x16)
1313

@@ -18,6 +18,16 @@ function iscertain_kernel(::AminoAcidAlphabet, x::UInt64)
1818
return y | z
1919
end
2020

21+
function _findnext(::typeof(iscertain), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
22+
f = @inline u -> iscertain_kernel(Alphabet(seq), u)
23+
return _findnext_nonzero(f, seq, i)
24+
end
25+
26+
function _findprev(::typeof(iscertain), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
27+
f = @inline u -> iscertain_kernel(Alphabet(seq), u)
28+
return _findprev_nonzero(f, seq, i)
29+
end
30+
2131
@inline function simd_lt_byte(x::UInt64, byte::UInt8)
2232
T = NTuple{8, VecElement{UInt8}}
2333
x = reinterpret(T, x)
@@ -28,7 +38,7 @@ end
2838
ret <8 x i8> %resb
2939
"""
3040
z = Core.Intrinsics.llvmcall(s, NTuple{8, VecElement{UInt8}}, Tuple{NTuple{8, VecElement{UInt8}}, NTuple{8, VecElement{UInt8}}}, x, y)
31-
reinterpret(UInt64, z)
41+
return reinterpret(UInt64, z)
3242
end
3343

3444
@inline function sub_byte(x::UInt64, byte::UInt8)
@@ -40,14 +50,16 @@ end
4050
ret <8 x i8> %res
4151
"""
4252
z = Core.Intrinsics.llvmcall(s, NTuple{8, VecElement{UInt8}}, Tuple{NTuple{8, VecElement{UInt8}}, NTuple{8, VecElement{UInt8}}}, x, y)
43-
reinterpret(UInt64, z)
53+
return reinterpret(UInt64, z)
4454
end
4555

4656
# Zeros out all nibbles encoding 4bit A,C,G or T
47-
uncertain_kernel(::NucleicAcidAlphabet{4}, x::UInt64) = enumerate_nibbles(x) 0x1111111111111111
57+
@inline function uncertain_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
58+
return enumerate_nibbles(x) 0x1111111111111111
59+
end
4860

4961
# Zero out normal AAs and AA_Term
50-
function uncertain_kernel(::AminoAcidAlphabet, x::UInt64)
62+
@inline function uncertain_kernel(::AminoAcidAlphabet, x::UInt64)
5163
# Zero out normal AA, set rest to 0xFF
5264
y = ~simd_lt_byte(x, 0x16)
5365

@@ -57,8 +69,26 @@ function uncertain_kernel(::AminoAcidAlphabet, x::UInt64)
5769
return y & z
5870
end
5971

72+
function _findnext(
73+
::ComposedFunction{typeof(!), typeof(iscertain)},
74+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
75+
i::Int,
76+
)
77+
f = @inline u -> uncertain_kernel(Alphabet(seq), u)
78+
return _findnext_nonzero(f, seq, i)
79+
end
80+
81+
function _findprev(
82+
::ComposedFunction{typeof(!), typeof(iscertain)},
83+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
84+
i::Int,
85+
)
86+
f = @inline u -> uncertain_kernel(Alphabet(seq), u)
87+
return _findprev_nonzero(f, seq, i)
88+
end
89+
6090
# Zeros out A, C, G, T or Gap
61-
function ambiguous_kernel(A::NucleicAcidAlphabet{4}, x::UInt64)
91+
@inline function ambiguous_kernel(A::NucleicAcidAlphabet{4}, x::UInt64)
6292
# The y part makes every nibble 0xF, unless it's 0 to begin with
6393
y = x | (x >>> 2)
6494
y |= y >>> 1
@@ -68,23 +98,50 @@ function ambiguous_kernel(A::NucleicAcidAlphabet{4}, x::UInt64)
6898
end
6999

70100
# Zero out all except ambiguous symbols (AA_B, AA_J, AA_Z, AA_X), 0x16:0x19
71-
function ambiguous_kernel(::AminoAcidAlphabet, x::UInt64)
101+
@inline function ambiguous_kernel(::AminoAcidAlphabet, x::UInt64)
72102
return simd_lt_byte(sub_byte(x, 0x16), 0x04)
73103
end
74104

105+
function _findnext(::typeof(isambiguous), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
106+
f = @inline u -> ambiguous_kernel(Alphabet(seq), u)
107+
return _findnext_nonzero(f, seq, i)
108+
end
109+
110+
function _findprev(::typeof(isambiguous), seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}}, i::Int)
111+
f = @inline u -> ambiguous_kernel(Alphabet(seq), u)
112+
return _findprev_nonzero(f, seq, i)
113+
end
114+
75115
# Zeros out all except A, C, G, T and Gap
76-
function unambiguous_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
116+
@inline function unambiguous_kernel(::NucleicAcidAlphabet{4}, x::UInt64)
77117
y = enumerate_nibbles(x)
78118
y = (y >>> 1) | (y >>> 2)
79119
y &= 0x1111111111111111
80120
return y ⊻= 0x1111111111111111
81121
end
82122

83123
# Zero out the four ambiguous amino acids B, J, Z, X
84-
function unambiguous_kernel(::AminoAcidAlphabet, x::UInt64)
124+
@inline function unambiguous_kernel(::AminoAcidAlphabet, x::UInt64)
85125
return ~simd_lt_byte(sub_byte(x, 0x16), 0x04)
86126
end
87127

128+
function _findnext(
129+
::ComposedFunction{typeof(!), typeof(isambiguous)},
130+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
131+
i::Int,
132+
)
133+
f = @inline u -> unambiguous_kernel(Alphabet(seq), u)
134+
return _findnext_nonzero(f, seq, i)
135+
end
136+
137+
function _findprev(
138+
::ComposedFunction{typeof(!), typeof(isambiguous)},
139+
seq::SeqOrView{<:Union{<:NucleicAcidAlphabet{4}, AminoAcidAlphabet}},
140+
i::Int,
141+
)
142+
f = @inline u -> unambiguous_kernel(Alphabet(seq), u)
143+
return _findprev_nonzero(f, seq, i)
144+
end
88145

89146
# For debug
90147
#=

src/longsequences/operators.jl

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -273,9 +273,7 @@ function _findnext(
273273
enc === nothing && return nothing
274274
u_enc = enc * encoding_expansion(BitsPerSymbol(seq))
275275
f = x -> set_zero_encoding(BitsPerSymbol(seq), x u_enc)
276-
vw = @inbounds view(seq, i:lastindex(seq))
277-
res = _findfirst_nonzero(f, vw)
278-
res === nothing ? nothing : res + i - 1
276+
_findnext_nonzero(f, seq, i)
279277
end
280278

281279
function _findnext(
@@ -287,11 +285,20 @@ function _findnext(
287285
enc === nothing && return nothing
288286
u_enc = enc * encoding_expansion(BitsPerSymbol(seq))
289287
f = x -> x u_enc
288+
_findnext_nonzero(f, seq, i)
289+
end
290+
291+
@inline function _findnext_nonzero(
292+
f,
293+
seq::SeqOrView{<:KNOWN_ALPHABETS},
294+
i::Int,
295+
)
290296
vw = @inbounds view(seq, i:lastindex(seq))
291297
res = _findfirst_nonzero(f, vw)
292298
res === nothing ? nothing : res + i - 1
293299
end
294300

301+
295302
# Find first index of seq, where f is a function that takes an encodung UInt64
296303
# and will zero out every coding element we are not looking for.
297304
function _findfirst_nonzero(f::Function, seq::SeqOrView{<:KNOWN_ALPHABETS})
@@ -348,8 +355,7 @@ function _findprev(
348355
enc === nothing && return nothing
349356
u_enc = enc * encoding_expansion(BitsPerSymbol(seq))
350357
f = x -> set_zero_encoding(BitsPerSymbol(seq), x u_enc)
351-
vw = @inbounds view(seq, 1:i)
352-
_findlast_nonzero(f, vw)
358+
_findprev_nonzero(f, seq, i)
353359
end
354360

355361
function _findprev(
@@ -361,6 +367,14 @@ function _findprev(
361367
enc === nothing && return nothing
362368
u_enc = enc * encoding_expansion(BitsPerSymbol(seq))
363369
f = x -> x u_enc
370+
_findprev_nonzero(f, seq, i)
371+
end
372+
373+
function _findprev_nonzero(
374+
f,
375+
seq::SeqOrView{<:KNOWN_ALPHABETS},
376+
i::Int,
377+
)
364378
vw = @inbounds view(seq, 1:i)
365379
_findlast_nonzero(f, vw)
366380
end

0 commit comments

Comments
 (0)