From fd25505b8e4e390bfd1d097b7f0747322a4cef8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=E2=80=99Mara?= Date: Thu, 16 Jun 2022 01:25:10 +1000 Subject: [PATCH 1/4] Documentation for `findall` method --- src/BioSequences.jl | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/BioSequences.jl b/src/BioSequences.jl index aef223a9..77e369d9 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -252,6 +252,52 @@ Base.eltype(::Type{<:Search{Q}}) where {Q<:HasRangeEltype} = UnitRange{Int} Base.eltype(::Type{<:Search}) = Int Base.IteratorSize(::Type{<:Search}) = Base.SizeUnknown() +""" + findall(pattern, sequence::BioSequence[,rng::UnitRange{Int}]; overlap::Bool=true)::Vector + +Find all occurrences of `pattern` in `sequence`. + +The return value is a vector of ranges of indices where the matching sequences were found. +If there are no matching sequences, the return value is an empty vector. + +The search is restricted to the specified range when `rng` is set. + +With the keyword argument `overlap` set as `true`, the start index for the next search gets set to the start of the current match plus one; if set to `false`, the start index for the next search gets set to the end of the current match plus one. +The default value for the keyword argument `overlap` is `true`. + +The `pattern` can be a `Biosymbol` or a predicate. + +See also [`ExactSearchQuery`](@ref), [`ApproximateSearchQuery`](@ref), [`PWMSearchQuery`](@ref). 
+ +# Examples +```jldoctest +julia> seq = dna"ACACACAC" +8nt DNA Sequence: +ACACACAC + +julia> findall(DNA_A, seq) +4-element Vector{Int64}: + 1 + 3 + 5 + 7 + +julia> findall(ExactSearchQuery(dna"ACAC"), seq) +3-element Vector{UnitRange{Int64}}: + 1:4 + 3:6 + 5:8 + +julia> findall(ExactSearchQuery(dna"ACAC"), seq; overlap=false) +2-element Vector{UnitRange{Int64}}: + 1:4 + 5:8 + +julia> findall(ExactSearchQuery(dna"ACAC"), seq, 2:7; overlap=false) +1-element Vector{UnitRange{Int64}}: + 3:6 +``` +""" function Base.findall(pat, seq::BioSequence; overlap::Bool = DEFAULT_OVERLAP) return collect(search(pat, seq; overlap)) end From 5292edb4c7dd34da596cd7a391bb84f0452cfaa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Mon, 1 Aug 2022 21:17:55 +1000 Subject: [PATCH 2/4] Add `findall` to examples --- docs/src/sequence_search.md | 5 +++++ src/search/ExactSearchQuery.jl | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/docs/src/sequence_search.md b/docs/src/sequence_search.md index ca49b70e..ac99ea99 100644 --- a/docs/src/sequence_search.md +++ b/docs/src/sequence_search.md @@ -250,6 +250,11 @@ julia> qa = PWMSearchQuery(motifs, 1.0); julia> findfirst(qa, subject) 3 +julia> findall(qa, subject) +3-element Vector{Int64}: + 3 + 5 + 9 ``` [Wasserman2004]: https://doi.org/10.1038/nrg1315 diff --git a/src/search/ExactSearchQuery.jl b/src/search/ExactSearchQuery.jl index 948960fd..fbf08ad7 100644 --- a/src/search/ExactSearchQuery.jl +++ b/src/search/ExactSearchQuery.jl @@ -28,6 +28,11 @@ julia> findnext(query, seq, 6) julia> findprev(query, seq, 7) 3:5 +julia> findall(query, seq) +2-element Vector{UnitRange{Int64}}: + 3:5 + 8:10 + julia> occursin(query, seq) true From 871bba9b669e97bb3ed3ad22da88570ca1daf154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Mon, 1 Aug 2022 20:56:56 +1000 Subject: [PATCH 3/4] Suppress output --- src/BioSequences.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/src/BioSequences.jl b/src/BioSequences.jl index 77e369d9..f4d7ac4d 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -271,9 +271,7 @@ See also [`ExactSearchQuery`](@ref), [`ApproximateSearchQuery`](@ref), [`PWMSear # Examples ```jldoctest -julia> seq = dna"ACACACAC" -8nt DNA Sequence: -ACACACAC +julia> seq = dna"ACACACAC"; julia> findall(DNA_A, seq) 4-element Vector{Int64}: From b7223a1c6313c9ee4a717f68d4c11c49b22726ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Mon, 1 Aug 2022 20:58:15 +1000 Subject: [PATCH 4/4] Tweak nomenclature --- src/BioSequences.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BioSequences.jl b/src/BioSequences.jl index f4d7ac4d..4676840e 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -265,7 +265,7 @@ The search is restricted to the specified range when `rng` is set. With the keyword argument `overlap` set as `true`, the start index for the next search gets set to the start of the current match plus one; if set to `false`, the start index for the next search gets set to the end of the current match plus one. The default value for the keyword argument `overlap` is `true`. -The `pattern` can be a `Biosymbol` or a predicate. +The `pattern` can be a `BioSymbol` or a search query. See also [`ExactSearchQuery`](@ref), [`ApproximateSearchQuery`](@ref), [`PWMSearchQuery`](@ref).