Merged
Changes from 1 commit (25 commits)

Commits
551b98c
Updated gitignore
PyDataBlog Oct 24, 2021
80035f6
WIP on code structure
PyDataBlog Oct 26, 2021
3840cb0
Updated CI file
PyDataBlog Oct 27, 2021
0666879
WIP on features
PyDataBlog Oct 27, 2021
0d6a256
Initial architecture
PyDataBlog Oct 28, 2021
61810a6
WIP make db architecture
PyDataBlog Oct 31, 2021
de7705d
Added ngram func
PyDataBlog Oct 31, 2021
22920c1
Added ngrams
PyDataBlog Oct 31, 2021
3a1cb6c
Cleaned up
PyDataBlog Oct 31, 2021
9f36e58
Initial codebase structure
PyDataBlog Nov 8, 2021
a43fc26
Fixed module imports bug & removed utils.jl
PyDataBlog Nov 8, 2021
767ba86
Added ngram count for wordngrams
PyDataBlog Nov 9, 2021
cd4ee56
Updated user API and switched from add! to push!
PyDataBlog Nov 10, 2021
8b59186
Replaced add! with push, added examples and implemented measures
PyDataBlog Nov 12, 2021
47684f1
Proposed DB structure
PyDataBlog Nov 14, 2021
7b184d8
Switched to datastructures for dictdb
PyDataBlog Nov 23, 2021
44ad356
Switched ngram counts as vectors
PyDataBlog Dec 15, 2021
79552ef
Draft working version of DictDB
PyDataBlog Dec 18, 2021
a77ae71
Removed export of base functions
PyDataBlog Dec 18, 2021
bd07a36
Code restructure
PyDataBlog Dec 29, 2021
a63e872
Added tests for measures
PyDataBlog Dec 29, 2021
ef0e6e5
Initial draft of search functionality
PyDataBlog Jan 2, 2022
ff29e52
Working but dirty implementation of search
PyDataBlog Jan 2, 2022
a78de69
Cleaned up & prepared for switch to 0 indexing implementation
PyDataBlog Jan 3, 2022
d24bdb6
Alpha release
PyDataBlog Jan 3, 2022
Draft working version of DictDB
PyDataBlog committed Dec 18, 2021
commit 79552ef2b5da0227ed7bd98dadf4e902197b6ca6
5 changes: 4 additions & 1 deletion extras/examples.jl
@@ -29,4 +29,7 @@ results = search(db, Cosine(), "foo"; α=0.8) # yet to be implemented

bs = ["foo", "bar", "foo", "foo", "bar"]
SimString.extract_features(CharacterNGrams(3, " "), "prepress")
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")

db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")
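
As a companion to these examples, here is a minimal sketch of the flow this commit enables, with expected contents inferred from the tests added later in this diff (the exact feature strings are illustrative and assume the padding implied by the five-features-for-"foo" test):

```julia
using SimString

db = DictDB(CharacterNGrams(3, " "))
push!(db, "foo")

# "foo" lands in size bucket 5, its number of counted 3-grams ...
db.string_size_map[5]                    # Set(["foo"])

# ... and each feature is an (ngram, occurrence-count) tuple, matching
# the Tuple{String, Int64} keys asserted in the new tests.
collect(keys(db.string_feature_map[5]))  # e.g. [("  f", 1), (" fo", 1), ("foo", 1), ("oo ", 1), ("o  ", 1)]
```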
2 changes: 1 addition & 1 deletion src/SimString.jl
@@ -13,7 +13,7 @@ include("search.jl")


####### Global export of user API #######
export Dice, Jaccard, Cosine,
export Dice, Jaccard, Cosine, Overlap,
AbstractSimStringDB, DictDB,
CharacterNGrams, WordNGrams,
push!, append!, search
43 changes: 39 additions & 4 deletions src/db_collection.jl
@@ -12,14 +12,35 @@ Abstract type for feature extraction structs
abstract type FeatureExtractor end


# Feature Extraction Definitions

"""
Feature extraction on character-level ngrams
"""
struct CharacterNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
end


"""
Feature extraction based on word-level ngrams
"""
struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
splitter::T2 # string to use to split words
end


"""
Custom DB collection for storing SimString data using base Dictionary `Dict`
"""
struct DictDB{
T1<:FeatureExtractor,
T2<:AbstractString,
T3<:AbstractDict{Int64, Set{String}},
T4<:AbstractDict{Int64, DefaultOrderedDict{Vector{String}, Set{String}}}
T3<:AbstractDict,
T4<:AbstractDict,
} <: AbstractSimStringDB

feature_extractor::T1 # NGram feature extractor
@@ -29,11 +50,25 @@ struct DictDB{
end


function DictDB(x::FeatureExtractor)
"""
"""
function DictDB(x::CharacterNGrams)
DictDB(
x,
String[],
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
DefaultDict{ Int, DefaultOrderedDict{Vector{String}, Set{String}} }( () -> DefaultOrderedDict{Vector{String}, Set{String} }(Set{String}))
DefaultDict{ Int, DefaultOrderedDict{Tuple{String, Int64}, Set{String}} }( () -> DefaultOrderedDict{Tuple{String, Int64}, Set{String} }(Set{String}))
)
end


"""
"""
function DictDB(x::WordNGrams)
DictDB(
x,
String[],
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
DefaultDict{ Int, DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String} }(Set{String}))
)
end
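
To make the two new constructors concrete, a small sketch of the key shapes they produce once strings are pushed (mirrored from the tests at the end of this diff); loosening T3/T4 to plain AbstractDict is what lets the same struct carry either shape:

```julia
using SimString

cdb = DictDB(CharacterNGrams(3, " "))
wdb = DictDB(WordNGrams(2, " ", " "))
push!(cdb, "foo")
push!(wdb, "You are a really really really cool dude.")

# Inner maps are keyed by (ngram, occurrence-count) pairs; only the
# ngram type differs between the two extractors.
eltype(collect(keys(cdb.string_feature_map[5])))  # Tuple{String, Int64}
eltype(collect(keys(wdb.string_feature_map[9])))  # Tuple{Tuple{String, String}, Int64}
```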
108 changes: 26 additions & 82 deletions src/features.jl
@@ -1,40 +1,39 @@
# Feature Extraction Definitions

"""
Feature extraction on character-level ngrams
Internal function to pad AbstractString types with specified padder
"""
struct CharacterNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
function pad_string(x::AbstractString, padder::AbstractString)
return string(padder, x, padder)
end


"""
Feature extraction based on word-level ngrams
Internal function to pad AbstractVector types with specified padder
"""
struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
splitter::T2 # string to use to split words
function pad_string(x::AbstractVector, padder::AbstractString)
# Insert a padder as the first and last element of x
insert!(x, 1, padder)
push!(x, padder)
return x
end


"""
Internal function to pad AbstractString types with specified padder
Internal function to generate initial uncounted ngrams on a character level
"""
function pad_string(x::AbstractString, padder::AbstractString)
return string(padder, x, padder)
function init_ngrams(extractor::CharacterNGrams, x, n)
map(0:length(x)-n) do i
x[i+1: i+n]
end
end


"""
Internal function to pad AbstractVector types with specified padder
Internal function to generate initial uncounted ngrams on a word level
"""
function pad_string(x::AbstractVector, padder::AbstractString)
# Insert a padder as the first and last element of x
insert!(x, 1, padder)
push!(x, padder)
return x
function init_ngrams(extractor::WordNGrams, x, n)
map(0:length(x)-n) do i
tuple(String.(x[i+1: i+n])...)
end
end


@@ -43,18 +42,15 @@ Internal function to create character-level ngrams features from an AbstractString
"""
function n_grams(extractor::CharacterNGrams, x, n)
# Return counted n-grams (including duplicates)
return cummulative_ngram_count(extractor, [x[i+1: i+n] for i in 0:length(x) - n])

return cummulative_ngram_count(init_ngrams(extractor, x, n))
end


"""
Internal function to create word-level ngrams from an AbstractVector
"""
function n_grams(extractor::WordNGrams, x, n)
# [tuple(x[i+1: i+n]...) for i in 0:length(x) - n]
init_grams = [x[i+1: i+n] for i in 0:length(x) - n]
return cummulative_ngram_count(extractor, init_grams)
return cummulative_ngram_count(init_ngrams(extractor, x, n))
end


@@ -78,71 +74,19 @@ function extract_features(extractor::WordNGrams, str)
end



# """
# Internal function to count and pad generated character-level ngrams (including duplicates)
# """
# function cummulative_ngram_count(extractor::CharacterNGrams, x)
# p1 = sortperm(x)
# p2 = sortperm(p1)
# x = sort(x)

# results = String[]
# counter = 0
# last_i, rest = Iterators.peel(x)

# push!(results, string(last_i, "#", counter += 1))

# for i in rest
# counter = i == last_i ? counter + 1 : 1
# last_i = i
# push!(results, string(i, "#", counter))
# end
# return results[p2]
# end


"""
Internal function to count and pad generated character-level ngrams (including duplicates)
"""
function cummulative_ngram_count(extractor::WordNGrams, x)
p1 = sortperm(x)
p2 = sortperm(p1)
x = sort(x)

results = Vector{Vector{String}}()
counter = 0
last_i, rest = Iterators.peel(x)

push!(last_i, "#$(counter += 1)")
push!(results, last_i)

for i in rest
counter = i == last_i[1:extractor.n] ? counter + 1 : 1
last_i = i

push!(last_i, "#$(counter)")
push!(results, last_i)
end
return results[p2]
end


"""
Internal function to count and pad generated character-level ngrams (including duplicates)
"""
function cummulative_ngram_count(extractor::CharacterNGrams, x)
counter = Dict{String, Int}()
unique_list = Vector{Vector{String}}()
function cummulative_ngram_count(x)
counter = Dict{eltype(x), Int}()

for val in x
unique_list = map(x) do val
if val in keys(counter)
counter[val] += 1
else
counter[val] = 1
end
# push!(unique_list, string(val, "#", counter[val]))
push!(unique_list, [val, string("#", counter[val])])
(val, counter[val])
end

return unique_list
@@ -180,7 +124,7 @@ Add bulk items to a new or existing collection of strings using
the custom AbstractSimStringDB type.
"""
function append!(db::AbstractSimStringDB, str::Vector)
@inbounds for i in str
@inbounds @simd for i in str
push!(db, i)
end
end
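
The refactor collapses the two sortperm-based counters into the single generic pass above. A standalone sketch of its behavior on a small input (same logic as the diff, renamed here so it does not shadow the package's internal function):

```julia
# Mirror of the generic counter: tag each ngram with a running
# occurrence count so duplicate ngrams remain distinct features.
function counted_ngrams(x)
    counter = Dict{eltype(x), Int}()
    map(x) do val
        if val in keys(counter)
            counter[val] += 1
        else
            counter[val] = 1
        end
        (val, counter[val])
    end
end

counted_ngrams(["foo", "bar", "foo", "foo"])
# 4-element Vector{Tuple{String, Int64}}:
#  ("foo", 1)
#  ("bar", 1)
#  ("foo", 2)
#  ("foo", 3)
```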
26 changes: 25 additions & 1 deletion src/measures.jl
@@ -1,4 +1,4 @@
# String Similarity Measure Definitions
############## String Similarity Measure Definitions ##############

"""
Abstract base type for all string similarity measures.
@@ -30,6 +30,8 @@ Overlap Similarity Measure.
struct Overlap <: AbstractSimilarityMeasure end



############## Minimum Feature Sizes Per Measure ##############
"""
Calculate minimum feature size for Dice similarity measure.
"""
@@ -54,6 +56,16 @@ function minimum_feature_size(measure::Cosine, query_size, α)
end


"""
Calculate minimum feature size for Overlap similarity measure.
"""
function minimum_feature_size(measure::Overlap, query_size, α)
return 1
end


############## Maximum Feature Size Per Measure ##############

"""
Calculate maximum feature size for Dice similarity measure.
"""
@@ -78,6 +90,16 @@ function maximum_feature_size(measure::Cosine, query_size, α)
end


"""
Calculate maximum feature size for Overlap similarity measure.
"""
function maximum_feature_size(measure::Overlap, query_size, α)
return typemax(Int)
end



############## Similarity Score Per Measure ##############
"""
Calculate similarity score between X and Y using Dice similarity measure.
"""
@@ -110,6 +132,8 @@ function similarity_score(measure::Overlap, X, Y)
end



############## Number of Minimum Overlaps Per Measure ##############
"""
Calculate the minimum overlap for a query size, candidate size, and α
using Dice similarity measure.
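
The new Overlap bounds are deliberately degenerate: assuming the usual overlap coefficient |X ∩ Y| / min(|X|, |Y|), a candidate of any size can still reach similarity α, so the size filter cannot prune anything. A quick sketch (these helpers are internal, hence the module prefix):

```julia
using SimString

# For a 5-feature query at α = 0.7, Dice/Jaccard/Cosine would bound the
# candidate sizes, but Overlap admits every size bucket:
SimString.minimum_feature_size(Overlap(), 5, 0.7)  # 1
SimString.maximum_feature_size(Overlap(), 5, 0.7)  # typemax(Int)
```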
51 changes: 50 additions & 1 deletion test/test01_db_collection.jl
@@ -3,15 +3,64 @@ using SimString
using Test


@testset "Check updating of db" begin
@testset "Check single updates of DictDB using CharacterNGrams" begin
db = DictDB(CharacterNGrams(3, " "))
push!(db, "foo")
push!(db, "bar")
push!(db, "fooo")

@test db.string_collection == ["foo", "bar", "fooo"]
@test db.string_size_map[5] == Set(["bar", "foo"])
@test db.string_size_map[6] == Set(["fooo"])

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
end


@testset "Check single update of DictDB using WordNGrams" begin
db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")

@test db.string_collection == ["You are a really really really cool dude."]
@test db.string_size_map[9] == Set(["You are a really really really cool dude."])
@test collect(keys(db.string_feature_map)) == [9]
@test collect(values(db.string_feature_map[9])) == repeat([Set(["You are a really really really cool dude."])], 9)
end


@testset "Check bulk updates of DictDB using CharacterNGrams" begin
db = DictDB(CharacterNGrams(3, " "))
append!(db, ["foo", "bar", "fooo"])

@test db.string_collection == ["foo", "bar", "fooo"]
@test db.string_size_map[5] == Set(["bar", "foo"])
@test db.string_size_map[6] == Set(["fooo"])

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
end


@testset "Check bulk updates of DictDB using WordNGrams" begin
db = DictDB(WordNGrams(2, " ", " "))
append!(db, ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
@test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test collect(keys(db.string_feature_map)) == [9]
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
end
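
The expected counts in these tests follow from the padding arithmetic: a string of m base tokens, padded so every token takes part in n n-grams, yields m + n - 1 counted features, and that count doubles as the string_size_map bucket key. A worked check of the numbers above (nfeatures is a hypothetical helper, not part of the package):

```julia
# CharacterNGrams(3, " "): "foo" has 3 characters -> 5 trigram features
# (bucket 5); "fooo" -> 6 (bucket 6).
nfeatures(m, n) = m + n - 1
nfeatures(length("foo"), 3)   # 5
nfeatures(length("fooo"), 3)  # 6

# WordNGrams(2, " ", " "): the test sentence splits into 8 words ->
# 9 bigram features (bucket 9).
nfeatures(length(split("You are a really really really cool dude.")), 2)  # 9
```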


