Merged
Changes from 1 commit (25 commits)

Commits
551b98c
Updated gitignore
PyDataBlog Oct 24, 2021
80035f6
WIP on code structure
PyDataBlog Oct 26, 2021
3840cb0
Updated CI file
PyDataBlog Oct 27, 2021
0666879
WIP on features
PyDataBlog Oct 27, 2021
0d6a256
Initial architecture
PyDataBlog Oct 28, 2021
61810a6
WIP make db architecture
PyDataBlog Oct 31, 2021
de7705d
Added ngram func
PyDataBlog Oct 31, 2021
22920c1
Added ngrams
PyDataBlog Oct 31, 2021
3a1cb6c
Cleaned up
PyDataBlog Oct 31, 2021
9f36e58
Initial codebase structure
PyDataBlog Nov 8, 2021
a43fc26
Fixed module imports bug & removed utils.jl
PyDataBlog Nov 8, 2021
767ba86
Added ngram count for wordngrams
PyDataBlog Nov 9, 2021
cd4ee56
Updated user API and switched from add! to push!
PyDataBlog Nov 10, 2021
8b59186
Replaced add! with push, added examples and implemented measures
PyDataBlog Nov 12, 2021
47684f1
Proposed DB structure
PyDataBlog Nov 14, 2021
7b184d8
Switched to datastructures for dictdb
PyDataBlog Nov 23, 2021
44ad356
Switched ngram counts as vectors
PyDataBlog Dec 15, 2021
79552ef
Draft working version of DictDB
PyDataBlog Dec 18, 2021
a77ae71
Removed export of base functions
PyDataBlog Dec 18, 2021
bd07a36
Code restructure
PyDataBlog Dec 29, 2021
a63e872
Added tests for measures
PyDataBlog Dec 29, 2021
ef0e6e5
Initial draft of search functionality
PyDataBlog Jan 2, 2022
ff29e52
Working but dirty implementation of search
PyDataBlog Jan 2, 2022
a78de69
Cleaned up & prepared for switch to 0 indexing implementation
PyDataBlog Jan 3, 2022
d24bdb6
Alpha release
PyDataBlog Jan 3, 2022
Draft working version of DictDB
PyDataBlog committed Dec 18, 2021
commit 79552ef2b5da0227ed7bd98dadf4e902197b6ca6
5 changes: 4 additions & 1 deletion extras/examples.jl
@@ -29,4 +29,7 @@ results = search(db, Cosine(), "foo"; α=0.8) # yet to be implemented

bs = ["foo", "bar", "foo", "foo", "bar"]
SimString.extract_features(CharacterNGrams(3, " "), "prepress")
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")

db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")
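
As a companion to these examples, here is a minimal sketch of the flow this commit enables, with expected contents inferred from the tests added later in this diff (the exact feature strings are illustrative and assume the padding implied by the five-features-for-"foo" test):

```julia
using SimString

db = DictDB(CharacterNGrams(3, " "))
push!(db, "foo")

# "foo" lands in size bucket 5, its number of counted 3-grams ...
db.string_size_map[5]                    # Set(["foo"])

# ... and each feature is an (ngram, occurrence-count) tuple, matching
# the Tuple{String, Int64} keys asserted in the new tests.
collect(keys(db.string_feature_map[5]))  # e.g. [("  f", 1), (" fo", 1), ("foo", 1), ("oo ", 1), ("o  ", 1)]
```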
2 changes: 1 addition & 1 deletion src/SimString.jl
@@ -13,7 +13,7 @@ include("search.jl")


####### Global export of user API #######
export Dice, Jaccard, Cosine,
export Dice, Jaccard, Cosine, Overlap,
AbstractSimStringDB, DictDB,
CharacterNGrams, WordNGrams,
push!, append!, search
43 changes: 39 additions & 4 deletions src/db_collection.jl
@@ -12,14 +12,35 @@ Abstract type for feature extraction structs
abstract type FeatureExtractor end


# Feature Extraction Definitions

"""
Feature extraction on character-level ngrams
"""
struct CharacterNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
end


"""
Feature extraction based on word-level ngrams
"""
struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
splitter::T2 # string to use to split words
end


"""
Custom DB collection for storing SimString data using base Dictionary `Dict`
"""
struct DictDB{
T1<:FeatureExtractor,
T2<:AbstractString,
T3<:AbstractDict{Int64, Set{String}},
T4<:AbstractDict{Int64, DefaultOrderedDict{Vector{String}, Set{String}}}
T3<:AbstractDict,
T4<:AbstractDict,
} <: AbstractSimStringDB

feature_extractor::T1 # NGram feature extractor
@@ -29,11 +50,25 @@ struct DictDB{
end


function DictDB(x::FeatureExtractor)
"""
"""
function DictDB(x::CharacterNGrams)
DictDB(
x,
String[],
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
DefaultDict{ Int, DefaultOrderedDict{Vector{String}, Set{String}} }( () -> DefaultOrderedDict{Vector{String}, Set{String} }(Set{String}))
DefaultDict{ Int, DefaultOrderedDict{Tuple{String, Int64}, Set{String}} }( () -> DefaultOrderedDict{Tuple{String, Int64}, Set{String} }(Set{String}))
)
end


"""
"""
function DictDB(x::WordNGrams)
DictDB(
x,
String[],
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
DefaultDict{ Int, DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String} }(Set{String}))
)
end
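
To make the two new constructors concrete, a small sketch of the key shapes they produce once strings are pushed (mirrored from the tests at the end of this diff); loosening T3/T4 to plain AbstractDict is what lets the same struct carry either shape:

```julia
using SimString

cdb = DictDB(CharacterNGrams(3, " "))
wdb = DictDB(WordNGrams(2, " ", " "))
push!(cdb, "foo")
push!(wdb, "You are a really really really cool dude.")

# Inner maps are keyed by (ngram, occurrence-count) pairs; only the
# ngram type differs between the two extractors.
eltype(collect(keys(cdb.string_feature_map[5])))  # Tuple{String, Int64}
eltype(collect(keys(wdb.string_feature_map[9])))  # Tuple{Tuple{String, String}, Int64}
```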
108 changes: 26 additions & 82 deletions src/features.jl
@@ -1,40 +1,39 @@
# Feature Extraction Definitions

"""
Feature extraction on character-level ngrams
Internal function to pad AbstractString types with specified padder
"""
struct CharacterNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
function pad_string(x::AbstractString, padder::AbstractString)
return string(padder, x, padder)
end


"""
Feature extraction based on word-level ngrams
Internal function to pad AbstractVector types with specified padder
"""
struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
splitter::T2 # string to use to split words
function pad_string(x::AbstractVector, padder::AbstractString)
# Insert a padder as the first and last element of x
insert!(x, 1, padder)
push!(x, padder)
return x
end


"""
Internal function to pad AbstractString types with specified padder
Internal function to generate initial uncounted ngrams on a character level
"""
function pad_string(x::AbstractString, padder::AbstractString)
return string(padder, x, padder)
function init_ngrams(extractor::CharacterNGrams, x, n)
map(0:length(x)-n) do i
x[i+1: i+n]
end
end


"""
Internal function to pad AbstractVector types with specified padder
Internal function to generate initial uncounted ngrams on a word level
"""
function pad_string(x::AbstractVector, padder::AbstractString)
# Insert a padder as the first and last element of x
insert!(x, 1, padder)
push!(x, padder)
return x
function init_ngrams(extractor::WordNGrams, x, n)
map(0:length(x)-n) do i
tuple(String.(x[i+1: i+n])...)
end
end


@@ -43,18 +42,15 @@ Internal function to create character-level ngrams features from an AbstractString
"""
function n_grams(extractor::CharacterNGrams, x, n)
# Return counted n-grams (including duplicates)
return cummulative_ngram_count(extractor, [x[i+1: i+n] for i in 0:length(x) - n])

return cummulative_ngram_count(init_ngrams(extractor, x, n))
end


"""
Internal function to create word-level ngrams from an AbstractVector
"""
function n_grams(extractor::WordNGrams, x, n)
# [tuple(x[i+1: i+n]...) for i in 0:length(x) - n]
init_grams = [x[i+1: i+n] for i in 0:length(x) - n]
return cummulative_ngram_count(extractor, init_grams)
return cummulative_ngram_count(init_ngrams(extractor, x, n))
end


@@ -78,71 +74,19 @@ function extract_features(extractor::WordNGrams, str)
end



# """
# Internal function to count and pad generated character-level ngrams (including duplicates)
# """
# function cummulative_ngram_count(extractor::CharacterNGrams, x)
# p1 = sortperm(x)
# p2 = sortperm(p1)
# x = sort(x)

# results = String[]
# counter = 0
# last_i, rest = Iterators.peel(x)

# push!(results, string(last_i, "#", counter += 1))

# for i in rest
# counter = i == last_i ? counter + 1 : 1
# last_i = i
# push!(results, string(i, "#", counter))
# end
# return results[p2]
# end


"""
Internal function to count and pad generated character-level ngrams (including duplicates)
"""
function cummulative_ngram_count(extractor::WordNGrams, x)
p1 = sortperm(x)
p2 = sortperm(p1)
x = sort(x)

results = Vector{Vector{String}}()
counter = 0
last_i, rest = Iterators.peel(x)

push!(last_i, "#$(counter += 1)")
push!(results, last_i)

for i in rest
counter = i == last_i[1:extractor.n] ? counter + 1 : 1
last_i = i

push!(last_i, "#$(counter)")
push!(results, last_i)
end
return results[p2]
end


"""
Internal function to count and pad generated character-level ngrams (including duplicates)
"""
function cummulative_ngram_count(extractor::CharacterNGrams, x)
counter = Dict{String, Int}()
unique_list = Vector{Vector{String}}()
function cummulative_ngram_count(x)
counter = Dict{eltype(x), Int}()

for val in x
unique_list = map(x) do val
if val in keys(counter)
counter[val] += 1
else
counter[val] = 1
end
# push!(unique_list, string(val, "#", counter[val]))
push!(unique_list, [val, string("#", counter[val])])
(val, counter[val])
end

return unique_list
@@ -180,7 +124,7 @@ Add bulk items to a new or existing collection of strings using
the custom AbstractSimStringDB type.
"""
function append!(db::AbstractSimStringDB, str::Vector)
@inbounds for i in str
@inbounds @simd for i in str
push!(db, i)
end
end
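
The refactor collapses the two sortperm-based counters into the single generic pass above. A standalone sketch of its behavior on a small input (same logic as the diff, renamed here so it does not shadow the package's internal function):

```julia
# Mirror of the generic counter: tag each ngram with a running
# occurrence count so duplicate ngrams remain distinct features.
function counted_ngrams(x)
    counter = Dict{eltype(x), Int}()
    map(x) do val
        if val in keys(counter)
            counter[val] += 1
        else
            counter[val] = 1
        end
        (val, counter[val])
    end
end

counted_ngrams(["foo", "bar", "foo", "foo"])
# 4-element Vector{Tuple{String, Int64}}:
#  ("foo", 1)
#  ("bar", 1)
#  ("foo", 2)
#  ("foo", 3)
```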
26 changes: 25 additions & 1 deletion src/measures.jl
@@ -1,4 +1,4 @@
# String Similarity Measure Definitions
############## String Similarity Measure Definitions ##############

"""
Abstract base type for all string similarity measures.
@@ -30,6 +30,8 @@ Overlap Similarity Measure.
struct Overlap <: AbstractSimilarityMeasure end



############## Minimum Feature Sizes Per Measure ##############
"""
Calculate minimum feature size for Dice similarity measure.
"""
@@ -54,6 +56,16 @@ function minimum_feature_size(measure::Cosine, query_size, α)
end


"""
Calculate minimum feature size for Overlap similarity measure.
"""
function minimum_feature_size(measure::Overlap, query_size, α)
return 1
end


############## Maximum Feature Size Per Measure ##############

"""
Calculate maximum feature size for Dice similarity measure.
"""
@@ -78,6 +90,16 @@ function maximum_feature_size(measure::Cosine, query_size, α)
end


"""
Calculate maximum feature size for Overlap similarity measure.
"""
function maximum_feature_size(measure::Overlap, query_size, α)
return typemax(Int)
end



############## Similarity Score Per Measure ##############
"""
Calculate similarity score between X and Y using Dice similarity measure.
"""
@@ -110,6 +132,8 @@ function similarity_score(measure::Overlap, X, Y)
end



############## Number of Minimum Overlaps Per Measure ##############
"""
Calculate the minimum overlap for a query size, candidate size, and α
using Dice similarity measure.
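
The new Overlap bounds are deliberately degenerate: assuming the usual overlap coefficient |X ∩ Y| / min(|X|, |Y|), a candidate of any size can still reach similarity α, so the size filter cannot prune anything. A quick sketch (these helpers are internal, hence the module prefix):

```julia
using SimString

# For a 5-feature query at α = 0.7, Dice/Jaccard/Cosine would bound the
# candidate sizes, but Overlap admits every size bucket:
SimString.minimum_feature_size(Overlap(), 5, 0.7)  # 1
SimString.maximum_feature_size(Overlap(), 5, 0.7)  # typemax(Int)
```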
51 changes: 50 additions & 1 deletion test/test01_db_collection.jl
@@ -3,15 +3,64 @@ using SimString
using Test


@testset "Check updating of db" begin
@testset "Check single updates of DictDB using CharacterNGrams" begin
db = DictDB(CharacterNGrams(3, " "))
push!(db, "foo")
push!(db, "bar")
push!(db, "fooo")

@test db.string_collection == ["foo", "bar", "fooo"]
@test db.string_size_map[5] == Set(["bar", "foo"])
@test db.string_size_map[6] == Set(["fooo"])

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
end


@testset "Check single update of DictDB using WordNGrams" begin
db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")

@test db.string_collection == ["You are a really really really cool dude."]
@test db.string_size_map[9] == Set(["You are a really really really cool dude."])
@test collect(keys(db.string_feature_map)) == [9]
@test collect(values(db.string_feature_map[9])) == repeat([Set(["You are a really really really cool dude."])], 9)
end


@testset "Check bulk updates of DictDB using CharacterNGrams" begin
db = DictDB(CharacterNGrams(3, " "))
append!(db, ["foo", "bar", "fooo"])

@test db.string_collection == ["foo", "bar", "fooo"]
@test db.string_size_map[5] == Set(["bar", "foo"])
@test db.string_size_map[6] == Set(["fooo"])

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
end


@testset "Check bulk updates of DictDB using WordNGrams" begin
db = DictDB(WordNGrams(2, " ", " "))
append!(db, ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
@test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test collect(keys(db.string_feature_map)) == [9]
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
end
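
The expected counts in these tests follow from the padding arithmetic: a string of m base tokens, padded so every token takes part in n n-grams, yields m + n - 1 counted features, and that count doubles as the string_size_map bucket key. A worked check of the numbers above (nfeatures is a hypothetical helper, not part of the package):

```julia
# CharacterNGrams(3, " "): "foo" has 3 characters -> 5 trigram features
# (bucket 5); "fooo" -> 6 (bucket 6).
nfeatures(m, n) = m + n - 1
nfeatures(length("foo"), 3)   # 5
nfeatures(length("fooo"), 3)  # 6

# WordNGrams(2, " ", " "): the test sentence splits into 8 words ->
# 9 bigram features (bucket 9).
nfeatures(length(split("You are a really really really cool dude.")), 2)  # 9
```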


