Zero index (#6)

PyDataBlog · web-flow · commit 504460e9ce1f · 2022-01-23T14:19:55.000+01:00
* Beta release (0-index implementation) 
* Added basic summary description for dictdb collection
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -16,13 +16,16 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
+- [ ] Support for building databases directly from text files
+- [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
 
 - [X] Dice coefficient
 - [X] Jaccard coefficient
 - [X] Cosine coefficient
 - [X] Overlap coefficient
+- [X] Exact match
 
 ## Installation
 
@@ -67,7 +70,9 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 #  ("foo", 1.0)
 #  ("fooo", 0.8888888888888888)
 
-
+# Describe a working database collection
+desc = describe_collection(db)
+# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
 ```
 
 ## TODO: Benchmarks
diff --git a/extras/examples.jl b/extras/examples.jl
@@ -21,14 +21,16 @@ push!(db, "foo");
 push!(db, "bar");
 push!(db, "fooo");
 
-f(x, c, s) = search(x, c, s)
+f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
 test = "foo";
 col = db;
 sim = Cosine();
+a = 0.8;
+r = true;
 
-f(Cosine(),  db, "foo")
+f(Cosine(),  db, "foo", 0.8, true)
 
-@btime f($sim,  $col, $test)
+@btime f($sim,  $col, $test, $a, $r)
 @btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
 
 
diff --git a/extras/py_benchmarks.py b/extras/py_benchmarks.py
@@ -13,4 +13,4 @@ def f(x):
     for i in x:
         db.add(i)
 
-# %time f(fake_names)
+# %time f(fake_names)
diff --git a/src/SimString.jl b/src/SimString.jl
@@ -2,9 +2,9 @@ module SimString
 
 import Base: push!, append!
 using DataStructures: DefaultOrderedDict, DefaultDict
-# using ProgressMeter
-# using CircularArrays
-# using OffsetArrays
+using ProgressMeter
+using CircularArrays
+using OffsetArrays
 
 ######### Import modules & utils ################
 include("db_collection.jl")
@@ -16,8 +16,8 @@ include("search.jl")
 
 
 ####### Global export of user API #######
-export Dice, Jaccard, Cosine, Overlap,
-    AbstractSimStringDB, DictDB,
+export Dice, Jaccard, Cosine, Overlap, ExactMatch,
+    AbstractSimStringDB, DictDB, describe_collection,
     CharacterNGrams, WordNGrams,
     search
 
diff --git a/src/dictdb.jl b/src/dictdb.jl
@@ -75,28 +75,49 @@ end
 
 ################################## DictDB UTIL Functions  ############################
 """
-Internal function for retrieving existing features by size
+    describe_collection(db::DictDB)
+
+Basic summary stats for the DB
+
+# Arguments
+* `db`: DictDB object
+
+# Example
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, ["foo", "bar", "fooo"]);
+describe_collection(db)
+```
+
+# Returns
+* `NamedTuple`: Summary stats for the DB
+
 """
-function retrieve_existing_feature_by_size(db::DictDB, size, feature)
-    return db.string_feature_map[size][feature]
-end
+function describe_collection(db::DictDB)
 
+# Total number of strings in collection
+∑ = length(db.string_collection)
 
-# """
-# Basic summary stats for the DB
-# """
-# function describe_db(db::DictDB)
+# Average number of ngram features
+n = [x for x in keys(db.string_size_map)]
+μ = sum(n) / length(n)
 
-# end
+# Total number of ngram features
+total_ngrams = 0
+for i in values(db.string_feature_map)
+    total_ngrams += length(i)
+end
+
+return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
+end
 
 
 """
 Internal function to lookup feature sets by size and feature
 """
 function lookup_feature_set_by_size_feature(db::DictDB, size, feature)
-    # TODO: Clean this up and make it more efficient. Shouldn't updated db.string_feature_map
     if feature ∉ keys(db.lookup_cache[size])
-        db.lookup_cache[size][feature] = retrieve_existing_feature_by_size(db, size, feature)
+        db.lookup_cache[size][feature] = get(db.string_feature_map[size], feature, Set{String}())
     end
     return db.lookup_cache[size][feature]
 end
diff --git a/src/features.jl b/src/features.jl
@@ -41,7 +41,6 @@ end
 Internal function to create character-level ngrams features from an AbstractString
 """
 function n_grams(extractor::CharacterNGrams, x, n)
-    # Return counted n-grams (including duplicates)
     return cummulative_ngram_count(init_ngrams(extractor, x, n))
 end
 
@@ -54,13 +53,21 @@ function n_grams(extractor::WordNGrams, x, n)
 end
 
 
+"""
+Internal function to wrap a collection in a zero-indexed circular array
+"""
+function make_zero_index_circular_array(x)
+    return CircularArray(OffsetArray(x, 0:length(x)-1))
+end
+
+
 """
 Internal function to generate character-level ngrams features from an AbstractString
 """
 function extract_features(extractor::CharacterNGrams, str)
     n = extractor.n - 1 == 0 ? 1 : extractor.n - 1
     str = pad_string(str, repeat(extractor.padder, n))
-    return n_grams(extractor, str, extractor.n)
+    return make_zero_index_circular_array(n_grams(extractor, str, extractor.n))
 end
 
 
@@ -70,7 +77,7 @@ Internal function to generate word-level ngrams features from an AbstractString
 function extract_features(extractor::WordNGrams, str)
     words_split = split(str, extractor.splitter)
     padded_words = pad_string(words_split, extractor.padder)
-    return n_grams(extractor, padded_words, extractor.n)
+    return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))
 end
 
 
@@ -80,16 +87,14 @@ Internal function to count and pad generated character-level ngrams (including d
 function cummulative_ngram_count(x)
     counter = Dict{eltype(x), Int}()
 
-    unique_list = map(x) do val
+    return map(x) do val
         if val in keys(counter)
             counter[val] += 1
         else
             counter[val] = 1
         end
         (val, counter[val])
     end
-
-    return unique_list
 end
 
 
diff --git a/src/measures.jl b/src/measures.jl
@@ -30,6 +30,11 @@ Overlap Similarity Measure.
 struct Overlap <: AbstractSimilarityMeasure end
 
 
+"""
+Exact Match Similarity Measure.
+"""
+struct ExactMatch <: AbstractSimilarityMeasure end
+
 
 ############## Minimum Feature Sizes Per Measure  ##############
 """
@@ -64,8 +69,15 @@ function minimum_feature_size(measure::Overlap, query_size, α)
 end
 
 
+"""
+Calculate minimum feature size for ExactMatch similarity measure.
+"""
+function minimum_feature_size(measure::ExactMatch, query_size, α)
+    return query_size
+end
 ############## Maximum Feature Size Per Measure  ##############
 
+
 """
 Calculate maximum feature size for Dice similarity measure.
 """
@@ -98,6 +110,14 @@ function maximum_feature_size(measure::Overlap, db::AbstractSimStringDB, query_s
 end
 
 
+"""
+Calculate maximum feature size for ExactMatch similarity measure.
+"""
+function maximum_feature_size(measure::ExactMatch, db::AbstractSimStringDB, query_size, α)
+    return query_size
+end
+
+
 ############## Similarity Score Per Measure  ##############
 """
 Calculate similarity score between X and Y using Dice similarity measure.
@@ -131,6 +151,13 @@ function similarity_score(measure::Overlap, X, Y)
 end
 
 
+"""
+Calculate similarity score between X and Y using ExactMatch similarity measure.
+"""
+function similarity_score(measure::ExactMatch, X, Y)
+    return Set(X) == Set(Y) ? 1.0 : 0.0
+end
+
 
 ############## Number of Minimum Overlaps Per Measure  ##############
 """
@@ -166,4 +193,13 @@ using Overlap similarity measure.
 """
 function minimum_overlap(measure::Overlap, query_size, candidate_size, α)
     return ceil(Int, (α * min(query_size, candidate_size)) )
+end
+
+
+"""
+Calculate the minimum overlap (τ) for a query size, candidate size, and α
+using ExactMatch similarity measure.
+"""
+function minimum_overlap(measure::ExactMatch, query_size, candidate_size, α)
+    return query_size
 end
diff --git a/src/search.jl b/src/search.jl
@@ -60,43 +60,22 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
     features = sort(features, by = i -> length(lookup_feature_set_by_size_feature(db_collection, candidate_size, i) ) )
 
     # Count the occurrences of each feature
-    candidate_match_counts = DefaultDict(0)
-
+    candidate_match_counts = DefaultDict{String, Int}(0)
     feature_slice_index = query_feature_length - τ + 1
+    idx = query_feature_length - τ
+    focus_features = feature_slice_index < 0 ? (@view features[0:end + feature_slice_index]) : (@view features[0:idx])
 
-    if feature_slice_index < 0
-        focus_features = features[1:end + feature_slice_index]
-    else
-        focus_features = features[1:feature_slice_index]
-    end
-
-    for i in focus_features
+    @inbounds @views for i in focus_features
         for s in lookup_feature_set_by_size_feature(db_collection, candidate_size, i)
             candidate_match_counts[s] += 1
         end
     end
 
     results = String[]
 
-    # TODO: Return results in case of a perfect match??
-    # if τ == 1
-    #     results = collect(keys(candidate_match_counts))
-    # end
-
     for (candidate, match_count) in candidate_match_counts
-
-        for i in (query_feature_length - τ + 1) : query_feature_length - 1  # TODO: Verify
-
-            if i < 0
-                feature = features[end + i]
-            elseif i == 0
-                feature = features[i+1]
-            else
-                feature = features[i]
-
-            end
-
-            if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, feature)
+        for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
+            if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
                 match_count += 1
             end
 
@@ -106,11 +85,9 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
             end
 
             remaining_count = query_feature_length - i - 1
-
             if (match_count + remaining_count) < τ
                 break
             end
-
         end
     end
     return results
@@ -133,7 +110,7 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
     results = String[]
 
     # Generate and return results from the potential candidate size pool
-    for candidate_size in min_feature_size:max_feature_size
+    @inbounds for candidate_size in min_feature_size:max_feature_size
         # Minimum overlap
         τ = minimum_overlap(measure, length_of_features, candidate_size, α)
 
diff --git a/test/test01_dictdb.jl b/test/test01_dictdb.jl
@@ -64,6 +64,15 @@ end
 
 
 
+@testset "Test describe functionality" begin
+    db = DictDB(CharacterNGrams(2, " "));
+    append!(db, ["foo", "bar", "fooo"]);
+
+    # Interact with db
+    search(Dice(), db, "zep"; α=0.8, ranked=true)
+
+    @test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
+end
 
 
 
diff --git a/test/test02_features.jl b/test/test02_features.jl
@@ -5,10 +5,10 @@ using Test
 
 @testset "Test feature extraction" begin
     char_ngram_res = SimString.extract_features(CharacterNGrams(3, " "), "prepress")
-    @test char_ngram_res[6] == ("pre", 2)
+    @test char_ngram_res[5] == ("pre", 2)
 
     word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
-    @test word_ngram_res[6] == (("really", "really"), 2)
+    @test word_ngram_res[5] == (("really", "really"), 2)
 end
 
 
diff --git a/test/test03_measures.jl b/test/test03_measures.jl
diff --git a/test/test04_search.jl b/test/test04_search.jl