Skip to content

Commit 504460e

Browse files
authored
Zero index (#6)
* Beta release (0-index implementation) * Added basic summary description for dictdb collection
1 parent 6f71152 commit 504460e

File tree

12 files changed

+152
-59
lines changed

12 files changed

+152
-59
lines changed

docs/src/index.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@ This package is particularly useful for natural language processing tasks whic
1616
- [X] Support for Unicode
1717
- [ ] Custom user defined feature generation methods
1818
- [ ] Mecab-based tokenizer support
19+
- [ ] Support for building databases directly from text files
20+
- [ ] Support for persistent databases
1921

2022
## Supported String Similarity Measures
2123

2224
- [X] Dice coefficient
2325
- [X] Jaccard coefficient
2426
- [X] Cosine coefficient
2527
- [X] Overlap coefficient
28+
- [X] Exact match
2629

2730
## Installation
2831

@@ -67,7 +70,9 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)
6770
# ("foo", 1.0)
6871
# ("fooo", 0.8888888888888888)
6972

70-
73+
# Describe a working database collection
74+
desc = describe_collection(db)
75+
# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
7176
```
7277

7378
## TODO: Benchmarks

extras/examples.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@ push!(db, "foo");
2121
push!(db, "bar");
2222
push!(db, "fooo");
2323

24-
f(x, c, s) = search(x, c, s)
24+
f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
2525
test = "foo";
2626
col = db;
2727
sim = Cosine();
28+
a = 0.8;
29+
r = true;
2830

29-
f(Cosine(), db, "foo")
31+
f(Cosine(), db, "foo", 0.8, true)
3032

31-
@btime f($sim, $col, $test)
33+
@btime f($sim, $col, $test, $a, $r)
3234
@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
3335

3436

extras/py_benchmarks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ def f(x):
1313
for i in x:
1414
db.add(i)
1515

16-
# %time f(fake_names)
16+
# %time f(fake_names)

src/SimString.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ module SimString
22

33
import Base: push!, append!
44
using DataStructures: DefaultOrderedDict, DefaultDict
5-
# using ProgressMeter
6-
# using CircularArrays
7-
# using OffsetArrays
5+
using ProgressMeter
6+
using CircularArrays
7+
using OffsetArrays
88

99
######### Import modules & utils ################
1010
include("db_collection.jl")
@@ -16,8 +16,8 @@ include("search.jl")
1616

1717

1818
####### Global export of user API #######
19-
export Dice, Jaccard, Cosine, Overlap,
20-
AbstractSimStringDB, DictDB,
19+
export Dice, Jaccard, Cosine, Overlap, ExactMatch,
20+
AbstractSimStringDB, DictDB, describe_collection,
2121
CharacterNGrams, WordNGrams,
2222
search
2323

src/dictdb.jl

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,28 +75,49 @@ end
7575

7676
################################## DictDB UTIL Functions ############################
7777
"""
78-
Internal function for retrieving existing features by size
78+
describe_collection(db::DictDB)
79+
80+
Basic summary stats for the DB
81+
82+
# Arguments
83+
* `db`: DictDB object
84+
85+
# Example
86+
```julia
87+
db = DictDB(CharacterNGrams(2, " "));
88+
append!(db, ["foo", "bar", "fooo"]);
89+
describe_collection(db)
90+
91+
# Returns
92+
* NamedTuples: Summary stats for the DB
93+
```
94+
7995
"""
80-
function retrieve_existing_feature_by_size(db::DictDB, size, feature)
81-
return db.string_feature_map[size][feature]
82-
end
96+
function describe_collection(db::DictDB)
8397

98+
# Total number of strings in collection
99+
∑ = length(db.string_collection)
84100

85-
# """
86-
# Basic summary stats for the DB
87-
# """
88-
# function describe_db(db::DictDB)
101+
# Average number of ngram features
102+
n = [x for x in keys(db.string_size_map)]
103+
μ = sum(n) / length(n)
89104

90-
# end
105+
# Total number of ngram features
106+
total_ngrams = 0
107+
for i in values(db.string_feature_map)
108+
total_ngrams += length(i)
109+
end
110+
111+
return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
112+
end
91113

92114

93115
"""
94116
Internal function to lookup feature sets by size and feature
95117
"""
96118
function lookup_feature_set_by_size_feature(db::DictDB, size, feature)
97-
# TODO: Clean this up and make it more efficient. Shouldn't updated db.string_feature_map
98119
if feature keys(db.lookup_cache[size])
99-
db.lookup_cache[size][feature] = retrieve_existing_feature_by_size(db, size, feature)
120+
db.lookup_cache[size][feature] = get(db.string_feature_map[size], feature, Set{String}())
100121
end
101122
return db.lookup_cache[size][feature]
102123
end

src/features.jl

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ end
4141
Internal function to create character-level ngrams features from an AbstractString
4242
"""
4343
function n_grams(extractor::CharacterNGrams, x, n)
44-
# Return counted n-grams (including duplicates)
4544
return cummulative_ngram_count(init_ngrams(extractor, x, n))
4645
end
4746

@@ -54,13 +53,21 @@ function n_grams(extractor::WordNGrams, x, n)
5453
end
5554

5655

56+
"""
57+
Internal function to make zero indexed circular arrays
58+
"""
59+
function make_zero_index_circular_array(x)
60+
return CircularArray(OffsetArray(x, 0:length(x)-1))
61+
end
62+
63+
5764
"""
5865
Internal function to generate character-level ngrams features from an AbstractString
5966
"""
6067
function extract_features(extractor::CharacterNGrams, str)
6168
n = extractor.n - 1 == 0 ? 1 : extractor.n - 1
6269
str = pad_string(str, repeat(extractor.padder, n))
63-
return n_grams(extractor, str, extractor.n)
70+
return make_zero_index_circular_array(n_grams(extractor, str, extractor.n))
6471
end
6572

6673

@@ -70,7 +77,7 @@ Internal function to generate word-level ngrams features from an AbstractString
7077
function extract_features(extractor::WordNGrams, str)
7178
words_split = split(str, extractor.splitter)
7279
padded_words = pad_string(words_split, extractor.padder)
73-
return n_grams(extractor, padded_words, extractor.n)
80+
return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))
7481
end
7582

7683

@@ -80,16 +87,14 @@ Internal function to count and pad generated character-level ngrams (including d
8087
function cummulative_ngram_count(x)
8188
counter = Dict{eltype(x), Int}()
8289

83-
unique_list = map(x) do val
90+
return map(x) do val
8491
if val in keys(counter)
8592
counter[val] += 1
8693
else
8794
counter[val] = 1
8895
end
8996
(val, counter[val])
9097
end
91-
92-
return unique_list
9398
end
9499

95100

src/measures.jl

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ Overlap Similarity Measure.
3030
struct Overlap <: AbstractSimilarityMeasure end
3131

3232

33+
"""
34+
Exact Match Similarity Measure.
35+
"""
36+
struct ExactMatch <: AbstractSimilarityMeasure end
37+
3338

3439
############## Minimum Feature Sizes Per Measure ##############
3540
"""
@@ -64,8 +69,15 @@ function minimum_feature_size(measure::Overlap, query_size, α)
6469
end
6570

6671

72+
"""
73+
Calculate minimum feature size for ExactMatch similarity measure.
74+
"""
75+
function minimum_feature_size(measure::ExactMatch, query_size, α)
76+
return query_size
77+
end
6778
############## Maximum Feature Size Per Measure ##############
6879

80+
6981
"""
7082
Calculate maximum feature size for Dice similarity measure.
7183
"""
@@ -98,6 +110,14 @@ function maximum_feature_size(measure::Overlap, db::AbstractSimStringDB, query_s
98110
end
99111

100112

113+
"""
114+
Calculate maximum feature size for ExactMatch similarity measure.
115+
"""
116+
function maximum_feature_size(measure::ExactMatch, db::AbstractSimStringDB, query_size, α)
117+
return query_size
118+
end
119+
120+
101121
############## Similarity Score Per Measure ##############
102122
"""
103123
Calculate similarity score between X and Y using Dice similarity measure.
@@ -131,6 +151,13 @@ function similarity_score(measure::Overlap, X, Y)
131151
end
132152

133153

154+
"""
155+
Calculate similarity score between X and Y using ExactMatch similarity measure.
156+
"""
157+
function similarity_score(measure::ExactMatch, X, Y)
158+
return Set(X) == Set(Y) ? 1.0 : 0.0
159+
end
160+
134161

135162
############## Number of Minimum Overlaps Per Measure ##############
136163
"""
@@ -166,4 +193,13 @@ using Overlap similarity measure.
166193
"""
167194
function minimum_overlap(measure::Overlap, query_size, candidate_size, α)
168195
return ceil(Int, (α * min(query_size, candidate_size)) )
196+
end
197+
198+
199+
"""
200+
Calculate the minimum overlap (τ) for a query size, candidate size, and α
201+
using ExactMatch similarity measure.
202+
"""
203+
function minimum_overlap(measure::ExactMatch, query_size, candidate_size, α)
204+
return query_size
169205
end

src/search.jl

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -60,43 +60,22 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
6060
features = sort(features, by = i -> length(lookup_feature_set_by_size_feature(db_collection, candidate_size, i) ) )
6161

6262
# Count the occurrences of each feature
63-
candidate_match_counts = DefaultDict(0)
64-
63+
candidate_match_counts = DefaultDict{String, Int}(0)
6564
feature_slice_index = query_feature_length - τ + 1
65+
idx = query_feature_length - τ
66+
focus_features = feature_slice_index < 0 ? (@view features[0:end + feature_slice_index]) : (@view features[0:idx])
6667

67-
if feature_slice_index < 0
68-
focus_features = features[1:end + feature_slice_index]
69-
else
70-
focus_features = features[1:feature_slice_index]
71-
end
72-
73-
for i in focus_features
68+
@inbounds @views for i in focus_features
7469
for s in lookup_feature_set_by_size_feature(db_collection, candidate_size, i)
7570
candidate_match_counts[s] += 1
7671
end
7772
end
7873

7974
results = String[]
8075

81-
# TODO: Return results in case of a perfect match??
82-
# if τ == 1
83-
# results = collect(keys(candidate_match_counts))
84-
# end
85-
8676
for (candidate, match_count) in candidate_match_counts
87-
88-
for i in (query_feature_length - τ + 1) : query_feature_length - 1 # TODO: Verify
89-
90-
if i < 0
91-
feature = features[end + i]
92-
elseif i == 0
93-
feature = features[i+1]
94-
else
95-
feature = features[i]
96-
97-
end
98-
99-
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, feature)
77+
for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
78+
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
10079
match_count += 1
10180
end
10281

@@ -106,11 +85,9 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
10685
end
10786

10887
remaining_count = query_feature_length - i - 1
109-
11088
if (match_count + remaining_count) < τ
11189
break
11290
end
113-
11491
end
11592
end
11693
return results
@@ -133,7 +110,7 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
133110
results = String[]
134111

135112
# Generate and return results from the potential candidate size pool
136-
for candidate_size in min_feature_size:max_feature_size
113+
@inbounds for candidate_size in min_feature_size:max_feature_size
137114
# Minimum overlap
138115
τ = minimum_overlap(measure, length_of_features, candidate_size, α)
139116

test/test01_dictdb.jl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,15 @@ end
6464

6565

6666

67+
@testset "Test describe functionality" begin
68+
db = DictDB(CharacterNGrams(2, " "));
69+
append!(db, ["foo", "bar", "fooo"]);
70+
71+
# Interact with db
72+
search(Dice(), db, "zep"; α=0.8, ranked=true)
73+
74+
@test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
75+
end
6776

6877

6978

test/test02_features.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ using Test
55

66
@testset "Test feature extraction" begin
77
char_ngram_res = SimString.extract_features(CharacterNGrams(3, " "), "prepress")
8-
@test char_ngram_res[6] == ("pre", 2)
8+
@test char_ngram_res[5] == ("pre", 2)
99

1010
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
11-
@test word_ngram_res[6] == (("really", "really"), 2)
11+
@test word_ngram_res[5] == (("really", "really"), 2)
1212
end
1313

1414

0 commit comments

Comments
 (0)