Closed
Description
Special Unicode characters break the current n-gram feature generation because it indexes into the string by integer position, as if it were an array; this fails on multi-byte characters, as reproduced below:
using SimString
db = DictDB(CharacterNGrams(2, " "))
push!(db, "„orosz állami részvénytársaság")
Output:
StringIndexError: invalid index [3], valid nearby indices [2]=>'„', [5]=>'o'
string_index_err(::String, ::Int64)@string.jl:12
getindex@string.jl:263[inlined]
(::SimString.var"#13#14"{String, Int64})(::Int64)@features.jl:25
iterate@generator.jl:47[inlined]
collect_to!@array.jl:782[inlined]
collect_to_with_first!@array.jl:760[inlined]
_collect(::UnitRange{Int64}, ::Base.Generator{UnitRange{Int64}, SimString.var"#13#14"{String, Int64}}, ::Base.EltypeUnknown, ::Base.HasShape{1})@array.jl:754
collect_similar@array.jl:653[inlined]
map@abstractarray.jl:2849[inlined]
init_ngrams@features.jl:24[inlined]
n_grams@features.jl:44[inlined]
extract_features(::SimString.CharacterNGrams{Int64, String}, ::String)@features.jl:70
push!(::SimString.DictDB{SimString.CharacterNGrams{Int64, String}, String, DataStructures.DefaultDict{Int64, Set{String}, SimString.var"#1#4"}, DataStructures.DefaultDict{Int64, DataStructures.DefaultOrderedDict{Tuple{String, Int64}, Set{String}}, SimString.var"#2#5"}, DataStructures.DefaultDict{Int64, DataStructures.DefaultDict{Tuple{String, Int64}, Set{String}}, SimString.var"#3#6"}}, ::String)@features.jl:124
top-level scope