Skip to content

Optimize string searching using two way search (WIP) #14135

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 16, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 206 additions & 26 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,19 @@
use mem;
use char;
use clone::Clone;
use cmp;
use cmp::{Eq, TotalEq};
use container::Container;
use default::Default;
use iter::{Filter, Map, Iterator};
use iter::{Rev, DoubleEndedIterator, ExactSize};
use iter::range;
use num::Saturating;
use option::{None, Option, Some};
use raw::Repr;
use slice::{ImmutableVector, Vector};
use slice;
use uint;

/*
Section: Creating a string
Expand Down Expand Up @@ -316,13 +319,207 @@ impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
}
}

/// The internal state of an iterator that searches for matches of a substring
/// within a larger string using naive search
#[deriving(Clone)]
struct NaiveSearcher {
position: uint
}

impl NaiveSearcher {
fn new() -> NaiveSearcher {
NaiveSearcher { position: 0 }
}

fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(uint, uint)> {
while self.position + needle.len() <= haystack.len() {
if haystack.slice(self.position, self.position + needle.len()) == needle {
let matchPos = self.position;
self.position += needle.len(); // add 1 for all matches
return Some((matchPos, matchPos + needle.len()));
} else {
self.position += 1;
}
}
None
}
}

/// The internal state of an iterator that searches for matches of a substring
/// within a larger string using two-way search
#[deriving(Clone)]
struct TwoWaySearcher {
// constants
critPos: uint,
period: uint,
byteset: u64,

// variables
position: uint,
memory: uint
}

impl TwoWaySearcher {
fn new(needle: &[u8]) -> TwoWaySearcher {
let (critPos1, period1) = TwoWaySearcher::maximal_suffix(needle, false);
let (critPos2, period2) = TwoWaySearcher::maximal_suffix(needle, true);

let critPos;
let period;
if critPos1 > critPos2 {
critPos = critPos1;
period = period1;
} else {
critPos = critPos2;
period = period2;
}

let byteset = needle.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a);

if needle.slice_to(critPos) == needle.slice_from(needle.len() - critPos) {
TwoWaySearcher {
critPos: critPos,
period: period,
byteset: byteset,

position: 0,
memory: 0
}
} else {
TwoWaySearcher {
critPos: critPos,
period: cmp::max(critPos, needle.len() - critPos) + 1,
byteset: byteset,

position: 0,
memory: uint::MAX // Dummy value to signify that the period is long
}
}
}

#[inline]
fn next(&mut self, haystack: &[u8], needle: &[u8], longPeriod: bool) -> Option<(uint, uint)> {
'search: loop {
// Check that we have room to search in
if self.position + needle.len() > haystack.len() {
return None;
}

// Quickly skip by large portions unrelated to our substring
if (self.byteset >> (haystack[self.position + needle.len() - 1] & 0x3f)) & 1 == 0 {
self.position += needle.len();
continue 'search;
}

// See if the right part of the needle matches
let start = if longPeriod { self.critPos } else { cmp::max(self.critPos, self.memory) };
for i in range(start, needle.len()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly faster by avoiding (some) bounds checks: for (i, needle_byte) in needle.iter().enumerate(). Or maybe even

let iter = needle.iter().zip(haystack.slice_from(self.position).iter());

for (i, (needle_byte,haystack_byte)) in iter.enumerate() { ... }

if needle[i] != haystack[self.position + i] {
self.position += i - self.critPos + 1;
if !longPeriod {
self.memory = 0;
}
continue 'search;
}
}

// See if the left part of the needle matches
let start = if longPeriod { 0 } else { self.memory };
for i in range(start, self.critPos).rev() {
if needle[i] != haystack[self.position + i] {
self.position += self.period;
if !longPeriod {
self.memory = needle.len() - self.period;
}
continue 'search;
}
}

// We have found a match!
let matchPos = self.position;
self.position += needle.len(); // add self.period for all matches
if !longPeriod {
self.memory = 0; // set to needle.len() - self.period for all matches
}
return Some((matchPos, matchPos + needle.len()));
}
}

#[inline]
fn maximal_suffix(arr: &[u8], reversed: bool) -> (uint, uint) {
let mut left = -1; // Corresponds to i in the paper
let mut right = 0; // Corresponds to j in the paper
let mut offset = 1; // Corresponds to k in the paper
let mut period = 1; // Corresponds to p in the paper

while right + offset < arr.len() {
let a;
let b;
if reversed {
a = arr[left + offset];
b = arr[right + offset];
} else {
a = arr[right + offset];
b = arr[left + offset];
}
if a < b {
// Suffix is smaller, period is entire prefix so far.
right += offset;
offset = 1;
period = right - left;
} else if a == b {
// Advance through repetition of the current period.
if offset == period {
right += offset;
offset = 1;
} else {
offset += 1;
}
} else {
// Suffix is larger, start over from current location.
left = right;
right += 1;
offset = 1;
period = 1;
}
}
(left + 1, period)
}
}

/// The internal state of an iterator that searches for matches of a substring
/// within a larger string using a dynamically chosed search algorithm
#[deriving(Clone)]
enum Searcher {
Naive(NaiveSearcher),
TwoWay(TwoWaySearcher),
TwoWayLong(TwoWaySearcher)
}

impl Searcher {
fn new(haystack: &[u8], needle: &[u8]) -> Searcher {
// FIXME: Tune this.
if needle.len() > haystack.len() - 20 {
Naive(NaiveSearcher::new())
} else {
let searcher = TwoWaySearcher::new(needle);
if searcher.memory == uint::MAX { // If the period is long
TwoWayLong(searcher)
} else {
TwoWay(searcher)
}
}
}
}

/// An iterator over the start and end indices of the matches of a
/// substring within a larger string
#[deriving(Clone)]
pub struct MatchIndices<'a> {
// constants
haystack: &'a str,
needle: &'a str,
position: uint,
searcher: Searcher
}

/// An iterator over the substrings of a string separated by a given
Expand All @@ -337,31 +534,14 @@ pub struct StrSplits<'a> {
impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
#[inline]
fn next(&mut self) -> Option<(uint, uint)> {
// See Issue #1932 for why this is a naive search
let (h_len, n_len) = (self.haystack.len(), self.needle.len());
let mut match_start = 0;
let mut match_i = 0;

while self.position < h_len {
if self.haystack[self.position] == self.needle[match_i] {
if match_i == 0 { match_start = self.position; }
match_i += 1;
self.position += 1;

if match_i == n_len {
// found a match!
return Some((match_start, self.position));
}
} else {
// failed match, backtrack
if match_i > 0 {
match_i = 0;
self.position = match_start;
}
self.position += 1;
}
match self.searcher {
Naive(ref mut searcher)
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes()),
TwoWay(ref mut searcher)
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
TwoWayLong(ref mut searcher)
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true)
}
None
}
}

Expand Down Expand Up @@ -1581,7 +1761,7 @@ impl<'a> StrSlice<'a> for &'a str {
MatchIndices {
haystack: *self,
needle: sep,
position: 0
searcher: Searcher::new(self.as_bytes(), sep.as_bytes())
}
}

Expand Down
74 changes: 74 additions & 0 deletions src/libstd/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2413,4 +2413,78 @@ mod bench {
assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);
})
}

#[bench]
fn bench_contains_short_short(b: &mut Bencher) {
let haystack = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
let needle = "sit";

b.iter(|| {
assert!(haystack.contains(needle));
})
}

#[bench]
fn bench_contains_short_long(b: &mut Bencher) {
let haystack = "\
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse quis lorem sit amet dolor \
ultricies condimentum. Praesent iaculis purus elit, ac malesuada quam malesuada in. Duis sed orci \
eros. Suspendisse sit amet magna mollis, mollis nunc luctus, imperdiet mi. Integer fringilla non \
sem ut lacinia. Fusce varius tortor a risus porttitor hendrerit. Morbi mauris dui, ultricies nec \
tempus vel, gravida nec quam.

In est dui, tincidunt sed tempus interdum, adipiscing laoreet ante. Etiam tempor, tellus quis \
sagittis interdum, nulla purus mattis sem, quis auctor erat odio ac tellus. In nec nunc sit amet \
diam volutpat molestie at sed ipsum. Vestibulum laoreet consequat vulputate. Integer accumsan \
lorem ac dignissim placerat. Suspendisse convallis faucibus lorem. Aliquam erat volutpat. In vel \
eleifend felis. Sed suscipit nulla lorem, sed mollis est sollicitudin et. Nam fermentum egestas \
interdum. Curabitur ut nisi justo.

Sed sollicitudin ipsum tellus, ut condimentum leo eleifend nec. Cras ut velit ante. Phasellus nec \
mollis odio. Mauris molestie erat in arcu mattis, at aliquet dolor vehicula. Quisque malesuada \
lectus sit amet nisi pretium, a condimentum ipsum porta. Morbi at dapibus diam. Praesent egestas \
est sed risus elementum, eu rutrum metus ultrices. Etiam fermentum consectetur magna, id rutrum \
felis accumsan a. Aliquam ut pellentesque libero. Sed mi nulla, lobortis eu tortor id, suscipit \
ultricies neque. Morbi iaculis sit amet risus at iaculis. Praesent eget ligula quis turpis \
feugiat suscipit vel non arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. \
Aliquam sit amet placerat lorem.

Cras a lacus vel ante posuere elementum. Nunc est leo, bibendum ut facilisis vel, bibendum at \
mauris. Nullam adipiscing diam vel odio ornare, luctus adipiscing mi luctus. Nulla facilisi. \
Mauris adipiscing bibendum neque, quis adipiscing lectus tempus et. Sed feugiat erat et nisl \
lobortis pharetra. Donec vitae erat enim. Nullam sit amet felis et quam lacinia tincidunt. Aliquam \
suscipit dapibus urna. Sed volutpat urna in magna pulvinar volutpat. Phasellus nec tellus ac diam \
cursus accumsan.

Nam lectus enim, dapibus non nisi tempor, consectetur convallis massa. Maecenas eleifend dictum \
feugiat. Etiam quis mauris vel risus luctus mattis a a nunc. Nullam orci quam, imperdiet id \
vehicula in, porttitor ut nibh. Duis sagittis adipiscing nisl vitae congue. Donec mollis risus eu \
leo suscipit, varius porttitor nulla porta. Pellentesque ut sem nec nisi euismod vehicula. Nulla \
malesuada sollicitudin quam eu fermentum.";
let needle = "english";

b.iter(|| {
assert!(!haystack.contains(needle));
})
}

#[bench]
fn bench_contains_bad_naive(b: &mut Bencher) {
let haystack = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
let needle = "aaaaaaaab";

b.iter(|| {
assert!(!haystack.contains(needle));
})
}

#[bench]
fn bench_contains_equal(b: &mut Bencher) {
let haystack = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
let needle = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";

b.iter(|| {
assert!(haystack.contains(needle));
})
}
}