diff --git a/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs b/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs index e755aad..0962fcc 100644 --- a/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs +++ b/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs @@ -8,13 +8,17 @@ public class BenchmarkFuzzySearch private const string term2 = "fooo--foo-----fo"; private const string text = "foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--foo-----fo--foo-f--fooo--"; - [Benchmark] - public void SubstitutionOnlyBufferingShort() => FuzzySearch.FindSubstitutionsOnlyBuffering(term, text, 1); + //[Benchmark] + //public void SubstitutionOnlyBufferingShort() => FuzzySearch.FindSubstitutionsOnlyBuffering(term, text, 1); - [Benchmark] - public void SubstitutionOnlyBufferingLong() => FuzzySearch.FindSubstitutionsOnlyBuffering(term2, text, 1); + //[Benchmark] + //public void SubstitutionOnlyBufferingLong() => FuzzySearch.FindSubstitutionsOnlyBuffering(term2, text, 1); + + + //[Benchmark] + //public void SubstitutionOnlyBufferingLong3distance() => FuzzySearch.FindSubstitutionsOnlyBuffering(term2, text, 3); [Benchmark] - public void SubstitutionOnlyBufferingLong3distance() => FuzzySearch.FindSubstitutionsOnlyBuffering(term2, text, 3); + public void LevenshteinLong() => FuzzySearch.FindBuffering(term2, text, 3); } \ No newline at end of file diff --git a/FuzzySearchNet.Tests/TestUtils.cs b/FuzzySearchNet.Tests/TestUtils.cs new file mode 100644 index 0000000..0d61d19 --- /dev/null +++ b/FuzzySearchNet.Tests/TestUtils.cs @@ -0,0 +1,28 @@ +namespace FuzzySearchNet.Tests; + +internal class TestUtils +{ + public static void AssertMatch(MatchResult match, int expectedStartIndex, int expectedEndIndex, string text, int? expectedDistance = null) + { + Assert.That(match.StartIndex, Is.EqualTo(expectedStartIndex)); + Assert.That(match.EndIndex, Is.EqualTo(expectedEndIndex)); + Assert.That(match.Match, Is.EqualTo(text[expectedStartIndex..expectedEndIndex])); + + if (expectedDistance.HasValue) + { + Assert.That(match.Distance, Is.EqualTo(expectedDistance)); + } + } + + public static void AssertMatch(MatchResult match, int expectedStartIndex, string expectedMatch, int? expectedDistance = null) + { + Assert.That(match.StartIndex, Is.EqualTo(expectedStartIndex)); + Assert.That(match.EndIndex, Is.EqualTo(expectedStartIndex + expectedMatch.Length)); + Assert.That(match.Match, Is.EqualTo(expectedMatch)); + + if (expectedDistance.HasValue) + { + Assert.That(match.Distance, Is.EqualTo(expectedDistance)); + } + } +} diff --git a/FuzzySearchNet.Tests/Tests/FuzzySearchExactMatchTests.cs b/FuzzySearchNet.Tests/Tests/FuzzySearchExactMatchTests.cs index a1a2f7f..8f79140 100644 --- a/FuzzySearchNet.Tests/Tests/FuzzySearchExactMatchTests.cs +++ b/FuzzySearchNet.Tests/Tests/FuzzySearchExactMatchTests.cs @@ -1,4 +1,4 @@ -namespace FuzzySearchNet.Tests.Tests; +namespace FuzzySearchNet.Tests; public class FuzzySearchExactMatchTests { diff --git a/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs b/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs index 4710d12..8422138 100644 --- a/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs +++ b/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs @@ -1,64 +1,56 @@ -namespace FuzzySearchNet.Tests.Tests; +namespace FuzzySearchNet.Tests; /// /// Testing with similar tests as https://github.com/taleinat/fuzzysearch to ensure somewhat compatible behaviour /// public class FuzzySearchLevenshteinTests { - [TestCase("PATTERN", "PATTERN", 0, 0, 7, 0)] - [TestCase("def", "abcddefg", 0, 4, 7, 0)] - [TestCase("def", "abcdeffg", 1, 3, 6, 0)] - [TestCase("defgh", "abcdedefghi", 3, 5, 10, 0)] - [TestCase("cdefgh", "abcdefghghi", 3, 2, 8, 0)] - [TestCase("bde", "abcdefg", 1, 1, 5, 1)] - [TestCase("1234567", "--123567--", 1, 2, 8, 1)] - [TestCase("1234567", "--1238567--", 1, 2, 9, 1)] - [TestCase("1234567", "23567-----", 2, 0, 5, 2)] - [TestCase("1234567", "--23567---", 2, 1, 7, 2)] - [TestCase("1234567", "-----23567", 2, 4, 10, 2)] - public void TestSingleMatchWithDeletions(string pattern, string text, int maxDistance, int expectedStart, int expectedEnd, int expectedDistance) + [TestCase("PATTERN", "PATTERN", 0, 0, "PATTERN", 0)] + [TestCase("def", "abcddefg", 0, 4, "def", 0)] + [TestCase("def", "abcdeffg", 1, 3, "def", 0)] + [TestCase("defgh", "abcdedefghi", 3, 5, "defgh", 0)] + [TestCase("cdefgh", "abcdefghghi", 3, 2, "cdefgh", 0)] + [TestCase("bde", "abcdefg", 1, 1, "bcde", 1)] + [TestCase("1234567", "--123567--", 1, 2, "123567", 1)] + [TestCase("1234567", "--1238567--", 1, 2, "1238567", 1)] + [TestCase("1234567", "23567-----", 2, 0, "23567", 2)] + [TestCase("1234567", "--23567---", 2, 1, "-23567", 2)] + [TestCase("1234567", "-----23567", 2, 4, "-23567", 2)] + public void TestSingleMatchWithDeletions(string pattern, string text, int maxDistance, int expectedStart, string expectedMatch, int expectedDistance) { var results = FuzzySearch.Find(pattern, text, maxDistance).ToList(); Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(1)); - - Assert.That(results[0].StartIndex, Is.EqualTo(expectedStart)); - Assert.That(results[0].EndIndex, Is.EqualTo(expectedEnd)); - Assert.That(results[0].Distance, Is.EqualTo(expectedDistance)); - Assert.That(results[0].Match, Is.EqualTo(text[expectedStart..expectedEnd])); + TestUtils.AssertMatch(results[0], expectedStart, expectedMatch, expectedDistance); }); } - [TestCase("PATTERN", "----------PATT-ERN---------", 1, 10, 18, 1)] - [TestCase("PATTERN", "----------PATT-ERN---------", 2, 10, 18, 1)] + [TestCase("PATTERN", "----------PATT-ERN---------", 1, 10, "PATT-ERN", 1)] + [TestCase("PATTERN", "----------PATT-ERN---------", 2, 10, "PATT-ERN", 1)] - [TestCase("PATTERN", "----------PATTTERN---------", 1, 10, 18, 1)] - [TestCase("PATTERN", "----------PATTTERN---------", 2, 10, 18, 1)] + [TestCase("PATTERN", "----------PATTTERN---------", 1, 10, "PATTTERN", 1)] + [TestCase("PATTERN", "----------PATTTERN---------", 2, 10, "PATTTERN", 1)] - [TestCase("PATTERN", "----------PATTERNN---------", 0, 10, 17, 0)] - [TestCase("PATTERN", "----------PATTERNN---------", 1, 10, 17, 0)] - [TestCase("PATTERN", "----------PATTERNN---------", 2, 10, 17, 0)] - public void TestSingleMatchWithInsertion(string pattern, string text, int maxDistance, int expectedStart, int expectedEnd, int expectedDistance) + [TestCase("PATTERN", "----------PATTERNN---------", 0, 10, "PATTERN", 0)] + [TestCase("PATTERN", "----------PATTERNN---------", 1, 10, "PATTERN", 0)] + [TestCase("PATTERN", "----------PATTERNN---------", 2, 10, "PATTERN", 0)] + public void TestSingleMatchWithInsertion(string pattern, string text, int maxDistance, int expectedStart, string expectedMatch, int expectedDistance) { var results = FuzzySearch.Find(pattern, text, maxDistance).ToList(); Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(1)); - - Assert.That(results[0].StartIndex, Is.EqualTo(expectedStart)); - Assert.That(results[0].EndIndex, Is.EqualTo(expectedEnd)); - Assert.That(results[0].Distance, Is.EqualTo(expectedDistance)); - Assert.That(results[0].Match, Is.EqualTo(text[expectedStart..expectedEnd])); + TestUtils.AssertMatch(results[0], expectedStart, expectedMatch, expectedDistance); }); } [Test] - public void TestZeroMaxDistanceMultiple2() + public void Test2DeletionsBufferStart() { var word = "pattern"; var text = "atern----"; @@ -68,10 +60,7 @@ public void TestZeroMaxDistanceMultiple2() Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(1)); - - Assert.That(results[0].StartIndex, Is.EqualTo(0)); - Assert.That(results[0].EndIndex, Is.EqualTo(5)); - Assert.That(results[0].Match, Is.EqualTo(text[0..5])); + TestUtils.AssertMatch(results[0], 0, "atern", 2); }); } @@ -106,7 +95,7 @@ public void TestZeroMaxDistanceNoMatch2() [Test] - public void TestZeroMaxDistanceMultiple85() + public void TestSingleDeletionBufferStart() { var word = "pattern"; var text = "patern----"; @@ -116,15 +105,13 @@ public void TestZeroMaxDistanceMultiple85() Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(1)); - - Assert.That(results[0].StartIndex, Is.EqualTo(0)); - Assert.That(results[0].EndIndex, Is.EqualTo(6)); - Assert.That(results[0].Match, Is.EqualTo(text[0..6])); + TestUtils.AssertMatch(results[0], 0, "patern", 1); }); } + [Test] - public void TestZeroMaxDistanceMultipleMiddle() + public void TestSingleDeletionBufferMiddle() { var word = "pattern"; var text = "--patern--"; @@ -134,13 +121,11 @@ public void TestZeroMaxDistanceMultipleMiddle() Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(1)); - - Assert.That(results[0].StartIndex, Is.EqualTo(2)); - Assert.That(results[0].EndIndex, Is.EqualTo(8)); - Assert.That(results[0].Match, Is.EqualTo(text[2..8])); + TestUtils.AssertMatch(results[0], 2, "patern", 1); }); } + [Test] public void TestMultipleMatchesConsecutive() { @@ -152,17 +137,12 @@ public void TestMultipleMatchesConsecutive() Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(2)); - - Assert.That(results[0].StartIndex, Is.EqualTo(2)); - Assert.That(results[0].EndIndex, Is.EqualTo(9)); - Assert.That(results[0].Match, Is.EqualTo(text[2..9])); - - Assert.That(results[1].StartIndex, Is.EqualTo(9)); - Assert.That(results[1].EndIndex, Is.EqualTo(16)); - Assert.That(results[1].Match, Is.EqualTo(text[9..16])); + TestUtils.AssertMatch(results[0], 2, "pattern", 0); + TestUtils.AssertMatch(results[1], 9, "pattern", 0); }); } + [Test] public void TestMultipleMatchesConsecutive2() { @@ -174,17 +154,12 @@ public void TestMultipleMatchesConsecutive2() Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(2)); - - Assert.That(results[0].StartIndex, Is.EqualTo(2)); - Assert.That(results[0].EndIndex, Is.EqualTo(9)); - Assert.That(results[0].Match, Is.EqualTo(text[2..9])); - - Assert.That(results[1].StartIndex, Is.EqualTo(10)); - Assert.That(results[1].EndIndex, Is.EqualTo(17)); - Assert.That(results[1].Match, Is.EqualTo(text[10..17])); + TestUtils.AssertMatch(results[0], 2, "pattern", 0); + TestUtils.AssertMatch(results[1], 10, "pattern", 0); }); } + [Test] public void TestMultipleMatchesConsecutiveSubstitutions() { @@ -196,14 +171,40 @@ public void TestMultipleMatchesConsecutiveSubstitutions() Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(2)); + TestUtils.AssertMatch(results[0], 2, "patterm", 1); + TestUtils.AssertMatch(results[1], 9, "patyern", 1); + }); + } - Assert.That(results[0].StartIndex, Is.EqualTo(2)); - Assert.That(results[0].EndIndex, Is.EqualTo(9)); - Assert.That(results[0].Match, Is.EqualTo(text[2..9])); - Assert.That(results[1].StartIndex, Is.EqualTo(9)); - Assert.That(results[1].EndIndex, Is.EqualTo(16)); - Assert.That(results[1].Match, Is.EqualTo(text[9..16])); + [Test] + public void TestMultipleMatchesConsecutiveInsertion() + { + var word = "pattern"; + var text = "--patyternpatxtern--"; + + var results = FuzzySearch.Find(word, text, 1).ToList(); + + Assert.Multiple(() => + { + Assert.That(results.Count, Is.EqualTo(2)); + TestUtils.AssertMatch(results[0], 2, "patytern", 1); + TestUtils.AssertMatch(results[1], 10, "patxtern", 1); + }); + } + + [Test] + public void TestOverlappingMatches() + { + var word = "pattern"; + var text = "--pattpatterntern--"; + + var results = FuzzySearch.Find(word, text, 2).ToList(); + + Assert.Multiple(() => + { + Assert.That(results.Count, Is.EqualTo(1)); + TestUtils.AssertMatch(results[0], 6, "pattern", 0); }); } @@ -218,17 +219,12 @@ public void TestMultipleMatchesConsecutiveDeletion() Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(2)); - - Assert.That(results[0].StartIndex, Is.EqualTo(2)); - Assert.That(results[0].EndIndex, Is.EqualTo(8)); - Assert.That(results[0].Match, Is.EqualTo(text[2..8])); - - Assert.That(results[1].StartIndex, Is.EqualTo(8)); - Assert.That(results[1].EndIndex, Is.EqualTo(14)); - Assert.That(results[1].Match, Is.EqualTo(text[8..14])); + TestUtils.AssertMatch(results[0], 2, "pattrn", 1); + TestUtils.AssertMatch(results[1], 8, "pttern", 1); }); } + [TestCase("PATTERN", "")] [TestCase("", "sometext")] [TestCase("", "")] @@ -242,6 +238,7 @@ public void TestEmpty(string pattern, string text) }); } + [TestCase("PATTERN", "PATERN", 1)] public void TestShorterText(string pattern, string text, int expectedMatches) { @@ -250,12 +247,11 @@ public void TestShorterText(string pattern, string text, int expectedMatches) Assert.Multiple(() => { Assert.That(results.Count, Is.EqualTo(expectedMatches)); - Assert.That(results[0].StartIndex, Is.EqualTo(0)); - Assert.That(results[0].EndIndex, Is.EqualTo(text.Length)); - Assert.That(results[0].Match, Is.EqualTo(text[0..text.Length])); + TestUtils.AssertMatch(results[0], 0, "PATERN", 1); }); } + [TestCase("PATTERN", "PAERN", 0)] public void TestShorterTextNoMatch(string pattern, string text, int expectedMatches) { @@ -266,4 +262,98 @@ public void TestShorterTextNoMatch(string pattern, string text, int expectedMatc Assert.That(results.Count, Is.EqualTo(expectedMatches)); }); } + + + [TestCase("pattern", "pattern---------------------", 0, "pattern", 0)] + [TestCase("pattern", "attern---------------------", 0, "attern", 1)] + [TestCase("pattern", "ttern---------------------", 0, "ttern", 2)] + [TestCase("pattern", "tern---------------------", 0, "tern", 3)] + [TestCase("pattern", "--------pattttern-------------", 8, "pattttern", 2)] + [TestCase("pattern", "---------pattttern------------", 9, "pattttern", 2)] + [TestCase("pattern", "----------pattttern-----------", 10, "pattttern", 2)] + [TestCase("pattern", "-----------pattttern----------", 11, "pattttern", 2)] + [TestCase("pattern", "------------pattttern---------", 12, "pattttern", 2)] + [TestCase("pattern", "-------------pattttern--------", 13, "pattttern", 2)] + [TestCase("pattern", "--------------pattttern-------", 14, "pattttern", 2)] + [TestCase("pattern", "---------------pattttern------", 15, "pattttern", 2)] + [TestCase("pattern", "----------------pattttern-----", 16, "pattttern", 2)] + [TestCase("pattern", "-----------------pattttern----", 17, "pattttern", 2)] + [TestCase("pattern", "------------------pattttern---", 18, "pattttern", 2)] + [TestCase("pattern", "-------------------pattttern--", 19, "pattttern", 2)] + [TestCase("pattern", "--------------------pattttern-", 20, "pattttern", 2)] + [TestCase("pattern", "---------------------pattttern", 21, "pattttern", 2)] + [TestCase("pattern", "---patter", 3, "patter", 1)] + [TestCase("pattern", "---patte", 3, "patte", 2)] + [TestCase("pattern", "---patt", 3, "patt", 3)] + [TestCase("pattern", "----------------------pattttern", 22, "pattttern", 2)] + public void TestLevenshteinBufferBoundary(string term, string text, int expectedStartIndex, string expectedMatch, int expectedDistance) + { + var results = FuzzySearch.FindBuffering(term, text, 3).ToList(); + + Assert.Multiple(() => + { + Assert.That(results.Count, Is.EqualTo(1)); + TestUtils.AssertMatch(results[0], expectedStartIndex, expectedMatch, expectedDistance); + }); + } + + + [TestCase("ab", "-a", 1, "a", 1)] + [TestCase("ab", "b---", 0, "b", 1)] + [TestCase("ab", "-axb", 1, "axb", 1)] + [TestCase("ab", "axb-", 0, "axb", 1)] + [TestCase("ab", "--ax", 2, "ax", 1)] + [TestCase("ab", "ax--", 0, "ax", 1)] + [TestCase("ab", "--ab", 2, "ab", 0)] + [TestCase("ab", "ab--", 0, "ab", 0)] + [TestCase("ab", "ab", 0, "ab", 0)] + [TestCase("ab", "-ab", 1, "ab", 0)] + [TestCase("ab", "ab-", 0, "ab", 0)] + [TestCase("ab", "b", 0, "b", 1)] + [TestCase("ab", "a", 0, "a", 1)] + [TestCase("a", "a", 0, "a", 0)] + [TestCase("ab", "axb", 0, "axb", 1)] + public void TestLevenshteinBufferBoundaryShort(string term, string text, int expectedStartIndex, string expectedMatch, int expectedDistance) + { + var results = FuzzySearch.FindBuffering(term, text, 1).ToList(); + + Assert.Multiple(() => + { + Assert.That(results.Count, Is.EqualTo(1)); + TestUtils.AssertMatch(results[0], expectedStartIndex, expectedMatch, expectedDistance); + }); + } + + + [TestCase("abc", "a", 0, "a", 2)] + [TestCase("abc", "b", 0, "b", 2)] + [TestCase("abc", "c", 0, "c", 2)] + public void TestLevenshteinBufferBoundaryShort2Distance(string term, string text, int expectedStartIndex, string expectedMatch, int expectedDistance) + { + var results = FuzzySearch.FindBuffering(term, text, 2).ToList(); + + Assert.Multiple(() => + { + Assert.That(results.Count, Is.EqualTo(1)); + TestUtils.AssertMatch(results[0], expectedStartIndex, expectedMatch, expectedDistance); + }); + } + + + [TestCase("abcd", "ax", 0, "ax", 3)] + [TestCase("abcd", "bx", 0, "bx", 3)] + [TestCase("abcd", "cx", 0, "cx", 3)] + [TestCase("abcd", "xa", 1, "a", 3)] + [TestCase("abcd", "xb", 0, "xb", 3)] + [TestCase("abcd", "xc", 0, "xc", 3)] + public void TestLevenshteinBufferBoundaryShort3Distance(string term, string text, int expectedStartIndex, string expectedMatch, int expectedDistance) + { + var results = FuzzySearch.FindBuffering(term, text, 3).ToList(); + + Assert.Multiple(() => + { + Assert.That(results.Count, Is.EqualTo(1)); + TestUtils.AssertMatch(results[0], expectedStartIndex, expectedMatch, expectedDistance); + }); + } } diff --git a/FuzzySearchNet.Tests/Tests/FuzzySearchSubstitutionsOnlyTests.cs b/FuzzySearchNet.Tests/Tests/FuzzySearchSubstitutionsOnlyTests.cs index 22854d5..3275b58 100644 --- a/FuzzySearchNet.Tests/Tests/FuzzySearchSubstitutionsOnlyTests.cs +++ b/FuzzySearchNet.Tests/Tests/FuzzySearchSubstitutionsOnlyTests.cs @@ -1,4 +1,4 @@ -namespace FuzzySearchNet.Tests.Tests; +namespace FuzzySearchNet.Tests; public class FuzzySearchSubstitutionsOnlyTests { diff --git a/FuzzySearchNet/FuzzySearchNet.csproj b/FuzzySearchNet/FuzzySearchNet.csproj index 0560a44..7b17160 100644 --- a/FuzzySearchNet/FuzzySearchNet.csproj +++ b/FuzzySearchNet/FuzzySearchNet.csproj @@ -13,7 +13,7 @@ FuzzySearch.Net - 0.2.0 + 0.2.1 FuzzySearch.Net Verner Fortelius Fuzzy search library for finding strings in strings. Inspired by and attempts to be somewhat compatible with fuzzysearch for python https://github.com/taleinat/fuzzysearch @@ -22,7 +22,7 @@ https://github.com/vforteli/FuzzySearch.Net https://github.com/vforteli/FuzzySearch.Net https://github.com/vforteli/FuzzySearch.Net/blob/main/LICENSE.md - Clean up public methods, fix some bugs + Fix some bugs fuzzy search;levenshtein distance;dotnet;.net;c#;fuzzysearch.net true snupkg diff --git a/FuzzySearchNet/src/CandidateMatch.cs b/FuzzySearchNet/src/CandidateMatch.cs index 5351472..5c93fb1 100644 --- a/FuzzySearchNet/src/CandidateMatch.cs +++ b/FuzzySearchNet/src/CandidateMatch.cs @@ -1,3 +1,17 @@ namespace FuzzySearchNet; -public record struct CandidateMatch(int StartIndex, int TextIndex, int PatternIndex, int Distance, int Deletions, int Substitutions, int Insertions); \ No newline at end of file +public record struct CandidateMatch(int StartIndex, int TextIndex, int SubSequenceIndex = 0, int Position = 0, int Offset = 0, int Distance = 0, int Deletions = 0, int Substitutions = 0, int Insertions = 0); + +// using a record struct improves performance around 30% in benchmarks +//public record CandidateMatch +//{ +// public int StartIndex; +// public int TextIndex => StartIndex + Position; +// public int SubSequenceIndex => Position + Offset; +// public int Position = 0; +// public int Offset = 0; +// public int Deletions = 0; +// public int Substitutions = 0; +// public int Insertions = 0; +// public int Distance => Deletions + Insertions + Substitutions; +//} \ No newline at end of file diff --git a/FuzzySearchNet/src/FuzzySearch.cs b/FuzzySearchNet/src/FuzzySearch.cs index ac6c084..f273d2c 100644 --- a/FuzzySearchNet/src/FuzzySearch.cs +++ b/FuzzySearchNet/src/FuzzySearch.cs @@ -170,29 +170,31 @@ public static IEnumerable FindBuffering(string subSequence, string var matches = new List(); var candidates = new Stack(); - for (var currentIndex = 0; currentIndex <= text.Length - (subSequence.Length - 1); currentIndex++) + for (var currentIndex = 0; currentIndex < text.Length; currentIndex++) { - candidates.Push(new CandidateMatch(currentIndex, currentIndex, 0, 0, 0, 0, 0)); + candidates.Push(new CandidateMatch(currentIndex, currentIndex)); // Keep track of the best distance so far, this means we can ignore candidates with higher distance if we already have a match var bestFoundDistance = maxDistance; while (candidates.TryPop(out var candidate)) { - if (candidate.PatternIndex == subSequence.Length && candidate.Distance <= bestFoundDistance) + if (candidate.SubSequenceIndex == subSequence.Length) { - matches.Add(new MatchResult + if (candidate.TextIndex <= text.Length) { - StartIndex = candidate.StartIndex, - EndIndex = candidate.TextIndex, - Distance = candidate.Distance, - Match = text[candidate.StartIndex..candidate.TextIndex], - Deletions = candidate.Deletions, - Substitutions = candidate.Substitutions, - Insertions = candidate.Insertions, - }); - - bestFoundDistance = candidate.Distance; + bestFoundDistance = candidate.Distance; + matches.Add(new MatchResult + { + StartIndex = candidate.StartIndex, + EndIndex = candidate.TextIndex, + Distance = candidate.Distance, + Match = text[candidate.StartIndex..candidate.TextIndex], + Deletions = candidate.Deletions, + Substitutions = candidate.Substitutions, + Insertions = candidate.Insertions, + }); + } // No point searching for better matches if we find a perfect match if (candidate.Distance == 0) @@ -204,91 +206,54 @@ public static IEnumerable FindBuffering(string subSequence, string continue; } - if (candidate.TextIndex == text.Length) - { - continue; - } - - if (text[candidate.TextIndex] == subSequence[candidate.PatternIndex]) + if (candidate.SubSequenceIndex < subSequence.Length && candidate.TextIndex < text.Length && text[candidate.TextIndex] == subSequence[candidate.SubSequenceIndex]) { - candidates.Push(new CandidateMatch(candidate.StartIndex, candidate.TextIndex + 1, candidate.PatternIndex + 1, candidate.Distance, candidate.Deletions, candidate.Substitutions, candidate.Insertions)); + // match + candidates.Push(candidate with + { + Position = candidate.Position + 1, + TextIndex = candidate.TextIndex + 1, + SubSequenceIndex = candidate.SubSequenceIndex + 1, + }); if (candidate.Distance < bestFoundDistance) { + // jump over one character in text candidates.Push(candidate with { - PatternIndex = candidate.PatternIndex + 1, - Distance = candidate.Distance + 1, - Deletions = candidate.Deletions + 1, - }); - - candidates.Push(candidate with - { - TextIndex = candidate.TextIndex + 1, - Distance = candidate.Distance + 1, Insertions = candidate.Insertions + 1, + Distance = candidate.Distance + 1, + Position = candidate.Position + 2, + SubSequenceIndex = candidate.SubSequenceIndex + 1, + TextIndex = candidate.TextIndex + 2, + Offset = candidate.Offset - 1, }); } } - else + else if (candidate.Distance < bestFoundDistance) { - if (candidate.Distance < bestFoundDistance) + // substitute one character + candidates.Push(candidate with { - candidates.Push(candidate with - { - TextIndex = candidate.TextIndex + 1, - PatternIndex = candidate.PatternIndex + 1, - Distance = candidate.Distance + 1, - Substitutions = candidate.Substitutions + 1, - }); - - candidates.Push(candidate with - { - PatternIndex = candidate.PatternIndex + 1, - Distance = candidate.Distance + 1, - Deletions = candidate.Deletions + 1, - }); + Substitutions = candidate.Substitutions + 1, + Distance = candidate.Distance + 1, + Position = candidate.Position + 1, + TextIndex = candidate.TextIndex + 1, + SubSequenceIndex = candidate.SubSequenceIndex + 1, + }); - candidates.Push(candidate with - { - TextIndex = candidate.TextIndex + 1, - Distance = candidate.Distance + 1, - Insertions = candidate.Insertions + 1, - }); - } + // jump over one character in subsequence + candidates.Push(candidate with + { + Deletions = candidate.Deletions + 1, + Distance = candidate.Distance + 1, + Offset = candidate.Offset + 1, + SubSequenceIndex = candidate.SubSequenceIndex + 1, + }); } } } - matches = matches.Distinct().ToList(); - - if (matches.Count > 1) - { - var groups = new List>(); - - groups.Add(new List()); - - var match = matches[0]; - groups[0].Add(match); - - for (var i = 0; i < matches.Count - 1; i++) - { - var currentMatch = matches[i]; - if ((currentMatch.StartIndex + currentMatch.Insertions) >= (match.EndIndex - match.Insertions)) - { - groups.Add(new List()); - } - - groups.Last().Add(currentMatch); - - match = currentMatch; - } - - return groups.Select(o => o.OrderBy(o => o.Distance).ThenByDescending(o => o.Match.Length).First()).ToList(); - } - else - { - return matches; - } + return Utils.GetBestMatches(matches.OrderBy(o => o.StartIndex).ToList(), maxDistance); } } diff --git a/FuzzySearchNet/src/MatchResult.cs b/FuzzySearchNet/src/MatchResult.cs index 8e9857a..b98866f 100644 --- a/FuzzySearchNet/src/MatchResult.cs +++ b/FuzzySearchNet/src/MatchResult.cs @@ -1,6 +1,6 @@ namespace FuzzySearchNet; -public class MatchResult +public record MatchResult { public int StartIndex { get; set; } public int EndIndex { get; set; } diff --git a/FuzzySearchNet/src/Utils.cs b/FuzzySearchNet/src/Utils.cs new file mode 100644 index 0000000..10ae6fd --- /dev/null +++ b/FuzzySearchNet/src/Utils.cs @@ -0,0 +1,45 @@ +namespace FuzzySearchNet; + +public static class Utils +{ + /// + /// Group matches and return best. + /// Currently assumes the matches are in the same order they are found... + /// + /// + /// + public static IEnumerable GetBestMatches(List list, int maxDistanece) + { + var matches = list.Distinct().ToList(); + + if (matches.Count > 1) + { + var groups = new List>(); + + groups.Add(new List()); + + var match = matches[0]; + groups[0].Add(match); + + for (var i = 1; i < matches.Count; i++) + { + var currentMatch = matches[i]; + + if (currentMatch.StartIndex > (match.StartIndex + maxDistanece)) + { + groups.Add(new List()); + } + + groups.Last().Add(currentMatch); + + match = currentMatch; + } + + return groups.Select(o => o.OrderBy(o => o.Distance).ThenByDescending(o => o.Match.Length).First()).ToList(); + } + else + { + return matches; + } + } +} \ No newline at end of file