diff --git a/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs b/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs index 0dd305a..25de34e 100644 --- a/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs +++ b/FuzzySearchNet.Benchmark/BenchmarkFuzzySearch.cs @@ -20,5 +20,8 @@ public class BenchmarkFuzzySearch [Benchmark] - public void LevenshteinLong() => FuzzySearch.FindLevenshtein(term2, text, 3); + public void LevenshteinLong() + { + _ = FuzzySearch.FindLevenshtein(term2, text, 3).ToList(); + } } \ No newline at end of file diff --git a/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs b/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs index 2d9c430..41ca2e4 100644 --- a/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs +++ b/FuzzySearchNet.Tests/Tests/FuzzySearchLevenshteinTests.cs @@ -356,4 +356,28 @@ public void TestLevenshteinBufferBoundaryShort3Distance(string term, string text TestUtils.AssertMatch(results[0], expectedStartIndex, expectedMatch, expectedDistance); }); } + + + [Test] + public void TestLevenshteinLinq() + { + var text = "---abcc----abc---axc--"; + var term = "abc"; + + var results = FuzzySearch.FindLevenshtein(term, text, 2).ToList(); + + Assert.Multiple(() => + { + Assert.That(results.Count, Is.EqualTo(3)); + TestUtils.AssertMatch(results[0], 3, "abc", 0); + TestUtils.AssertMatch(results[1], 11, "abc", 0); + TestUtils.AssertMatch(results[2], 17, "axc", 1); + }); + + Assert.Multiple(() => + { + Assert.That(FuzzySearch.FindLevenshtein(term, text, 3).Any()); + TestUtils.AssertMatch(FuzzySearch.FindLevenshtein(term, text, 3).First(), 3, "abc", 0); + }); + } } diff --git a/FuzzySearchNet/FuzzySearchNet.csproj b/FuzzySearchNet/FuzzySearchNet.csproj index cbb719b..b945e2c 100644 --- a/FuzzySearchNet/FuzzySearchNet.csproj +++ b/FuzzySearchNet/FuzzySearchNet.csproj @@ -13,7 +13,7 @@ FuzzySearch.Net - 0.3.0 + 0.2.2 FuzzySearch.Net Verner Fortelius Fuzzy search library for finding strings in strings. Inspired by and attempts to be somewhat compatible with fuzzysearch for python https://github.com/taleinat/fuzzysearch @@ -22,7 +22,7 @@ https://github.com/vforteli/FuzzySearch.Net https://github.com/vforteli/FuzzySearch.Net https://github.com/vforteli/FuzzySearch.Net/blob/main/LICENSE.md - Add support for finding in streams + Add support for finding in streams for exact and substitution only. Properly use yield and ienumerable to allow usage of linq methods fuzzy search;levenshtein distance;dotnet;.net;c#;fuzzysearch.net true snupkg diff --git a/FuzzySearchNet/src/FuzzySearch.cs b/FuzzySearchNet/src/FuzzySearch.cs index db6490e..6632065 100644 --- a/FuzzySearchNet/src/FuzzySearch.cs +++ b/FuzzySearchNet/src/FuzzySearch.cs @@ -63,7 +63,7 @@ public static IEnumerable Find(string subSequence, string text, int /// public static IEnumerable FindExact(string subSequence, string text) { - // ok so this whole method is a bit redundant... but the idea is to have this using a stream instead of text... later + // indexof would probably run circles around this... var needlePosition = 0; var termLength = subSequence.Length - 1; var currentIndex = 0; @@ -167,7 +167,6 @@ public static async Task> FindExactAsync(string subSequ /// public static IEnumerable FindSubstitutionsOnly(string subSequence, string text, int maxDistance) { - var matches = new List(); var termLengthMinusOne = subSequence.Length - 1; for (var currentIndex = 0; currentIndex < text.Length - termLengthMinusOne; currentIndex++) @@ -191,7 +190,7 @@ public static IEnumerable FindSubstitutionsOnly(string subSequence, if (candidateDistance <= maxDistance) { - matches.Add(new MatchResult + yield return new MatchResult { StartIndex = currentIndex, EndIndex = currentIndex + subSequence.Length, @@ -200,11 +199,9 @@ public static IEnumerable FindSubstitutionsOnly(string subSequence, Deletions = 0, Substitutions = candidateDistance, Insertions = 0, - }); + }; } } - - return matches; } @@ -277,13 +274,21 @@ public static async Task> FindSubstitutionsOnlyAsync(st /// - /// Finds term in text with max distance + /// Finds sub sequence in text with max levenshtein distance /// /// /// - public static IEnumerable FindLevenshtein(string subSequence, string text, int maxDistance) + public static IEnumerable FindLevenshtein(string subSequence, string text, int maxDistance) => Utils.GetBestMatches(FindLevenshteinAll(subSequence, text, maxDistance), maxDistance); + + + /// + /// Finds sub sequence in text with max levenshtein distance + /// This method finds all matches and does not try to consolidate overlapping matches + /// + /// + /// + internal static IEnumerable FindLevenshteinAll(string subSequence, string text, int maxDistance) { - var matches = new List(); var candidates = new Stack(); for (var currentIndex = 0; currentIndex < text.Length; currentIndex++) @@ -300,7 +305,7 @@ public static IEnumerable FindLevenshtein(string subSequence, strin if (candidate.TextIndex <= text.Length) { bestFoundDistance = candidate.Distance; - matches.Add(new MatchResult + yield return new MatchResult { StartIndex = candidate.StartIndex, EndIndex = candidate.TextIndex, @@ -309,7 +314,7 @@ public static IEnumerable FindLevenshtein(string subSequence, strin Deletions = candidate.Deletions, Substitutions = candidate.Substitutions, Insertions = candidate.Insertions, - }); + }; } // No point searching for better matches if we find a perfect match @@ -369,7 +374,5 @@ public static IEnumerable FindLevenshtein(string subSequence, strin } } } - - return Utils.GetBestMatches(matches.OrderBy(o => o.StartIndex).ToList(), maxDistance); } } diff --git a/FuzzySearchNet/src/Utils.cs b/FuzzySearchNet/src/Utils.cs index 10ae6fd..cba6163 100644 --- a/FuzzySearchNet/src/Utils.cs +++ b/FuzzySearchNet/src/Utils.cs @@ -8,38 +8,40 @@ public static class Utils /// /// /// - public static IEnumerable GetBestMatches(List list, int maxDistanece) + public static IEnumerable GetBestMatches(IEnumerable matches, int maxDistanece) { - var matches = list.Distinct().ToList(); + var matchesEnumerator = matches.GetEnumerator(); - if (matches.Count > 1) - { - var groups = new List>(); + var group = new List(); - groups.Add(new List()); + if (matchesEnumerator.MoveNext()) + { + group.Add(matchesEnumerator.Current); - var match = matches[0]; - groups[0].Add(match); + var match = matchesEnumerator.Current; - for (var i = 1; i < matches.Count; i++) + while (matchesEnumerator.MoveNext()) { - var currentMatch = matches[i]; + var currentMatch = matchesEnumerator.Current; - if (currentMatch.StartIndex > (match.StartIndex + maxDistanece)) + if (currentMatch != null) { - groups.Add(new List()); - } + if (currentMatch.StartIndex > (match.StartIndex + maxDistanece)) + { + yield return group.OrderBy(o => o.Distance).ThenByDescending(o => o.Match.Length).First(); + group.Clear(); + } - groups.Last().Add(currentMatch); + group.Add(currentMatch); - match = currentMatch; + match = currentMatch; + } } - - return groups.Select(o => o.OrderBy(o => o.Distance).ThenByDescending(o => o.Match.Length).First()).ToList(); } - else + + if (group.Any()) { - return matches; + yield return group.OrderBy(o => o.Distance).ThenByDescending(o => o.Match.Length).First(); } } } \ No newline at end of file