From 2dcee7703c8caafe98f6a4ab62aafd67ba820b63 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 6 Aug 2024 12:30:11 +0200 Subject: [PATCH 01/11] Add test case --- .../ext/geocoder/LuceneIndexTest.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 3e6c2b15195..4403d44679f 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -98,6 +98,10 @@ class LuceneIndexTest { .withCoordinate(52.52277, 13.41046) .build(); + static final RegularStop MERIDIAN_AVE = TEST_MODEL.stop("Meridian Ave N & N 148th").build(); + + static final RegularStop MERIDIAN_1 = TEST_MODEL.stop("Meridian N & Spencer").build(); + static LuceneIndex index; static StopClusterMapper mapper; @@ -113,7 +117,9 @@ static void setup() { LICHTERFELDE_OST_2, WESTHAFEN, ARTS_CENTER, - ARTHUR + ARTHUR, + MERIDIAN_AVE, + MERIDIAN_1 ) .forEach(stopModel::withRegularStop); List @@ -295,6 +301,15 @@ void agenciesAndFeedPublisher() { assertEquals(List.of(StopClusterMapper.toAgency(BVG)), cluster.primary().agencies()); assertEquals("A Publisher", cluster.primary().feedPublisher().name()); } + + @Test + void numbers() { + var result = index + .queryStopClusters("Meridian Ave N & N 148") + .map(s -> s.primary().name()) + .toList(); + assertEquals(List.of("Meridian Ave N & N 148th", "Meridian N & Spencer"), result); + } } private static @Nonnull Function primaryId() { From 1a641dda4b47cb01b804e90badfa7a6ea455189b Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 6 Aug 2024 12:59:09 +0200 Subject: [PATCH 02/11] Flesh out tests --- .../ext/geocoder/LuceneIndexTest.java | 46 ++++++++++++++----- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git 
a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 4403d44679f..582fb2409d7 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -12,7 +12,6 @@ import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; -import javax.annotation.Nonnull; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -98,9 +97,33 @@ class LuceneIndexTest { .withCoordinate(52.52277, 13.41046) .build(); - static final RegularStop MERIDIAN_AVE = TEST_MODEL.stop("Meridian Ave N & N 148th").build(); + static final RegularStop MERIDIAN_AVE = TEST_MODEL + .stop("Meridian Ave N & N 148th St") + .withId(FeedScopedId.parse("kcm:16340")) + .withCode("16340") + .withCoordinate(47.736145, -122.33445) + .build(); + + static final RegularStop MERIDIAN_N_1 = TEST_MODEL + .stop("Meridian N & Spencer") + .withId(FeedScopedId.parse("pierce:13268")) + .withCode("4168") + .withCoordinate(47.209366,-122.293999) + .build(); - static final RegularStop MERIDIAN_1 = TEST_MODEL.stop("Meridian N & Spencer").build(); + static final RegularStop MERIDIAN_N_2 = TEST_MODEL + .stop("Meridian N & Spencer") + .withId(FeedScopedId.parse("pierce:30976")) + .withCode("4169") + .withCoordinate(47.209316,-122.293841) + .build(); + + static final RegularStop MERIDIAN_N_3 = TEST_MODEL + .stop("N 205th St & Meridian Ave N") + .withId(FeedScopedId.parse("commtrans:490")) + .withCode("490") + .withCoordinate(47.209316,-122.293841) + .build(); static LuceneIndex index; @@ -118,8 +141,10 @@ static void setup() { WESTHAFEN, ARTS_CENTER, ARTHUR, - MERIDIAN_AVE, - MERIDIAN_1 + MERIDIAN_N_1, + MERIDIAN_N_2, + MERIDIAN_N_3, + MERIDIAN_AVE ) .forEach(stopModel::withRegularStop); List @@ -303,16 +328,13 @@ void agenciesAndFeedPublisher() { 
} @Test - void numbers() { - var result = index - .queryStopClusters("Meridian Ave N & N 148") - .map(s -> s.primary().name()) - .toList(); - assertEquals(List.of("Meridian Ave N & N 148th", "Meridian N & Spencer"), result); + void number() { + var names = index.queryStopClusters("Meridian Ave N & N 148").map(c -> c.primary().name()).toList(); + assertEquals(List.of(MERIDIAN_AVE.getName().toString(), MERIDIAN_N_1.getName().toString()), names); } } - private static @Nonnull Function primaryId() { + private static Function primaryId() { return c -> c.primary().id(); } } From 9c60311511a9a534bac2480b95029b0f3bc1fc01 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 6 Aug 2024 13:53:33 +0200 Subject: [PATCH 03/11] Add test for ampersand ngram --- .../geocoder/EnglishNgramAnalyzerTest.java | 34 +++++++++++++++++++ .../ext/geocoder/LuceneIndexTest.java | 16 ++++++--- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java index 615ef90cbbd..b4b145063f8 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -82,6 +82,40 @@ void ngram() throws IOException { ); } + @Test + void ampersand() throws IOException { + var analyzer = new EnglishNGramAnalyzer(); + List result = analyze("Meridian Ave N & N 148th St", analyzer); + + assertEquals( + List.of( + "Meri", + "Merid", + "Meridi", + "Meridia", + "Meridian", + "erid", + "eridi", + "eridia", + "eridian", + "ridi", + "ridia", + "ridian", + "idia", + "idian", + "dian", + "Av", + "N", + "N", + "148t", + "148th", + "48th", + "St" + ), + result + ); + } + public List analyze(String text, Analyzer analyzer) throws IOException { List result = new ArrayList<>(); TokenStream tokenStream = analyzer.tokenStream("name", text); diff 
--git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 582fb2409d7..80899c9d5d8 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -108,21 +108,21 @@ class LuceneIndexTest { .stop("Meridian N & Spencer") .withId(FeedScopedId.parse("pierce:13268")) .withCode("4168") - .withCoordinate(47.209366,-122.293999) + .withCoordinate(47.209366, -122.293999) .build(); static final RegularStop MERIDIAN_N_2 = TEST_MODEL .stop("Meridian N & Spencer") .withId(FeedScopedId.parse("pierce:30976")) .withCode("4169") - .withCoordinate(47.209316,-122.293841) + .withCoordinate(47.209316, -122.293841) .build(); static final RegularStop MERIDIAN_N_3 = TEST_MODEL .stop("N 205th St & Meridian Ave N") .withId(FeedScopedId.parse("commtrans:490")) .withCode("490") - .withCoordinate(47.209316,-122.293841) + .withCoordinate(47.777632, -122.3346) .build(); static LuceneIndex index; @@ -329,8 +329,14 @@ void agenciesAndFeedPublisher() { @Test void number() { - var names = index.queryStopClusters("Meridian Ave N & N 148").map(c -> c.primary().name()).toList(); - assertEquals(List.of(MERIDIAN_AVE.getName().toString(), MERIDIAN_N_1.getName().toString()), names); + var names = index + .queryStopClusters("Meridian Ave N & N 148") + .map(c -> c.primary().name()) + .toList(); + assertEquals( + List.of(MERIDIAN_AVE.getName().toString(), MERIDIAN_N_1.getName().toString()), + names + ); } } From e61d1c187cc625cb94b9743a4f831f7be748f74e Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 6 Aug 2024 15:36:17 +0200 Subject: [PATCH 04/11] Fine-tune fuzziness of Lucene indexing --- .../geocoder/EnglishNgramAnalyzerTest.java | 24 ++++++++++++++++++- .../ext/geocoder/LuceneIndexTest.java | 18 +++++++------- .../ext/geocoder/EnglishNGramAnalyzer.java | 2 +- .../ext/geocoder/LuceneIndex.java | 
2 +- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java index b4b145063f8..a1194c46e90 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -17,9 +18,9 @@ void ngram() throws IOException { var analyzer = new EnglishNGramAnalyzer(); List result = analyze("Alexanderplatz", analyzer); - //System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); assertEquals( List.of( + "Ale", "Alex", "Alexa", "Alexan", @@ -27,6 +28,7 @@ void ngram() throws IOException { "Alexande", "Alexander", "Alexanderp", + "lex", "lexa", "lexan", "lexand", @@ -34,6 +36,7 @@ void ngram() throws IOException { "lexander", "lexanderp", "lexanderpl", + "exa", "exan", "exand", "exande", @@ -41,6 +44,7 @@ void ngram() throws IOException { "exanderp", "exanderpl", "exanderpla", + "xan", "xand", "xande", "xander", @@ -48,6 +52,7 @@ void ngram() throws IOException { "xanderpl", "xanderpla", "xanderplat", + "and", "ande", "ander", "anderp", @@ -55,27 +60,34 @@ void ngram() throws IOException { "anderpla", "anderplat", "anderplatz", + "nde", "nder", "nderp", "nderpl", "nderpla", "nderplat", "nderplatz", + "der", "derp", "derpl", "derpla", "derplat", "derplatz", + "erp", "erpl", "erpla", "erplat", "erplatz", + "rpl", "rpla", "rplat", "rplatz", + "pla", "plat", "platz", + "lat", "latz", + "atz", "Alexanderplatz" ), result @@ -87,29 +99,39 @@ void ampersand() throws IOException { var analyzer = 
new EnglishNGramAnalyzer(); List result = analyze("Meridian Ave N & N 148th St", analyzer); + System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); assertEquals( List.of( + "Mer", "Meri", "Merid", "Meridi", "Meridia", "Meridian", + "eri", "erid", "eridi", "eridia", "eridian", + "rid", "ridi", "ridia", "ridian", + "idi", "idia", "idian", + "dia", "dian", + "ian", "Av", "N", "N", + "148", "148t", "148th", + "48t", "48th", + "8th", "St" ), result diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 80899c9d5d8..e94532a0589 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -12,6 +12,7 @@ import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -259,7 +260,7 @@ void stopClustersWithTypos(String searchTerm) { @Test void fuzzyStopClusters() { var result1 = index.queryStopClusters("arts").map(primaryId()).toList(); - assertEquals(List.of(ARTS_CENTER.getId()), result1); + assertEquals(List.of(ARTS_CENTER.getId(), ARTHUR.getId()), result1); } @Test @@ -327,14 +328,15 @@ void agenciesAndFeedPublisher() { assertEquals("A Publisher", cluster.primary().feedPublisher().name()); } - @Test - void number() { - var names = index - .queryStopClusters("Meridian Ave N & N 148") - .map(c -> c.primary().name()) - .toList(); + @ParameterizedTest + @ValueSource(strings = { "Meridian Ave N & N 148th", "Meridian Ave N & N 148" }) + void shortTokens(String query) { + var names = index.queryStopClusters(query).map(c -> c.primary().name()).toList(); assertEquals( - List.of(MERIDIAN_AVE.getName().toString(), MERIDIAN_N_1.getName().toString()), + Stream + 
.of(MERIDIAN_AVE, MERIDIAN_N_3, MERIDIAN_N_1) + .map(s -> s.getName().toString()) + .toList(), names ); } diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java index ffe46604744..26f6723a8b2 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java @@ -28,7 +28,7 @@ protected TokenStreamComponents createComponents(String fieldName) { result = new StopFilter(result, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); result = new PorterStemFilter(result); result = new CapitalizationFilter(result); - result = new NGramTokenFilter(result, 4, 10, true); + result = new NGramTokenFilter(result, 3, 10, true); return new TokenStreamComponents(src, result); } } diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java index fe7bef8ad13..71b80ac58a6 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/LuceneIndex.java @@ -288,7 +288,7 @@ private Stream matchingDocuments( } }); } else { - var nameParser = new QueryParser(NAME, analyzer); + var nameParser = new QueryParser(NAME_NGRAM, analyzer); var nameQuery = nameParser.parse(searchTerms); var ngramNameQuery = new TermQuery( From 2c13785294270ca3e39f2fa07f96f12c5589abb9 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 6 Aug 2024 15:47:33 +0200 Subject: [PATCH 05/11] Add more test instances --- .../org/opentripplanner/ext/geocoder/LuceneIndexTest.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index e94532a0589..6117b56ebdd 100644 --- 
a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -329,7 +329,9 @@ void agenciesAndFeedPublisher() { } @ParameterizedTest - @ValueSource(strings = { "Meridian Ave N & N 148th", "Meridian Ave N & N 148" }) + @ValueSource( + strings = { "Meridian Ave N & N 148th", "Meridian Ave N & N 148", "Meridian Ave N N 148" } + ) void shortTokens(String query) { var names = index.queryStopClusters(query).map(c -> c.primary().name()).toList(); assertEquals( From e7ec0f78e6d4efe1be132be32e8b9115049417bb Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 6 Aug 2024 15:50:33 +0200 Subject: [PATCH 06/11] Simplify test setup --- .../ext/geocoder/LuceneIndexTest.java | 40 +++---------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 6117b56ebdd..f14faa9b274 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -98,33 +98,9 @@ class LuceneIndexTest { .withCoordinate(52.52277, 13.41046) .build(); - static final RegularStop MERIDIAN_AVE = TEST_MODEL - .stop("Meridian Ave N & N 148th St") - .withId(FeedScopedId.parse("kcm:16340")) - .withCode("16340") - .withCoordinate(47.736145, -122.33445) - .build(); - - static final RegularStop MERIDIAN_N_1 = TEST_MODEL - .stop("Meridian N & Spencer") - .withId(FeedScopedId.parse("pierce:13268")) - .withCode("4168") - .withCoordinate(47.209366, -122.293999) - .build(); - - static final RegularStop MERIDIAN_N_2 = TEST_MODEL - .stop("Meridian N & Spencer") - .withId(FeedScopedId.parse("pierce:30976")) - .withCode("4169") - .withCoordinate(47.209316, -122.293841) - .build(); - - static final RegularStop MERIDIAN_N_3 = TEST_MODEL - .stop("N 205th St & Meridian Ave 
N") - .withId(FeedScopedId.parse("commtrans:490")) - .withCode("490") - .withCoordinate(47.777632, -122.3346) - .build(); + static final RegularStop MERIDIAN_AVE = TEST_MODEL.stop("Meridian Ave N & N 148th St").build(); + static final RegularStop MERIDIAN_N1 = TEST_MODEL.stop("Meridian N & Spencer").build(); + static final RegularStop MERIDIAN_N2 = TEST_MODEL.stop("N 205th St & Meridian Ave N").build(); static LuceneIndex index; @@ -142,9 +118,8 @@ static void setup() { WESTHAFEN, ARTS_CENTER, ARTHUR, - MERIDIAN_N_1, - MERIDIAN_N_2, - MERIDIAN_N_3, + MERIDIAN_N1, + MERIDIAN_N2, MERIDIAN_AVE ) .forEach(stopModel::withRegularStop); @@ -335,10 +310,7 @@ void agenciesAndFeedPublisher() { void shortTokens(String query) { var names = index.queryStopClusters(query).map(c -> c.primary().name()).toList(); assertEquals( - Stream - .of(MERIDIAN_AVE, MERIDIAN_N_3, MERIDIAN_N_1) - .map(s -> s.getName().toString()) - .toList(), + Stream.of(MERIDIAN_AVE, MERIDIAN_N2, MERIDIAN_N1).map(s -> s.getName().toString()).toList(), names ); } From 5b8cdc5a877afd52ef320bb56d21b855a7986c4a Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Tue, 6 Aug 2024 17:42:46 +0200 Subject: [PATCH 07/11] Move back the debug line --- .../opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java index a1194c46e90..e859069dbca 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -5,7 +5,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import 
org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -18,6 +17,7 @@ void ngram() throws IOException { var analyzer = new EnglishNGramAnalyzer(); List result = analyze("Alexanderplatz", analyzer); + //System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); assertEquals( List.of( "Ale", @@ -99,7 +99,6 @@ void ampersand() throws IOException { var analyzer = new EnglishNGramAnalyzer(); List result = analyze("Meridian Ave N & N 148th St", analyzer); - System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); assertEquals( List.of( "Mer", From d8c31dbcfe40d008d8ef45856c3a45729baa03f5 Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Wed, 7 Aug 2024 09:02:29 +0200 Subject: [PATCH 08/11] Add more test cases --- .../opentripplanner/ext/geocoder/LuceneIndexTest.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index f14faa9b274..6b9ed853ad1 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -305,7 +305,15 @@ void agenciesAndFeedPublisher() { @ParameterizedTest @ValueSource( - strings = { "Meridian Ave N & N 148th", "Meridian Ave N & N 148", "Meridian Ave N N 148" } + strings = { + "Meridian Ave N & N 148th", + "Meridian Ave N & N 148", + "Meridian Ave N N 148", + "Meridian Ave N 148", + "Meridian & N 148", + "Meridian Ave 148", + "Meridian Av 148", + } ) void shortTokens(String query) { var names = index.queryStopClusters(query).map(c -> c.primary().name()).toList(); From edea05e139f12b434bf5244e8d74e4daf432c64b Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Wed, 7 Aug 2024 11:48:22 +0200 Subject: [PATCH 09/11] Tokenize number suffixes --- .../geocoder/EnglishNgramAnalyzerTest.java | 80 
++++++++++--------- .../ext/geocoder/LuceneIndexTest.java | 3 +- .../ext/geocoder/EnglishNGramAnalyzer.java | 13 ++- 3 files changed, 55 insertions(+), 41 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java index e859069dbca..77917399647 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -5,22 +5,21 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; class EnglishNgramAnalyzerTest { @Test - void ngram() throws IOException { - var analyzer = new EnglishNGramAnalyzer(); - List result = analyze("Alexanderplatz", analyzer); + void ngram() { + List result = tokenize("Alexanderplatz"); //System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\""))); assertEquals( List.of( - "Ale", "Alex", "Alexa", "Alexan", @@ -28,7 +27,6 @@ void ngram() throws IOException { "Alexande", "Alexander", "Alexanderp", - "lex", "lexa", "lexan", "lexand", @@ -36,7 +34,6 @@ void ngram() throws IOException { "lexander", "lexanderp", "lexanderpl", - "exa", "exan", "exand", "exande", @@ -44,7 +41,6 @@ void ngram() throws IOException { "exanderp", "exanderpl", "exanderpla", - "xan", "xand", "xande", "xander", @@ -52,7 +48,6 @@ void ngram() throws IOException { "xanderpl", "xanderpla", "xanderplat", - "and", "ande", "ander", "anderp", @@ -60,34 +55,27 @@ void ngram() throws IOException { "anderpla", "anderplat", "anderplatz", - "nde", "nder", "nderp", "nderpl", "nderpla", "nderplat", 
"nderplatz", - "der", "derp", "derpl", "derpla", "derplat", "derplatz", - "erp", "erpl", "erpla", "erplat", "erplatz", - "rpl", "rpla", "rplat", "rplatz", - "pla", "plat", "platz", - "lat", "latz", - "atz", "Alexanderplatz" ), result @@ -95,56 +83,72 @@ void ngram() throws IOException { } @Test - void ampersand() throws IOException { - var analyzer = new EnglishNGramAnalyzer(); - List result = analyze("Meridian Ave N & N 148th St", analyzer); + void ampersand() { + List result = tokenize("Meridian Ave N & N 148th St"); assertEquals( List.of( - "Mer", "Meri", "Merid", "Meridi", "Meridia", "Meridian", - "eri", "erid", "eridi", "eridia", "eridian", - "rid", "ridi", "ridia", "ridian", - "idi", "idia", "idian", - "dia", "dian", - "ian", "Av", "N", "N", "148", - "148t", - "148th", - "48t", - "48th", - "8th", "St" ), result ); } - public List analyze(String text, Analyzer analyzer) throws IOException { - List result = new ArrayList<>(); - TokenStream tokenStream = analyzer.tokenStream("name", text); - CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); - tokenStream.reset(); - while (tokenStream.incrementToken()) { - result.add(attr.toString()); + @ParameterizedTest + @CsvSource( + value = { + "1st:1", + "2nd:2", + "3rd:3", + "4th:4", + "6th:6", + "148th:148", + "102nd:102", + "1003rd:1003", + "St:St", + "S3:S3", + "Aard:Aard", + }, + delimiter = ':' + ) + void numberSuffixes(String input, String expected) { + var result = tokenize(input); + assertEquals(List.of(expected), result); + } + + public List tokenize(String text) { + try (var analyzer = new EnglishNGramAnalyzer()) { + List result; + TokenStream tokenStream; + result = new ArrayList<>(); + tokenStream = analyzer.tokenStream("name", text); + CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + result.add(attr.toString()); + } + return result; + } catch (IOException e) { + throw new RuntimeException(e); 
} - return result; } } diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 6b9ed853ad1..f3af08f29fe 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -235,7 +235,7 @@ void stopClustersWithTypos(String searchTerm) { @Test void fuzzyStopClusters() { var result1 = index.queryStopClusters("arts").map(primaryId()).toList(); - assertEquals(List.of(ARTS_CENTER.getId(), ARTHUR.getId()), result1); + assertEquals(List.of(ARTS_CENTER.getId()), result1); } @Test @@ -313,6 +313,7 @@ void agenciesAndFeedPublisher() { "Meridian & N 148", "Meridian Ave 148", "Meridian Av 148", + "meridian av 148", } ) void shortTokens(String query) { diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java index 26f6723a8b2..922108427e8 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java @@ -1,14 +1,16 @@ package org.opentripplanner.ext.geocoder; +import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.pattern.PatternReplaceFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; /** @@ -17,18 +19,25 
@@ * of a stop name can be matched efficiently. *
<p>
* For example the query of "exanderpl" will match the stop name "Alexanderplatz". + *
<p>
+ * It also removes number suffixes in the American street names, like "147th Street", which will + * be tokenized to "147 Street". */ class EnglishNGramAnalyzer extends Analyzer { + // matches one or more numbers followed by the English suffixes "st", "nd", "rd", "th" + private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)[st|nd|rd|th]+"); + @Override protected TokenStreamComponents createComponents(String fieldName) { StandardTokenizer src = new StandardTokenizer(); TokenStream result = new EnglishPossessiveFilter(src); result = new LowerCaseFilter(result); + result = new PatternReplaceFilter(result, NUMBER_SUFFIX_PATTERN, "$1", true); result = new StopFilter(result, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); result = new PorterStemFilter(result); result = new CapitalizationFilter(result); - result = new NGramTokenFilter(result, 3, 10, true); + result = new NGramTokenFilter(result, 4, 10, true); return new TokenStreamComponents(src, result); } } From f0a2c6c39c29455edff0ca9dc10ab10acaed30df Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Fri, 9 Aug 2024 10:37:08 +0200 Subject: [PATCH 10/11] Improve regexes and test names --- .../ext/geocoder/EnglishNgramAnalyzerTest.java | 8 +++++++- .../org/opentripplanner/ext/geocoder/LuceneIndexTest.java | 2 +- .../ext/geocoder/EnglishNGramAnalyzer.java | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java index 77917399647..2c352e0f760 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/EnglishNgramAnalyzerTest.java @@ -135,7 +135,13 @@ void numberSuffixes(String input, String expected) { assertEquals(List.of(expected), result); } - public List tokenize(String text) { + @Test + void wordBoundary() { + var result = 
tokenize("1stst"); + assertEquals(List.of("1sts", "1stst", "stst"), result); + } + + private List tokenize(String text) { try (var analyzer = new EnglishNGramAnalyzer()) { List result; TokenStream tokenStream; diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index f3af08f29fe..910c5080331 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -316,7 +316,7 @@ void agenciesAndFeedPublisher() { "meridian av 148", } ) - void shortTokens(String query) { + void numericAdjectives(String query) { var names = index.queryStopClusters(query).map(c -> c.primary().name()).toList(); assertEquals( Stream.of(MERIDIAN_AVE, MERIDIAN_N2, MERIDIAN_N1).map(s -> s.getName().toString()).toList(), diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java index 922108427e8..a3ef8440a18 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java @@ -26,7 +26,7 @@ class EnglishNGramAnalyzer extends Analyzer { // matches one or more numbers followed by the English suffixes "st", "nd", "rd", "th" - private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)[st|nd|rd|th]+"); + private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)(st|nd|rd|th)\\b"); @Override protected TokenStreamComponents createComponents(String fieldName) { From bf02dae9470adb2e68d5088e3f93cfcfd300c09e Mon Sep 17 00:00:00 2001 From: Leonard Ehrenfried Date: Fri, 9 Aug 2024 14:55:29 +0200 Subject: [PATCH 11/11] Apply review comments --- .../java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java | 2 ++ .../org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java | 
2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java index 910c5080331..de6e600037c 100644 --- a/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java +++ b/src/ext-test/java/org/opentripplanner/ext/geocoder/LuceneIndexTest.java @@ -310,6 +310,8 @@ void agenciesAndFeedPublisher() { "Meridian Ave N & N 148", "Meridian Ave N N 148", "Meridian Ave N 148", + "Meridian & 148 N", + "148 N & Meridian", "Meridian & N 148", "Meridian Ave 148", "Meridian Av 148", diff --git a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java index a3ef8440a18..17bf529a559 100644 --- a/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java +++ b/src/ext/java/org/opentripplanner/ext/geocoder/EnglishNGramAnalyzer.java @@ -25,7 +25,7 @@ */ class EnglishNGramAnalyzer extends Analyzer { - // matches one or more numbers followed by the English suffixes "st", "nd", "rd", "th" + // Matches one or more numbers followed by the English suffixes "st", "nd", "rd", "th" private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)(st|nd|rd|th)\\b"); @Override