Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve geocoder matches for numeric adjectives #5997

Merged
merged 11 commits into from
Aug 9, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;

class EnglishNgramAnalyzerTest {

@Test
void ngram() throws IOException {
var analyzer = new EnglishNGramAnalyzer();
List<String> result = analyze("Alexanderplatz", analyzer);
void ngram() {
List<String> result = tokenize("Alexanderplatz");

//System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\"")));
assertEquals(
Expand Down Expand Up @@ -82,14 +82,79 @@ void ngram() throws IOException {
);
}

public List<String> analyze(String text, Analyzer analyzer) throws IOException {
List<String> result = new ArrayList<>();
TokenStream tokenStream = analyzer.tokenStream("name", text);
CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
result.add(attr.toString());
/**
 * Tokenizes an intersection-style stop name that contains an ampersand.
 * <p>
 * Per the expected list, only the long word "Meridian" is expanded into n-grams; the ampersand
 * produces no token, and the short words come out once each, with "Ave" reduced to "Av" and
 * "148th" reduced to "148".
 */
@Test
void ampersand() {
  List<String> expected = List.of(
    "Meri",
    "Merid",
    "Meridi",
    "Meridia",
    "Meridian",
    "erid",
    "eridi",
    "eridia",
    "eridian",
    "ridi",
    "ridia",
    "ridian",
    "idia",
    "idian",
    "dian",
    "Av",
    "N",
    "N",
    "148",
    "St"
  );

  assertEquals(expected, tokenize("Meridian Ave N & N 148th St"));
}

/**
 * Checks that an English ordinal suffix directly after digits is dropped ("148th" becomes "148"),
 * while inputs without that shape ("St", "S3", "Aard") come back unchanged.
 * <p>
 * Each case is written as {@code input:expected} and must yield exactly one token.
 */
@ParameterizedTest
@CsvSource(
  delimiter = ':',
  value = {
    "1st:1",
    "2nd:2",
    "3rd:3",
    "4th:4",
    "6th:6",
    "148th:148",
    "102nd:102",
    "1003rd:1003",
    "St:St",
    "S3:S3",
    "Aard:Aard",
  }
)
void numberSuffixes(String input, String expected) {
  List<String> tokens = tokenize(input);
  assertEquals(List.of(expected), tokens);
}

/**
 * A suffix-like sequence in the middle of a token must be left alone: every expected n-gram of
 * "1stst" still contains the "st" that follows the digit.
 * NOTE(review): presumably this holds because the suffix pattern only matches at a word
 * boundary - confirm against the analyzer.
 */
@Test
void wordBoundary() {
  List<String> expected = List.of("1sts", "1stst", "stst");
  assertEquals(expected, tokenize("1stst"));
}

/**
 * Runs {@code text} through a fresh {@link EnglishNGramAnalyzer} and collects the emitted terms.
 *
 * @param text the raw input to analyze
 * @return the terms produced for the "name" field, in the order the token stream emitted them
 * @throws RuntimeException wrapping any {@link IOException} raised while consuming or closing
 *                          the token stream
 */
private List<String> tokenize(String text) {
  // Both the analyzer and the token stream are closeable; try-with-resources releases them in
  // reverse order (stream first, then analyzer) even when consumption fails.
  try (
    var analyzer = new EnglishNGramAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("name", text)
  ) {
    List<String> result = new ArrayList<>();
    CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      result.add(attr.toString());
    }
    // Lucene's consumer workflow expects end() after the last incrementToken() and before close().
    tokenStream.end();
    return result;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import java.util.stream.Stream;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -98,6 +98,10 @@ class LuceneIndexTest {
.withCoordinate(52.52277, 13.41046)
.build();

// Stops named after street intersections that all contain the word "Meridian". The
// numericAdjectives test queries them and asserts the order in which they are returned.
static final RegularStop MERIDIAN_AVE = TEST_MODEL.stop("Meridian Ave N & N 148th St").build();
static final RegularStop MERIDIAN_N1 = TEST_MODEL.stop("Meridian N & Spencer").build();
static final RegularStop MERIDIAN_N2 = TEST_MODEL.stop("N 205th St & Meridian Ave N").build();

static LuceneIndex index;

static StopClusterMapper mapper;
Expand All @@ -113,7 +117,10 @@ static void setup() {
LICHTERFELDE_OST_2,
WESTHAFEN,
ARTS_CENTER,
ARTHUR
ARTHUR,
MERIDIAN_N1,
MERIDIAN_N2,
MERIDIAN_AVE
)
.forEach(stopModel::withRegularStop);
List
Expand Down Expand Up @@ -295,9 +302,32 @@ void agenciesAndFeedPublisher() {
assertEquals(List.of(StopClusterMapper.toAgency(BVG)), cluster.primary().agencies());
assertEquals("A Publisher", cluster.primary().feedPublisher().name());
}

/**
 * Every query variant (with or without the ordinal suffix, the ampersand, "Ave"/"Av", and in
 * either word order or case) must return the three Meridian stops in the same order:
 * MERIDIAN_AVE first, then MERIDIAN_N2, then MERIDIAN_N1.
 */
@ParameterizedTest
@ValueSource(
  strings = {
    "Meridian Ave N & N 148th",
    "Meridian Ave N & N 148",
    "Meridian Ave N N 148",
    "Meridian Ave N 148",
    "Meridian & 148 N",
    "148 N & Meridian",
    "Meridian & N 148",
    "Meridian Ave 148",
    "Meridian Av 148",
    "meridian av 148",
  }
)
void numericAdjectives(String query) {
  var actualNames = index
    .queryStopClusters(query)
    .map(cluster -> cluster.primary().name())
    .toList();
  var expectedNames = Stream
    .of(MERIDIAN_AVE, MERIDIAN_N2, MERIDIAN_N1)
    .map(stop -> stop.getName().toString())
    .toList();

  assertEquals(expectedNames, actualNames);
}
}

private static @Nonnull Function<StopCluster, FeedScopedId> primaryId() {
private static Function<StopCluster, FeedScopedId> primaryId() {
return c -> c.primary().id();
}
}
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package org.opentripplanner.ext.geocoder;

import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
Expand All @@ -17,14 +19,21 @@
* of a stop name can be matched efficiently.
* <p>
* For example the query of "exanderpl" will match the stop name "Alexanderplatz".
* <p>
* It also strips English ordinal suffixes from numbers in American-style street names, so
* "147th Street" is tokenized like "147 Street".
*/
class EnglishNGramAnalyzer extends Analyzer {

// Matches one or more digits followed by an English ordinal suffix ("st", "nd", "rd", "th") at a word boundary
private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)(st|nd|rd|th)\\b");

@Override
protected TokenStreamComponents createComponents(String fieldName) {
StandardTokenizer src = new StandardTokenizer();
TokenStream result = new EnglishPossessiveFilter(src);
result = new LowerCaseFilter(result);
result = new PatternReplaceFilter(result, NUMBER_SUFFIX_PATTERN, "$1", true);
result = new StopFilter(result, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
result = new PorterStemFilter(result);
result = new CapitalizationFilter(result);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ private Stream<Document> matchingDocuments(
}
});
} else {
var nameParser = new QueryParser(NAME, analyzer);
var nameParser = new QueryParser(NAME_NGRAM, analyzer);
var nameQuery = nameParser.parse(searchTerms);

var ngramNameQuery = new TermQuery(
Expand Down
Loading