Skip to content

Commit

Permalink
Tokenize number suffixes
Browse files Browse the repository at this point in the history
  • Loading branch information
leonardehrenfried committed Aug 7, 2024
1 parent d8c31db commit 45d6b68
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,146 +5,149 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;

class EnglishNgramAnalyzerTest {

@Test
void ngram() throws IOException {
var analyzer = new EnglishNGramAnalyzer();
List<String> result = analyze("Alexanderplatz", analyzer);
void ngram() {
List<String> result = tokenize("Alexanderplatz");

//System.out.println(result.stream().collect(Collectors.joining("\",\"", "\"", "\"")));
assertEquals(
List.of(
"Ale",
"Alex",
"Alexa",
"Alexan",
"Alexand",
"Alexande",
"Alexander",
"Alexanderp",
"lex",
"lexa",
"lexan",
"lexand",
"lexande",
"lexander",
"lexanderp",
"lexanderpl",
"exa",
"exan",
"exand",
"exande",
"exander",
"exanderp",
"exanderpl",
"exanderpla",
"xan",
"xand",
"xande",
"xander",
"xanderp",
"xanderpl",
"xanderpla",
"xanderplat",
"and",
"ande",
"ander",
"anderp",
"anderpl",
"anderpla",
"anderplat",
"anderplatz",
"nde",
"nder",
"nderp",
"nderpl",
"nderpla",
"nderplat",
"nderplatz",
"der",
"derp",
"derpl",
"derpla",
"derplat",
"derplatz",
"erp",
"erpl",
"erpla",
"erplat",
"erplatz",
"rpl",
"rpla",
"rplat",
"rplatz",
"pla",
"plat",
"platz",
"lat",
"latz",
"atz",
"Alexanderplatz"
),
result
);
}

@Test
void ampersand() throws IOException {
var analyzer = new EnglishNGramAnalyzer();
List<String> result = analyze("Meridian Ave N & N 148th St", analyzer);
void ampersand() {
List<String> result = tokenize("Meridian Ave N & N 148th St");

assertEquals(
List.of(
"Mer",
"Meri",
"Merid",
"Meridi",
"Meridia",
"Meridian",
"eri",
"erid",
"eridi",
"eridia",
"eridian",
"rid",
"ridi",
"ridia",
"ridian",
"idi",
"idia",
"idian",
"dia",
"dian",
"ian",
"Av",
"N",
"N",
"148",
"148t",
"148th",
"48t",
"48th",
"8th",
"St"
),
result
);
}

public List<String> analyze(String text, Analyzer analyzer) throws IOException {
List<String> result = new ArrayList<>();
TokenStream tokenStream = analyzer.tokenStream("name", text);
CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
result.add(attr.toString());
@ParameterizedTest
@CsvSource(
value = {
"1st:1",
"2nd:2",
"3rd:3",
"4th:4",
"6th:6",
"148th:148",
"102nd:102",
"1003rd:1003",
"St:St",
"Aard:Aard",
},
delimiter = ':'
)
void numberSuffixes(String input, String expected) {
var result = tokenize(input);
assertEquals(List.of(expected), result);
}

public List<String> tokenize(String text) {
try (var analyzer = new EnglishNGramAnalyzer()) {
List<String> result;
TokenStream tokenStream;
result = new ArrayList<>();
tokenStream = analyzer.tokenStream("name", text);
CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
result.add(attr.toString());
}
return result;
} catch (IOException e) {
throw new RuntimeException(e);
}
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ void stopClustersWithTypos(String searchTerm) {
@Test
void fuzzyStopClusters() {
  // the fuzzy query "arts" should match only the arts-center cluster, not "Arthur"
  var result1 = index.queryStopClusters("arts").map(primaryId()).toList();
  assertEquals(List.of(ARTS_CENTER.getId()), result1);
}

@Test
Expand Down Expand Up @@ -313,6 +313,7 @@ void agenciesAndFeedPublisher() {
"Meridian & N 148",
"Meridian Ave 148",
"Meridian Av 148",
"meridian av 148",
}
)
void shortTokens(String query) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package org.opentripplanner.ext.geocoder;

import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Analyzer that tokenizes input, removes English possessives and stop words, stems, and finally
 * splits each token into n-grams (length 4 to 10, plus the original token) so that a substring
 * of a stop name can be matched efficiently.
 * <p>
 * For example the query of "exanderpl" will match the stop name "Alexanderplatz".
 * <p>
 * It also removes number suffixes in the American street names, like "147th Street", which will
 * be tokenized to "147 Street".
 */
class EnglishNGramAnalyzer extends Analyzer {

  // Digits followed by a single English ordinal suffix, e.g. "1st", "102nd", "1003rd", "147th".
  // Group 1 captures the digits so the replacement "$1" drops the suffix. A non-capturing
  // alternation is required here: the character class "[st|nd|rd|th]+" would match any run of
  // the letters s, t, n, d, r, h (and '|'), wrongly stripping tokens like "42d" or "1900s".
  private static final Pattern NUMBER_SUFFIX_PATTERN = Pattern.compile("(\\d+)(?:st|nd|rd|th)");

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer src = new StandardTokenizer();
    TokenStream result = new EnglishPossessiveFilter(src);
    result = new LowerCaseFilter(result);
    // strip ordinal suffixes so "148th" is searchable as "148"
    result = new PatternReplaceFilter(result, NUMBER_SUFFIX_PATTERN, "$1", true);
    result = new StopFilter(result, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
    result = new PorterStemFilter(result);
    result = new CapitalizationFilter(result);
    result = new NGramTokenFilter(result, 4, 10, true);
    return new TokenStreamComponents(src, result);
  }
}

0 comments on commit 45d6b68

Please sign in to comment.