From f8d8308cdb8db80819be7eeed5652cc4a995cc71 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 24 Sep 2024 09:30:54 +0200 Subject: [PATCH] Use binary search --- .../src/normalizer/character_converter.rs | 107 +++++++++++++++--- 1 file changed, 91 insertions(+), 16 deletions(-) diff --git a/charabia/src/normalizer/character_converter.rs b/charabia/src/normalizer/character_converter.rs index fead6f8..0494462 100644 --- a/charabia/src/normalizer/character_converter.rs +++ b/charabia/src/normalizer/character_converter.rs @@ -1,6 +1,30 @@ use super::{CharNormalizer, CharOrStr}; use crate::{Script, Token}; +const CHAR_PAIRS: &[(char, Option<(char, Option<char>)>)] = &[ + ('Æ', Some(('a', Some('e')))), + #[cfg(feature = "vietnamese")] + ('Ð', Some(('d', None))), + ('æ', Some(('a', Some('e')))), + #[cfg(feature = "vietnamese")] + ('ð', Some(('d', None))), + #[cfg(feature = "vietnamese")] + ('Đ', Some(('d', None))), + #[cfg(feature = "vietnamese")] + ('đ', Some(('d', None))), + #[cfg(feature = "turkish")] + ('ı', Some(('i', None))), + ('Œ', Some(('o', Some('e')))), + ('œ', Some(('o', Some('e')))), + ('ة', Some(('ه', None))), + ('ـ', None), + ('ٱ', Some(('ا', None))), + ('ى', Some(('ي', None))), + ('‘', Some(('\'', None))), + ('’', Some(('\'', None))), + ('‛', Some(('\'', None))), +]; + /// This module contains the implementation of the `CharacterConverterNormalizer` struct, which is a character normalizer pub struct CharacterConverterNormalizer; @@ -31,29 +55,37 @@ impl CharNormalizer for CharacterConverterNormalizer { // Returns `true` if the Normalizer should be used. 
fn should_normalize(&self, token: &Token) -> bool { - true + token + .lemma + .chars() + .any(|c| c.is_uppercase() || CHAR_PAIRS.binary_search_by(|(k, _)| k.cmp(&c)).is_ok()) } } fn normalize_char(c: char) -> Option<CharOrStr> { - match c { - 'œ' | 'Œ' => Some("oe".to_string().into()), - 'æ' | 'Æ' => Some("ae".to_string().into()), - 'ـ' => None, - 'ٱ' => Some('ا'.into()), - 'ى' => Some('ي'.into()), - 'ة' => Some('ه'.into()), - '’' | '‘' | '‛' => Some('\''.into()), - #[cfg(feature = "turkish")] - 'ı' => Some('i'.into()), - #[cfg(feature = "vietnamese")] - 'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()), + match CHAR_PAIRS.binary_search_by(|(k, _)| k.cmp(&c)).map(|i| &CHAR_PAIRS[i].1) { + Ok(Some((first, Some(second)))) => { + Some(CharOrStr::Char(*first).merge(&CharOrStr::Char(*second))) + } + Ok(Some((first, None))) => Some(CharOrStr::Char(*first)), + Ok(None) => None, _ => Some(c.into()), } -} -fn is_control(c: char) -> bool { - c.is_control() && !c.is_whitespace() + // match c { + // 'œ' | 'Œ' => Some("oe".to_string().into()), + // 'æ' | 'Æ' => Some("ae".to_string().into()), + // 'ـ' => None, + // 'ٱ' => Some('ا'.into()), + // 'ى' => Some('ي'.into()), + // 'ة' => Some('ه'.into()), + // '’' | '‘' | '‛' => Some('\''.into()), + // #[cfg(feature = "turkish")] + // 'ı' => Some('i'.into()), + // #[cfg(feature = "vietnamese")] + // 'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()), + // _ => Some(c.into()), + // } } // Test the normalizer: @@ -96,6 +128,14 @@ mod test { script: Script::Latin, ..Default::default() }, + // Taa Marbuta + Token { + lemma: Owned("النهاردة".to_string()), + char_end: 8, + byte_end: 16, + script: Script::Arabic, + ..Default::default() + }, ] } @@ -134,6 +174,23 @@ mod test { char_map: Some(vec![(2, 2)]), ..Default::default() }, + Token { + lemma: Owned("النهارده".to_string()), + char_end: 8, + byte_end: 16, + char_map: Some(vec![ + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + ]), + script: Script::Arabic, + 
..Default::default() + }, ] } @@ -176,6 +233,24 @@ mod test { kind: TokenKind::Word, ..Default::default() }, + Token { + lemma: Owned("النهارده".to_string()), + char_end: 8, + byte_end: 16, + char_map: Some(vec![ + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + ]), + script: Script::Arabic, + kind: TokenKind::Word, + ..Default::default() + }, ] }