diff --git a/docs/Captura de tela de 2025-01-14 21-49-10.png b/docs/Captura de tela de 2025-01-14 21-49-10.png
new file mode 100644
index 0000000..69aa1a0
Binary files /dev/null and b/docs/Captura de tela de 2025-01-14 21-49-10.png differ
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index d942936..89a8b1d 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -6,6 +6,7 @@ add_library(
   src/hello.cpp
   src/page_rank.cpp
   src/inverted_index.cpp
+  src/preprocessing/stemmer.cpp
 )
 
 target_include_directories(search_engine PUBLIC include)
diff --git a/lib/include/preprocessing/stemmer.h b/lib/include/preprocessing/stemmer.h
new file mode 100644
index 0000000..9626bdf
--- /dev/null
+++ b/lib/include/preprocessing/stemmer.h
@@ -0,0 +1,132 @@
+#ifndef STEMMER_H
+#define STEMMER_H
+
+// NOTE(review): the include targets were lost in the extracted diff; the ones
+// below are reconstructed from the names used here and in stemmer.cpp - confirm.
+#include <unicode/unistr.h>
+
+#include <codecvt>
+#include <locale>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace stemmer {
+
+/**
+ * @brief Represents a suffix transformation rule.
+ *
+ * A rule applies when the word ends with suffixToRemove and at least
+ * minimumStemSize bytes remain once the suffix is removed. Sizes are
+ * std::string sizes, i.e. UTF-8 bytes rather than characters.
+ */
+struct StepRule {
+  std::string suffixToRemove;  ///< Suffix to be removed.
+  int minimumStemSize;         ///< Minimum stem size after removing the suffix.
+  std::string replacement;     ///< Suffix replacement.
+};
+
+/// Rule lists keyed by the name of the reduction step they belong to.
+using RuleMap = std::map<std::string, std::vector<StepRule>>;
+
+/**
+ * @class RSPL
+ * @brief Implements the RSPL stemming algorithm.
+ */
+class RSPL {
+ public:
+  /**
+   * @brief Default constructor.
+   * Initializes predefined rules.
+   */
+  RSPL();
+
+  /**
+   * @brief Destructor.
+   */
+  ~RSPL();
+
+  /**
+   * @brief Executes the stemming process on a sentence or word.
+   *
+   * The text is lower-cased, split on single spaces, each word is reduced and
+   * stripped of accents, and the words are joined back with single spaces.
+   *
+   * @param sentence The sentence (or word) to be processed; it is rewritten
+   * in place. A null pointer is ignored.
+   */
+  void run(std::string* sentence);
+
+ private:
+  RuleMap ruleMap_;  ///< Mapping of rules for each reduction step.
+
+  /**
+   * @brief Mapping to normalize accented characters to their ASCII
+   * equivalents.
+   */
+  const std::unordered_map<wchar_t, wchar_t> accentMap_ = {
+      {L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'},
+      {L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'},
+      {L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'},
+      {L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'},
+      {L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'},
+      {L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'},
+      {L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'},
+      {L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'},
+      {L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'},
+      {L'Ç', L'C'}};
+
+  /**
+   * @brief Applies transformation rules to a word.
+   *
+   * Rules are tried in order; the first one whose suffix matches and whose
+   * minimum stem size is met rewrites the word, and no further rule is tried.
+   *
+   * @param word The word to be processed.
+   * @param rules The transformation rules to be applied.
+   * @return `true` if a rule was applied, otherwise `false`.
+   */
+  bool applyRules(std::string& word, const std::vector<StepRule>& rules);
+
+  /**
+   * @brief Checks if a word ends with 's'.
+   * @param word The word to be checked; may be empty.
+   * @return `true` if the word ends with 's', otherwise `false`.
+   */
+  bool endsWithS(const std::string& word);
+
+  /**
+   * @brief Checks if a word ends with 'a'.
+   * @param word The word to be checked; may be empty.
+   * @return `true` if the word ends with 'a', otherwise `false`.
+   */
+  bool endsWithA(const std::string& word);
+
+  /**
+   * @brief Splits a string on single space characters.
+   *
+   * Consecutive, leading or trailing spaces produce empty parts.
+   *
+   * @param s The string to be split; it is left unmodified.
+   * @return A vector containing the parts of the string.
+   */
+  std::vector<std::string> split(std::string& s);
+
+  /**
+   * @brief Removes accents from a UTF-8 string.
+   * @param input The input string.
+   * @return The string with every character found in accentMap_ replaced.
+   */
+  std::string removeAccents(const std::string& input);
+
+  /**
+   * @brief Converts a UTF-8 string to lower case in place.
+   * @param input Pointer to the input string; a null pointer is ignored.
+   */
+  void shrinkString(std::string* input);
+};
+
+}  // namespace stemmer
+
+#endif  // STEMMER_H
diff --git a/lib/src/preprocessing/stemmer.cpp b/lib/src/preprocessing/stemmer.cpp
new file mode 100644
index 0000000..521c21a
--- /dev/null
+++ b/lib/src/preprocessing/stemmer.cpp
@@ -0,0 +1,266 @@
+#include "preprocessing/stemmer.h"
+
+// NOTE(review): the include targets were lost in the extracted diff; these are
+// reconstructed from the names used below - confirm.
+#include <iostream>
+#include <numeric>
+
+namespace stemmer {
+
+namespace {
+
+/**
+ * @brief Looks up the rule list registered under a step name.
+ *
+ * Unlike RuleMap::operator[], an unknown key inserts nothing into the map.
+ *
+ * @param rules The rule map to search.
+ * @param key The step name.
+ * @return The registered rules, or an empty list when the key is unknown.
+ */
+const std::vector<StepRule>& rulesFor(const RuleMap& rules,
+                                      const std::string& key) {
+  static const std::vector<StepRule> kNoRules;
+  const auto it = rules.find(key);
+  return it == rules.end() ? kNoRules : it->second;
+}
+
+}  // namespace
+
+RSPL::RSPL() {
+  // The rules are initialized in the class constructor. Order matters inside
+  // each list: applyRules() stops at the first rule that applies.
+  std::vector<StepRule> plural_reduction_rules = {
+      {"ns", 1, "m"},   {"ões", 3, "ão"}, {"ães", 1, "ão"}, {"ais", 1, "al"},
+      {"éis", 2, "el"}, {"eis", 2, "el"}, {"óis", 2, "ol"}, {"is", 2, "il"},
+      {"is", 1, "l"},   {"res", 1, "r"},  {"s", 2, ""}};
+
+  std::vector<StepRule> feminine_reduction_rules = {
+      {"inha", 4, "inho"}, {"esa", 3, "es"}, {"osa", 3, "oso"},
+      {"na", 2, "no"},     {"da", 2, "do"},  {"va", 2, "vo"},
+      {"ia", 2, "io"}};
+
+  std::vector<StepRule> augmentative_reduction_rules = {
+      {"zão", 3, "z"},  {"são", 3, "s"},  {"ão", 2, "o"},
+      {"ona", 3, "on"}, {"ões", 3, "ão"}, {"íssimos", 7, "o"}};
+
+  std::vector<StepRule> diminutive_reduction_rules = {
+      {"zinho", 5, ""},  {"zinha", 6, ""}, {"zinhos", 6, ""},
+      {"zinhas", 7, ""}, {"inho", 4, ""},  {"inha", 5, ""}};
+
+  std::vector<StepRule> verb_conjugation_reduction_rules = {
+      // Infinitive reduction
+      {"ar", 2, ""},
+      {"er", 2, ""},
+      {"ir", 2, ""},
+
+      // Gerund reduction
+      {"ando", 4, ""},
+      {"endo", 4, ""},
+      {"indo", 4, ""},
+
+      // Past participle reduction
+      {"ado", 3, ""},
+      {"ido", 3, ""},
+
+      // Future tense reduction
+      {"arei", 4, "ar"},
+      {"erei", 4, "er"},
+      {"irei", 4, "ir"},
+      {"arás", 4, "ar"},
+      {"erás", 4, "er"},
+      {"irás", 4, "ir"},
+      {"ará", 3, "ar"},
+      {"erá", 3, "er"},
+      {"irá", 3, "ir"},
+      {"aremos", 6, "ar"},
+      {"eremos", 6, "er"},
+      {"iremos", 6, "ir"},
+      {"areis", 5, "ar"},
+      {"ereis", 5, "er"},
+      {"ireis", 5, "ir"},
+      {"arão", 4, "ar"},
+      {"erão", 4, "er"},
+      {"irão", 4, "ir"},
+
+      // Imperfect tense reduction
+      {"ava", 3, "ar"},
+      {"ia", 2, "er"},
+      {"ia", 2, "ir"},
+      {"ávamos", 6, "ar"},
+      {"íamos", 5, "er"},
+      {"íamos", 5, "ir"},
+      {"áveis", 5, "ar"},
+      {"íeis", 5, "er"},
+      {"íeis", 5, "ir"},
+
+      // Present tense reduction
+      {"o", 1, ""},
+      {"as", 2, "ar"},
+      {"es", 2, "er"},
+      {"es", 2, "ir"},
+      {"a", 1, "ar"},
+      {"e", 1, "er"},
+      {"e", 1, "ir"},
+      {"amos", 4, "ar"},
+      {"emos", 4, "er"},
+      {"imos", 4, "ir"},
+      {"ais", 3, "ar"},
+      {"eis", 3, "er"},
+      {"is", 2, "ir"},
+      {"am", 2, "ar"},
+      {"em", 2, "er"},
+      {"em", 2, "ir"}};
+
+  std::vector<StepRule> noun_reduction_rules = {
+      {"ezas", 4, "ez"},      {"ezes", 4, "ez"},     {"eza", 3, "ez"},
+      {"ez", 2, ""},          {"mentos", 6, "ment"}, {"mento", 5, "ment"},
+      {"idades", 7, "idade"}, {"idade", 6, ""},      {"ismos", 5, "ismo"},
+      {"ista", 4, ""},        {"istas", 5, ""},      {"ções", 4, "ção"},
+      {"ção", 3, ""}};
+
+  std::vector<StepRule> adverb_reduction_rules = {{"mente", 4, ""}};
+
+  std::vector<StepRule> remove_vowel_rules = {
+      {"a", 3, ""}, {"e", 3, ""}, {"o", 3, ""}};
+
+  // Rule sets used by run(), keyed by step name.
+  ruleMap_ = {
+      {"plural_reduction", plural_reduction_rules},
+      {"feminine_reduction", feminine_reduction_rules},
+      {"augmentative_reduction", augmentative_reduction_rules},
+      {"diminutive_reduction", diminutive_reduction_rules},
+      {"adverb_reduction", adverb_reduction_rules},
+      {"verb_conjugation_reduction", verb_conjugation_reduction_rules},
+      {"noun_reduction", noun_reduction_rules},
+      {"remove_vowel", remove_vowel_rules}};
+}
+
+RSPL::~RSPL() {}
+
+bool RSPL::endsWithA(const std::string& word) {
+  // back() on an empty string is undefined behaviour; split() can yield
+  // empty words (e.g. for consecutive spaces), so guard against them.
+  return !word.empty() && word.back() == 'a';
+}
+
+bool RSPL::endsWithS(const std::string& word) {
+  // Same empty-word guard as endsWithA().
+  return !word.empty() && word.back() == 's';
+}
+
+std::vector<std::string> RSPL::split(std::string& s) {
+  std::vector<std::string> tokens;
+  const char delimiter = ' ';
+  size_t start = 0;
+  size_t pos = 0;
+  // Walk the string with a moving start offset instead of erasing the
+  // consumed prefix, so the input is left untouched.
+  while ((pos = s.find(delimiter, start)) != std::string::npos) {
+    tokens.push_back(s.substr(start, pos - start));
+    start = pos + 1;
+  }
+  tokens.push_back(s.substr(start));
+
+  return tokens;
+}
+
+std::string RSPL::removeAccents(const std::string& input) {
+  // NOTE(review): std::wstring_convert / std::codecvt_utf8 are deprecated
+  // since C++17, and from_bytes() throws std::range_error on malformed input.
+  std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+  std::wstring wide = converter.from_bytes(input);
+
+  // Replace accented characters in place; everything else is kept as is.
+  for (wchar_t& ch : wide) {
+    const auto it = accentMap_.find(ch);
+    if (it != accentMap_.end())
+      ch = it->second;
+  }
+
+  // Convert back to a UTF-8 std::string.
+  return converter.to_bytes(wide);
+}
+
+void RSPL::shrinkString(std::string* input) {
+  if (!input)
+    return;  // Nothing to do for a null pointer.
+
+  icu::UnicodeString ustr(input->c_str(), "UTF-8");
+  ustr.toLower();
+  std::string result;
+  ustr.toUTF8String(result);
+  *input = result;
+}
+
+bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
+  for (const auto& rule : rules) {
+    const std::string& suffix = rule.suffixToRemove;
+    if (word.size() < suffix.size())
+      continue;
+
+    // Size of the stem left once the suffix is removed.
+    const size_t stem_size = word.size() - suffix.size();
+    if (word.compare(stem_size, suffix.size(), suffix) != 0)
+      continue;
+    if (stem_size < static_cast<size_t>(rule.minimumStemSize))
+      continue;
+
+    // Apply the rule: swap the suffix for its replacement.
+    word.replace(stem_size, suffix.size(), rule.replacement);
+    return true;
+  }
+  return false;  // No rule was applied.
+}
+
+void RSPL::run(std::string* sentence) {
+  // shrinkString() tolerates a null pointer, but the code below dereferences
+  // it, so bail out early.
+  if (!sentence)
+    return;
+
+  shrinkString(sentence);
+  std::vector<std::string> words = split(*sentence);
+
+  for (std::string& word : words) {
+    // Plural reduction
+    if (endsWithS(word))
+      applyRules(word, rulesFor(ruleMap_, "plural_reduction"));
+
+    // Feminine reduction
+    if (endsWithA(word))
+      applyRules(word, rulesFor(ruleMap_, "feminine_reduction"));
+
+    // Augmentative, diminutive and adverb reduction
+    applyRules(word, rulesFor(ruleMap_, "augmentative_reduction"));
+    applyRules(word, rulesFor(ruleMap_, "diminutive_reduction"));
+    applyRules(word, rulesFor(ruleMap_, "adverb_reduction"));
+
+    // Noun reduction. Only this result gates the steps below; the results
+    // of the earlier steps were never consulted.
+    bool rule_applied = applyRules(word, rulesFor(ruleMap_, "noun_reduction"));
+
+    if (!rule_applied) {
+      // NOTE(review): the verb rules are registered as
+      // "verb_conjugation_reduction", so this lookup finds nothing and the
+      // verb step is a no-op. The expectation in stemmer-test.cpp depends on
+      // that: with the verb list enabled "bate" would match {"e", 1, "er"}
+      // and become "bater" instead of "bat". Decide which of the two should
+      // change before correcting the key.
+      rule_applied = applyRules(word, rulesFor(ruleMap_, "verb_reduction"));
+      if (!rule_applied)
+        applyRules(word, rulesFor(ruleMap_, "remove_vowel"));  // Vowel removal
+    }
+
+    // Strip accents from the stemmed word.
+    word = removeAccents(word);
+  }
+
+  // Join the words back into a single space-separated string.
+  *sentence = std::accumulate(words.begin(), words.end(), std::string(""),
+                              [](const std::string& a, const std::string& b) {
+                                return a + (a.empty() ? "" : " ") + b;
+                              });
+}
+
+}  // namespace stemmer
diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt
index 34ca14c..3fe8801 100644
--- a/tests/unit-tests/CMakeLists.txt
+++ b/tests/unit-tests/CMakeLists.txt
@@ -15,6 +15,7 @@ add_executable(
   subtraction-test.cpp
   page_rank-test.cpp
   inverted_index-test.cpp
+  stemmer-test.cpp
 )
 
 target_include_directories(LibUnitTests PRIVATE ${CMAKE_SOURCE_DIR}/lib/include/)
diff --git a/tests/unit-tests/stemmer-test.cpp b/tests/unit-tests/stemmer-test.cpp
new file mode 100644
index 0000000..e90fa3a
--- /dev/null
+++ b/tests/unit-tests/stemmer-test.cpp
@@ -0,0 +1,28 @@
+#include <gtest/gtest.h>
+
+#include "preprocessing/stemmer.h"
+
+TEST(StemmerTest, TestRules) {
+  // Set up the stemmer.
+  stemmer::RSPL stemmer;
+  std::string sentence = "O coração bate rapidamente na cidade";
+  std::string expected_sentence = "o corac bat rapid na cidad";
+
+  stemmer.run(&sentence);
+  EXPECT_EQ(sentence, expected_sentence);
+}
+
+TEST(StemmerTest, ToleratesEmptyWordsAndNull) {
+  stemmer::RSPL stemmer;
+
+  // Consecutive spaces produce an empty word, which must not be inspected.
+  std::string sentence = "na  cidade";
+  stemmer.run(&sentence);
+  EXPECT_EQ(sentence, "na  cidad");
+
+  std::string empty;
+  stemmer.run(&empty);
+  EXPECT_EQ(empty, "");
+
+  stemmer.run(nullptr);  // Must be a no-op rather than a crash.
+}