BianchTech · pedrobiqua · Jan 19, 2025 · Jan 17, 2025 · Jan 17, 2025 · Jan 19, 2025
diff --git a/docs/Captura de tela de 2025-01-14 21-49-10.png b/docs/Captura de tela de 2025-01-14 21-49-10.png
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
@@ -6,6 +6,7 @@ add_library(
             src/hello.cpp
             src/page_rank.cpp
             src/inverted_index.cpp
+            src/preprocessing/stemmer.cpp
 )
 
 target_include_directories(search_engine PUBLIC include)

diff --git a/lib/include/preprocessing/stemmer.h b/lib/include/preprocessing/stemmer.h
@@ -0,0 +1,114 @@
+#ifndef STEMMER_H
+#define STEMMER_H
+
+#include <unicode/locid.h>
+#include <unicode/unistr.h>
+#include <unicode/ustream.h>
+
+#include <iostream>
+#include <map>
+#include <numeric>
+#include <unordered_map>
+#include <vector>
+
+namespace stemmer {
+
+/**
+ * @brief One suffix rewrite rule: cut a suffix off and append a replacement.
+ */
+struct StepRule {
+    std::string suffixToRemove;  ///< Suffix the word must end with.
+    int minimumStemSize;      ///< Minimum stem length (bytes) left after the cut.
+    std::string replacement;  ///< Text appended to the stem (may be empty).
+};
+
+using RuleMap = std::map<std::string, std::vector<StepRule>>;
+
+/**
+ * @class RSPL
+ * @brief Rule-based suffix stemmer (built-in tables hold Portuguese suffixes).
+ */
+class RSPL {
+   public:
+    /**
+     * @brief Default constructor.
+     * Fills ruleMap_ with the built-in suffix rule groups.
+     */
+    RSPL();
+
+    /**
+     * @brief Destructor.
+     */
+    ~RSPL();
+
+    /**
+     * @brief Stems each space-separated word in place; echoes to std::cout.
+     * @param sentence In/out text; overwritten with the stemmed words.
+     */
+    void run(std::string* sentence);
+
+   private:
+    RuleMap ruleMap_;  ///< Rule groups keyed by group name.
+
+    /**
+     * @brief Maps accented letters to their unaccented counterparts (case is
+     * kept); consulted by removeAccents().
+     */
+    const std::unordered_map<wchar_t, wchar_t> accentMap_ = {
+        {L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'},
+        {L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'},
+        {L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'},
+        {L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'},
+        {L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'},
+        {L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'},
+        {L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'},
+        {L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'},
+        {L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'},
+        {L'Ç', L'C'}};
+
+    /**
+     * @brief Applies the first applicable rule of a group to a word.
+     * @param word The word to rewrite in place.
+     * @param rules Candidate rules, tried in the order given.
+     * @return `true` if a rule was applied, otherwise `false`.
+     */
+    bool applyRules(std::string& word, const std::vector<StepRule>& rules);
+
+    /**
+     * @brief Checks if a word ends with 's'.
+     * @param word The word to be checked.
+     * @return `true` if the word ends with 's', otherwise `false`.
+     */
+    bool endsWithS(const std::string& word);
+
+    /**
+     * @brief Checks if a word ends with 'a'.
+     * @param word The word to be checked.
+     * @return `true` if the word ends with 'a', otherwise `false`.
+     */
+    bool endsWithA(const std::string& word);
+
+    /**
+     * @brief Splits a string on single space characters.
+     * @param s The string to split; on return it holds only the last token.
+     * @return The tokens in order; consecutive spaces yield empty tokens.
+     */
+    std::vector<std::string> split(std::string& s);
+
+    /**
+     * @brief Replaces the characters listed in accentMap_ in a UTF-8 string.
+     * @param input The input string (decoded as UTF-8).
+     * @return A copy of the input with the mapped characters substituted.
+     */
+    std::string removeAccents(const std::string& input);
+
+    /**
+     * @brief Converts a UTF-8 string to lowercase in place.
+     * @param input Pointer to the string to lowercase; null is ignored.
+     */
+    void shrinkString(std::string* input);
+};
+
+}  // namespace stemmer
+
+#endif  // STEMMER_H
diff --git a/lib/src/preprocessing/stemmer.cpp b/lib/src/preprocessing/stemmer.cpp
@@ -0,0 +1,246 @@
+#include "preprocessing/stemmer.h"
+#include <codecvt>
+#include <locale>
+
+namespace stemmer {
+RSPL::RSPL() {
+    // Rule tables. Order matters: applyRules() stops at the first match.
+    std::vector<StepRule> plural_reduction_rules = {
+        {"ns", 1, "m"},   {"ões", 3, "ão"}, {"ães", 1, "ão"}, {"ais", 1, "al"},
+        {"éis", 2, "el"}, {"eis", 2, "el"}, {"óis", 2, "ol"}, {"is", 2, "il"},
+        {"is", 1, "l"},   {"res", 1, "r"},  {"s", 2, ""}};
+
+    std::vector<StepRule> feminine_reduction_rules = {
+        {"inha", 4, "inho"}, {"esa", 3, "es"}, {"osa", 3, "oso"},
+        {"na", 2, "no"},     {"da", 2, "do"},  {"va", 2, "vo"},
+        {"ia", 2, "io"}};
+
+    std::vector<StepRule> augmentative_reduction_rules = {
+        {"zão", 3, "z"},  {"são", 3, "s"},  {"ão", 2, "o"},
+        {"ona", 3, "on"}, {"ões", 3, "ão"}, {"íssimos", 7, "o"}};
+
+    std::vector<StepRule> diminutive_reduction_rules = {
+        {"zinho", 5, ""},  {"zinha", 6, ""}, {"zinhos", 6, ""},
+        {"zinhas", 7, ""}, {"inho", 4, ""},  {"inha", 5, ""}};
+
+    std::vector<StepRule> verb_conjugation_reduction_rules = {
+        // Infinitive reduction
+        {"ar", 2, ""},
+        {"er", 2, ""},
+        {"ir", 2, ""},
+
+        // Gerund reduction
+        {"ando", 4, ""},
+        {"endo", 4, ""},
+        {"indo", 4, ""},
+
+        // Past participle reduction
+        {"ado", 3, ""},
+        {"ido", 3, ""},
+
+        // Future tense reduction
+        {"arei", 4, "ar"},
+        {"erei", 4, "er"},
+        {"irei", 4, "ir"},
+        {"arás", 4, "ar"},
+        {"erás", 4, "er"},
+        {"irás", 4, "ir"},
+        {"ará", 3, "ar"},
+        {"erá", 3, "er"},
+        {"irá", 3, "ir"},
+        {"aremos", 6, "ar"},
+        {"eremos", 6, "er"},
+        {"iremos", 6, "ir"},
+        {"areis", 5, "ar"},
+        {"ereis", 5, "er"},
+        {"ireis", 5, "ir"},
+        {"arão", 4, "ar"},
+        {"erão", 4, "er"},
+        {"irão", 4, "ir"},
+
+        // Imperfect tense reduction
+        {"ava", 3, "ar"},
+        {"ia", 2, "er"},
+        {"ia", 2, "ir"},
+        {"ávamos", 6, "ar"},
+        {"íamos", 5, "er"},
+        {"íamos", 5, "ir"},
+        {"áveis", 5, "ar"},
+        {"íeis", 5, "er"},
+        {"íeis", 5, "ir"},
+
+        // Present tense reduction
+        {"o", 1, ""},
+        {"as", 2, "ar"},
+        {"es", 2, "er"},
+        {"es", 2, "ir"},
+        {"a", 1, "ar"},
+        {"e", 1, "er"},
+        {"e", 1, "ir"},
+        {"amos", 4, "ar"},
+        {"emos", 4, "er"},
+        {"imos", 4, "ir"},
+        {"ais", 3, "ar"},
+        {"eis", 3, "er"},
+        {"is", 2, "ir"},
+        {"am", 2, "ar"},
+        {"em", 2, "er"},
+        {"em", 2, "ir"}};
+
+    std::vector<StepRule> noun_reduction_rules = {
+        {"ezas", 4, "ez"},      {"ezes", 4, "ez"},     {"eza", 3, "ez"},
+        {"ez", 2, ""},          {"mentos", 6, "ment"}, {"mento", 5, "ment"},
+        {"idades", 7, "idade"}, {"idade", 6, ""},      {"ismos", 5, "ismo"},
+        {"ista", 4, ""},        {"istas", 5, ""},      {"ções", 4, "ção"},
+        {"ção", 3, ""}};
+
+    std::vector<StepRule> adverb_reduction_rules = {{"mente", 4, ""}};
+
+    std::vector<StepRule> remove_vowel_rules = {
+        {"a", 3, ""}, {"e", 3, ""}, {"o", 3, ""}};
+
+    // Register the tables by name; run() selects them by string key.
+    ruleMap_ = {
+        {"plural_reduction", plural_reduction_rules},
+        {"feminine_reduction", feminine_reduction_rules},
+        {"augmentative_reduction", augmentative_reduction_rules},
+        {"diminutive_reduction", diminutive_reduction_rules},
+        {"adverb_reduction", adverb_reduction_rules},
+        {"verb_conjugation_reduction", verb_conjugation_reduction_rules},
+        {"noun_reduction", noun_reduction_rules},
+        {"remove_vowel", remove_vowel_rules}};
+}
+
+RSPL::~RSPL() = default;  // Members clean up after themselves.
+
+bool RSPL::endsWithA(const std::string& word) {
+    // An empty word has no last character: calling back() on it would be
+    // undefined behavior, and split() does produce empty tokens (for
+    // instance from consecutive spaces), so report "no match" instead.
+    return !word.empty() && word.back() == 'a';
+}
+
+bool RSPL::endsWithS(const std::string& word) {
+    // An empty word has no last character: calling back() on it would be
+    // undefined behavior, and split() does produce empty tokens (for
+    // instance from consecutive spaces), so report "no match" instead.
+    return !word.empty() && word.back() == 's';
+}
+
+std::vector<std::string> RSPL::split(std::string& s) {
+    // Tokenize on single spaces. As a side effect, `s` is trimmed down to
+    // its final token (everything after the last space).
+    std::vector<std::string> parts;
+    std::string::size_type begin = 0;
+    for (auto cut = s.find(' '); cut != std::string::npos;
+         cut = s.find(' ', begin)) {
+        parts.push_back(s.substr(begin, cut - begin));
+        begin = cut + 1;
+    }
+    s.erase(0, begin);
+    parts.push_back(s);
+    return parts;
+}
+
+// Returns a copy of `input` in which every character found in accentMap_ is
+// replaced by its mapped value. `input` is treated as UTF-8.
+std::string RSPL::removeAccents(const std::string& input) {
+    // Decode into wide characters so that each accented letter becomes a
+    // single element that can be looked up in accentMap_.
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> utf8;
+    std::wstring wide = utf8.from_bytes(input);
+
+    // Swap mapped characters in place; all others are left untouched.
+    for (wchar_t& unit : wide) {
+        const auto hit = accentMap_.find(unit);
+        if (hit != accentMap_.end())
+            unit = hit->second;
+    }
+
+    // Encode the result back to UTF-8.
+    return utf8.to_bytes(wide);
+}
+
+void RSPL::shrinkString(std::string* input) {
+    if (!input)
+        return;  // Nothing to normalize.
+    // Lowercase with the root locale so the result does not depend on the
+    // process default locale (Turkish, for example, lowercases 'I' to 'ı').
+    icu::UnicodeString text = icu::UnicodeString::fromUTF8(*input);
+    text.toLower(icu::Locale::getRoot());
+    input->clear();
+    text.toUTF8String(*input);  // Appends to its argument, hence clear().
+}
+
+bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
+    // Rules are tried in order; the first applicable one rewrites `word`.
+    for (const StepRule& candidate : rules) {
+        const std::string& suffix = candidate.suffixToRemove;
+        if (suffix.size() > word.size())
+            continue;  // Longer than the word itself: cannot be its ending.
+
+        // Length, in bytes, of what is left once the suffix is cut off.
+        const size_t stem_size = word.size() - suffix.size();
+        if (word.compare(stem_size, suffix.size(), suffix) != 0)
+            continue;  // The word does not end with this suffix.
+        if (stem_size < static_cast<size_t>(candidate.minimumStemSize))
+            continue;  // The remaining stem would be too short.
+        word.replace(stem_size, suffix.size(), candidate.replacement);
+        return true;
+    }
+    return false;  // No rule could be applied.
+}
+
+void RSPL::run(std::string* sentence) {
+    // Stems `*sentence` in place: lowercase it, split it on spaces, run each
+    // word through the rule groups, strip accents and join the words again.
+    // The lowercased sentence and every stemmed word are echoed to std::cout.
+    if (!sentence)
+        return;  // Same null tolerance as shrinkString(); avoids UB below.
+
+    this->shrinkString(sentence);
+    std::cout << *sentence << std::endl;
+    std::vector<std::string> words = this->split(*sentence);
+
+    for (std::string& word : words) {
+        // PLURAL REDUCTION
+        if (endsWithS(word))
+            applyRules(word, ruleMap_["plural_reduction"]);
+
+        // FEMININE REDUCTION
+        if (endsWithA(word))
+            applyRules(word, ruleMap_["feminine_reduction"]);
+
+        // AUGMENTATIVE REDUCTION
+        applyRules(word, ruleMap_["augmentative_reduction"]);
+
+        // DIMINUTIVE REDUCTION
+        applyRules(word, ruleMap_["diminutive_reduction"]);
+
+        // ADVERB REDUCTION
+        applyRules(word, ruleMap_["adverb_reduction"]);
+
+        // NOUN REDUCTION: its outcome alone decides whether the fallback
+        // steps below run; results of the earlier steps are not consulted.
+        if (!applyRules(word, ruleMap_["noun_reduction"])) {
+            // NOTE(review): the constructor registers the verb rules as
+            // "verb_conjugation_reduction". "verb_reduction" is an empty
+            // group created on demand by operator[], so this step never
+            // matches and the vowel removal always follows. The unit test
+            // relies on that outcome, so the key is left as is.
+            if (!applyRules(word, ruleMap_["verb_reduction"]))
+                applyRules(word, ruleMap_["remove_vowel"]);
+        }
+
+        word = removeAccents(word);
+        std::cout << word << std::endl;
+    }
+
+    // Join the words back together, separated by single spaces.
+    *sentence = std::accumulate(words.begin(), words.end(), std::string(""),
+                                [](const std::string& a, const std::string& b) {
+                                    return a + (a.empty() ? "" : " ") + b;
+                                });
+}
+
+}  // namespace stemmer
diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt
@@ -15,6 +15,7 @@ add_executable(
     subtraction-test.cpp
     page_rank-test.cpp
     inverted_index-test.cpp
+    stemmer-test.cpp
 )
 
 target_include_directories(LibUnitTests PRIVATE ${CMAKE_SOURCE_DIR}/lib/include/)

diff --git a/tests/unit-tests/stemmer-test.cpp b/tests/unit-tests/stemmer-test.cpp
@@ -0,0 +1,13 @@
+#include <gtest/gtest.h>
+
+#include "preprocessing/stemmer.h"
+
+TEST(StemmerTest, TestRules) {
+    // Build the stemmer under test; run() rewrites the sentence in place.
+    stemmer::RSPL stemmer;
+    std::string sentence = "O coração bate rapidamente na cidade";
+    std::string expected_sentence = "o corac bat rapid na cidad";
+
+    stemmer.run(&sentence);
+    EXPECT_EQ(sentence, expected_sentence);
+}