Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

39 feat/preprocessing rspl stemmer algorithm #41

Merged
merged 3 commits into from
Jan 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added docs/Captura de tela de 2025-01-14 21-49-10.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ add_library(
src/hello.cpp
src/page_rank.cpp
src/inverted_index.cpp
src/preprocessing/stemmer.cpp
)

target_include_directories(search_engine PUBLIC include)
Expand Down
114 changes: 114 additions & 0 deletions lib/include/preprocessing/stemmer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#ifndef STEMMER_H
#define STEMMER_H

#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <unicode/ustream.h>

#include <iostream>
#include <map>
#include <numeric>
#include <unordered_map>
#include <vector>

namespace stemmer {

/**
* @brief One suffix-rewrite rule used by the stemmer.
*
* A rule matches when a word ends with `suffixToRemove` and the part that
* remains after cutting the suffix is at least `minimumStemSize` long; the
* suffix is then replaced by `replacement`. Lengths are compared against
* `std::string::size()`, so they are counted in bytes of the stored text.
*/
struct StepRule {
std::string suffixToRemove; ///< Suffix the word must end with for the rule to match.
int minimumStemSize; ///< Smallest stem length (word size minus suffix size) that allows the rule to fire.
std::string replacement; ///< Text appended to the stem in place of the suffix (may be empty).
};

using RuleMap = std::map<std::string, std::vector<StepRule>>;

/**
* @class RSPL
* @brief Rule-based suffix-stripping stemmer for Portuguese text.
*
* The constructor fills `ruleMap_` with the suffix tables and `run()` applies
* them word by word, rewriting the caller's string in place.
*
* NOTE(review): the algorithm this appears to follow is usually abbreviated
* "RSLP" in the literature; the class name is kept as is - confirm naming.
*/
class RSPL {
public:
/**
* @brief Default constructor.
* Populates `ruleMap_` with the predefined rule tables.
*/
RSPL();

/**
* @brief Destructor.
*/
~RSPL();

/**
* @brief Stems every word of a sentence in place.
*
* The text is lower-cased and split on single spaces; each word goes through
* the rule steps and accent removal, and the words are joined back with
* single spaces.
*
* @param sentence UTF-8 sentence (or single word) to process. It is
* overwritten with the stemmed result; callers should pass a valid pointer.
*/
void run(std::string* sentence);

private:
RuleMap ruleMap_; ///< Rule tables keyed by step name (e.g. "plural_reduction").

/**
* @brief Lookup table from accented Latin letters (lower and upper case) to
* their unaccented base letters; consulted by removeAccents().
*/
const std::unordered_map<wchar_t, wchar_t> accentMap_ = {
{L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'},
{L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'},
{L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'},
{L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'},
{L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'},
{L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'},
{L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'},
{L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'},
{L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'},
{L'Ç', L'C'}};

/**
* @brief Applies the first matching rule of a table to a word.
*
* Rules are tried in table order; the first one whose suffix terminates the
* word and leaves a long-enough stem rewrites the word, and the search stops.
*
* @param word The word to be processed; modified in place when a rule fires.
* @param rules The transformation rules to try, in priority order.
* @return `true` if a rule was applied, otherwise `false`.
*/
bool applyRules(std::string& word, const std::vector<StepRule>& rules);

/**
* @brief Checks whether the last character of a word is 's'.
* @param word The word to be checked.
* @return `true` if the word ends with 's', otherwise `false`.
*/
bool endsWithS(const std::string& word);

/**
* @brief Checks whether the last character of a word is 'a'.
* @param word The word to be checked.
* @return `true` if the word ends with 'a', otherwise `false`.
*/
bool endsWithA(const std::string& word);

/**
* @brief Splits a string on single space characters.
*
* Consecutive spaces produce empty tokens.
*
* @param s The string to be split. The implementation consumes it: after the
* call it holds only the last token.
* @return A vector containing the parts of the string, in order.
*/
std::vector<std::string> split(std::string& s);

/**
* @brief Replaces accented letters by their base letters.
* @param input The input string, treated as UTF-8.
* @return A copy of the input in which every character listed in
* `accentMap_` has been replaced; other characters are kept.
*/
std::string removeAccents(const std::string& input);

/**
* @brief Converts a string to lower case in place.
*
* Despite the name, nothing is truncated: the text is only case-folded to
* lower case.
*
* @param input Pointer to the UTF-8 string to modify; a null pointer is
* ignored.
*/
void shrinkString(std::string* input);
};

} // namespace stemmer

#endif // STEMMER_H
246 changes: 246 additions & 0 deletions lib/src/preprocessing/stemmer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
#include "preprocessing/stemmer.h"
#include <codecvt>
#include <locale>

namespace stemmer {
/**
 * Builds every suffix table and registers it in `ruleMap_` under its step
 * name. Inside a table the order of the rules matters: applyRules() fires the
 * first rule that matches, so longer or more specific suffixes come first.
 */
RSPL::RSPL() {
  // Plural -> singular.
  const std::vector<StepRule> pluralTable = {
      {"ns", 1, "m"},
      {"ões", 3, "ão"},
      {"ães", 1, "ão"},
      {"ais", 1, "al"},
      {"éis", 2, "el"},
      {"eis", 2, "el"},
      {"óis", 2, "ol"},
      {"is", 2, "il"},
      {"is", 1, "l"},
      {"res", 1, "r"},
      {"s", 2, ""}};

  // Feminine -> masculine.
  const std::vector<StepRule> feminineTable = {
      {"inha", 4, "inho"},
      {"esa", 3, "es"},
      {"osa", 3, "oso"},
      {"na", 2, "no"},
      {"da", 2, "do"},
      {"va", 2, "vo"},
      {"ia", 2, "io"}};

  // Augmentative forms.
  const std::vector<StepRule> augmentativeTable = {
      {"zão", 3, "z"},
      {"são", 3, "s"},
      {"ão", 2, "o"},
      {"ona", 3, "on"},
      {"ões", 3, "ão"},
      {"íssimos", 7, "o"}};

  // Diminutive forms.
  const std::vector<StepRule> diminutiveTable = {
      {"zinho", 5, ""},
      {"zinha", 6, ""},
      {"zinhos", 6, ""},
      {"zinhas", 7, ""},
      {"inho", 4, ""},
      {"inha", 5, ""}};

  // Verb conjugations.
  // NOTE(review): run() asks for the key "verb_reduction", while this table
  // is registered below as "verb_conjugation_reduction" - confirm which one
  // is intended.
  const std::vector<StepRule> verbTable = {
      // Infinitive
      {"ar", 2, ""},
      {"er", 2, ""},
      {"ir", 2, ""},

      // Gerund
      {"ando", 4, ""},
      {"endo", 4, ""},
      {"indo", 4, ""},

      // Past participle
      {"ado", 3, ""},
      {"ido", 3, ""},

      // Future tense
      {"arei", 4, "ar"},
      {"erei", 4, "er"},
      {"irei", 4, "ir"},
      {"arás", 4, "ar"},
      {"erás", 4, "er"},
      {"irás", 4, "ir"},
      {"ará", 3, "ar"},
      {"erá", 3, "er"},
      {"irá", 3, "ir"},
      {"aremos", 6, "ar"},
      {"eremos", 6, "er"},
      {"iremos", 6, "ir"},
      {"areis", 5, "ar"},
      {"ereis", 5, "er"},
      {"ireis", 5, "ir"},
      {"arão", 4, "ar"},
      {"erão", 4, "er"},
      {"irão", 4, "ir"},

      // Imperfect tense
      {"ava", 3, "ar"},
      {"ia", 2, "er"},
      {"ia", 2, "ir"},
      {"ávamos", 6, "ar"},
      {"íamos", 5, "er"},
      {"íamos", 5, "ir"},
      {"áveis", 5, "ar"},
      {"íeis", 5, "er"},
      {"íeis", 5, "ir"},

      // Present tense
      {"o", 1, ""},
      {"as", 2, "ar"},
      {"es", 2, "er"},
      {"es", 2, "ir"},
      {"a", 1, "ar"},
      {"e", 1, "er"},
      {"e", 1, "ir"},
      {"amos", 4, "ar"},
      {"emos", 4, "er"},
      {"imos", 4, "ir"},
      {"ais", 3, "ar"},
      {"eis", 3, "er"},
      {"is", 2, "ir"},
      {"am", 2, "ar"},
      {"em", 2, "er"},
      {"em", 2, "ir"}};

  // Noun suffixes.
  const std::vector<StepRule> nounTable = {
      {"ezas", 4, "ez"},
      {"ezes", 4, "ez"},
      {"eza", 3, "ez"},
      {"ez", 2, ""},
      {"mentos", 6, "ment"},
      {"mento", 5, "ment"},
      {"idades", 7, "idade"},
      {"idade", 6, ""},
      {"ismos", 5, "ismo"},
      {"ista", 4, ""},
      {"istas", 5, ""},
      {"ções", 4, "ção"},
      {"ção", 3, ""}};

  // Adverb suffix.
  const std::vector<StepRule> adverbTable = {{"mente", 4, ""}};

  // Trailing vowel removal.
  const std::vector<StepRule> vowelTable = {
      {"a", 3, ""},
      {"e", 3, ""},
      {"o", 3, ""}};

  // Register each table under the step name used for lookups.
  ruleMap_["plural_reduction"] = pluralTable;
  ruleMap_["feminine_reduction"] = feminineTable;
  ruleMap_["augmentative_reduction"] = augmentativeTable;
  ruleMap_["diminutive_reduction"] = diminutiveTable;
  ruleMap_["adverb_reduction"] = adverbTable;
  ruleMap_["verb_conjugation_reduction"] = verbTable;
  ruleMap_["noun_reduction"] = nounTable;
  ruleMap_["remove_vowel"] = vowelTable;
}

// Nothing to release by hand: all members clean up after themselves.
RSPL::~RSPL() = default;

/**
 * @brief Tells whether the last character of `word` is 'a'.
 *
 * An empty word has no last character and yields `false`; calling
 * `std::string::back()` on it would be undefined behaviour.
 *
 * @param word The word to inspect (may be empty).
 * @return `true` if `word` is non-empty and ends with 'a'.
 */
bool RSPL::endsWithA(const std::string& word) {
  return !word.empty() && word.back() == 'a';
}

/**
 * @brief Tells whether the last character of `word` is 's'.
 *
 * An empty word has no last character and yields `false`; calling
 * `std::string::back()` on it would be undefined behaviour.
 *
 * @param word The word to inspect (may be empty).
 * @return `true` if `word` is non-empty and ends with 's'.
 */
bool RSPL::endsWithS(const std::string& word) {
  return !word.empty() && word.back() == 's';
}

/**
 * @brief Splits `s` on single space characters.
 *
 * The string is scanned once from left to right instead of erasing the
 * consumed prefix after every token, which keeps the work linear in the
 * length of the input. Consecutive spaces produce empty tokens.
 *
 * @param s The string to split. For compatibility with existing callers it
 *          is still consumed: on return it holds only the last token.
 * @return The tokens in their original order; always at least one element.
 */
std::vector<std::string> RSPL::split(std::string& s) {
  std::vector<std::string> tokens;

  std::string::size_type tokenStart = 0;
  std::string::size_type space = s.find(' ', tokenStart);
  while (space != std::string::npos) {
    tokens.emplace_back(s, tokenStart, space - tokenStart);
    tokenStart = space + 1;
    space = s.find(' ', tokenStart);
  }

  // Drop everything already emitted so `s` ends up as the trailing token.
  s.erase(0, tokenStart);
  tokens.push_back(s);

  return tokens;
}

/**
 * @brief Returns a copy of `input` with accented letters replaced by their
 * base letters.
 *
 * The UTF-8 input is decoded to wide characters so that each character can
 * be looked up in `accentMap_`; characters without an entry are copied
 * unchanged. The result is encoded back to UTF-8.
 *
 * @param input UTF-8 text to process.
 * @return The text with every mapped character replaced.
 */
std::string RSPL::removeAccents(const std::string& input) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8<wchar_t>>;

  const std::wstring decoded = Utf8Converter().from_bytes(input);

  std::wstring stripped;
  stripped.reserve(decoded.size());  // at most one output char per input char

  for (const wchar_t current : decoded) {
    const auto match = accentMap_.find(current);
    if (match == accentMap_.end()) {
      stripped.push_back(current);  // not an accented letter: keep as is
    } else {
      stripped.push_back(match->second);  // swap for the base letter
    }
  }

  return Utf8Converter().to_bytes(stripped);
}

void RSPL::shrinkString(std::string* input) {
if (!input)
return; // Verifica se o ponteiro é válido

icu::UnicodeString ustr(input->c_str(), "UTF-8");
ustr.toLower();
std::string result;
ustr.toUTF8String(result);
*input = result;
}

bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
for (const auto& rule : rules) {
// Verificar se a palavra termina com o sufixo especificado
if (word.size() >= rule.suffixToRemove.size() &&
word.compare(word.size() - rule.suffixToRemove.size(),
rule.suffixToRemove.size(),
rule.suffixToRemove) == 0) {
// Calcular o tamanho do radical após a remoção do sufixo
size_t stem_size = word.size() - rule.suffixToRemove.size();
if (stem_size >= static_cast<size_t>(rule.minimumStemSize)) {
// Aplicar a regra: remover o sufixo e adicionar o replacement
word = word.substr(0, stem_size) + rule.replacement;
return true; // Regra aplicada
}
}
}
return false; // Nenhuma regra foi aplicada
}

void RSPL::run(std::string* sentence) {
// Separar a sentença em palavras
this->shrinkString(sentence);
std::cout << *sentence << std::endl;
std::vector<std::string> words = this->split(*sentence);

for (std::string& word : words) {
// PLURAL REDUCTION
bool rule_applied = false;

if (endsWithS(word)) {
rule_applied = applyRules(word, ruleMap_["plural_reduction"]);
}

// FEMININE REDUCTION
if (endsWithA(word)) {
rule_applied = applyRules(word, ruleMap_["feminine_reduction"]);
}

// AUGMENTATIVE REDUCTION
rule_applied = applyRules(word, ruleMap_["augmentative_reduction"]);

rule_applied = applyRules(word, ruleMap_["diminutive_reduction"]);

// ADVERB REDUCTION
rule_applied = applyRules(word, ruleMap_["adverb_reduction"]);

// NOUN REDUCTION
rule_applied = applyRules(word, ruleMap_["noun_reduction"]);

if (!rule_applied) {
rule_applied = applyRules(word, ruleMap_["verb_reduction"]);
if (!rule_applied)
// remove vogal
rule_applied = applyRules(word, ruleMap_["remove_vowel"]);
}

// Função para remover acentos
word = removeAccents(word);
std::cout << word << std::endl;
}

// for (auto& word : words)
// std::cout << word << std::endl;

// Concatena o vetor em uma string
*sentence = std::accumulate(words.begin(), words.end(), std::string(""),
[](const std::string& a, const std::string& b) {
return a + (a.empty() ? "" : " ") + b;
});
}

} // namespace stemmer
1 change: 1 addition & 0 deletions tests/unit-tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ add_executable(
subtraction-test.cpp
page_rank-test.cpp
inverted_index-test.cpp
stemmer-test.cpp
)

target_include_directories(LibUnitTests PRIVATE ${CMAKE_SOURCE_DIR}/lib/include/)
Expand Down
13 changes: 13 additions & 0 deletions tests/unit-tests/stemmer-test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include <gtest/gtest.h>

#include "preprocessing/stemmer.h"

// Runs the stemmer on a mixed-case, accented sentence and checks the
// in-place result: lower-cased, stemmed, accents removed.
TEST(StemmerTest, TestRules) {
  const std::string expected = "o corac bat rapid na cidad";
  std::string text = "O coração bate rapidamente na cidade";

  stemmer::RSPL rspl;
  rspl.run(&text);

  EXPECT_EQ(text, expected);
}
Loading