Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(string_operations.h/cpp): add new util operations #48

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ add_library(
src/inverted_index.cpp
src/preprocessing/stemmer.cpp
src/exceptions/invalid_pointer_exception.cpp
src/utils/string_operations.cpp
)

target_include_directories(search_engine PUBLIC include)
Expand Down
30 changes: 2 additions & 28 deletions lib/include/inverted_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <string>
#include <vector>

#include "utils/string_operations.h"

namespace inverted_index {
/**
* @typedef str
Expand All @@ -22,12 +24,6 @@ typedef std::string str;
*/
typedef std::list<str> list_str;

/**
 * @def DELIMITER
 * @brief Delimiter used to split strings: a single space character,
 * expanded as a string literal wherever the macro is used.
 */
#define DELIMITER " "

/**
* @struct docs
* @brief Structure that stores information about a document.
Expand Down Expand Up @@ -107,23 +103,6 @@ typedef std::set<docs> set_docs;
*/
typedef std::vector<str> vector_str;

/**
 * @brief Converts a character to lowercase.
 *
 * The function is defined in a header, so it is marked `inline`; without it
 * every translation unit including the header would emit its own external
 * definition and the program would fail to link.
 *
 * @param c Character to convert. It is taken as `unsigned char` so the value
 *          handed to std::tolower is always representable as required.
 * @return The result of std::tolower for c, narrowed back to `char`.
 */
inline char to_lowercase(unsigned char c) {
  // std::tolower returns int; make the narrowing conversion explicit.
  return static_cast<char>(std::tolower(c));
}

/**
 * @brief Splits a string into tokens at each occurrence of a delimiter.
 * @param s String to split. It is taken by non-const reference and is
 *          consumed by the definition: every token found is erased from the
 *          front, so on return s holds only the last token.
 * @param delimiter Delimiter to split the string.
 * @return Vector of the tokens in order of appearance; the remainder after
 *         the last delimiter is always appended as the final element.
 */
vector_str split(str& s, const str& delimiter);

/**
* @brief Adds a new document to the document map.
* @param mp Map of words to lists of documents.
Expand All @@ -149,11 +128,6 @@ list_docs find_doc(map_str_docs& mp, str& word);
*/
list_docs find_answer(map_str_docs& mp, str& input);

/**
 * @brief Converts a UTF-8 string to lowercase in place.
 *
 * Despite the name, the definition does not trim or collapse whitespace; it
 * decodes the text as UTF-8 with ICU, lowercases it, and writes the UTF-8
 * result back.
 *
 * @param input Pointer to the string to process. A null pointer is ignored.
 */
void shrink_string(std::string* input);
} // namespace inverted_index

#endif // INVERTED_INDEX
21 changes: 1 addition & 20 deletions lib/include/preprocessing/stemmer.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <vector>

#include "exceptions/invalid_pointer_exception.h"
#include "utils/string_operations.h"

namespace stemmer {

Expand Down Expand Up @@ -89,26 +90,6 @@ class RSPL {
* @return `true` if the word ends with 'a', otherwise `false`.
*/
bool endsWithA(const std::string& word);

/**
 * @brief Splits a string into parts at each single space character.
 * @param s The string to be split. It is taken by non-const reference and
 *          is consumed by the definition: on return it holds only the last
 *          part.
 * @return A vector containing the parts of the string in order; the text
 *         after the last space is always the final element.
 */
std::vector<std::string> split(std::string& s);

/**
 * @brief Removes accents from a string.
 *
 * The definition decodes the input from UTF-8 to wide characters, replaces
 * every character that has an entry in accentMap_ with its mapped value, and
 * encodes the result back to UTF-8. Characters without an entry are kept.
 *
 * @param input The input string (treated as UTF-8).
 * @return The string without accents.
 */
std::string removeAccents(const std::string& input);

/**
 * @brief Converts a UTF-8 string to lowercase in place.
 *
 * Despite the name, the definition does not shorten the string; it decodes
 * the text as UTF-8 with ICU, lowercases it, and writes the result back.
 *
 * @param input Pointer to the input string. For a null pointer the
 *              definition throws exceptions::invalid_pointer_exception and
 *              catches std::exception in the same function, printing what()
 *              to std::cerr (assumes invalid_pointer_exception derives from
 *              std::exception - confirm).
 */
void shrinkString(std::string* input);
};

} // namespace stemmer
Expand Down
56 changes: 56 additions & 0 deletions lib/include/utils/string_operations.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#pragma once

#include <codecvt>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>

#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <unicode/ustream.h>

namespace utils {

/**
 * @brief Delimiter used to split strings (a single space).
 *
 * Declared `inline const` instead of `constexpr`: std::string is not a
 * literal type before C++20, and even in C++20 a namespace-scope
 * `constexpr std::string` is not accepted by every standard library.
 * `inline` lets this header-defined object be shared by all translation
 * units that include the header instead of each getting a private copy.
 */
inline const std::string DELIMITER = " ";

/**
 * @brief Mapping to normalize accented characters to their ASCII
 * equivalents.
 *
 * Keys are accented lower- and upper-case Latin letters as wide characters;
 * each value is the unaccented letter of the same case.
 *
 * Declared `inline` so that the table, which lives in a header, is a single
 * object shared by all translation units. A plain namespace-scope `const`
 * object has internal linkage, so every including translation unit would
 * construct its own copy at start-up.
 */
inline const std::unordered_map<wchar_t, wchar_t> accentMap_ = {
    {L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'},
    {L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'},
    {L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'},
    {L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'},
    {L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'},
    {L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'},
    {L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'},
    {L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'},
    {L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'},
    {L'Ç', L'C'}};
// String manipulation functions

/**
 * @brief Converts a UTF-8 string to lowercase in place.
 *
 * Despite the name, the definition does not trim or collapse whitespace; it
 * decodes the text as UTF-8 with ICU, lowercases it, and writes the UTF-8
 * result back into the same string.
 *
 * @param input Pointer to the string to process. A null pointer is ignored.
 */
void shrink_string(std::string* input);

/**
 * @brief Splits a string based on a delimiter.
 * @param s String to split. It is taken by non-const reference and is
 *          consumed by the definition: every token found is erased from the
 *          front, so on return s holds only the last token.
 * @param delimiter Delimiter to split the string.
 * @return Vector of strings resulting from the split, in order of
 *         appearance; the remainder after the last delimiter is always
 *         appended as the final element.
 */
std::vector<std::string> split(std::string& s, const std::string& delimiter);

/**
 * @brief Removes accents from a string.
 *
 * The definition decodes the input from UTF-8 to wide characters, replaces
 * every character that has an entry in accentMap_ with its mapped value, and
 * encodes the result back to UTF-8. Characters without an entry are kept.
 *
 * @param input The input string (treated as UTF-8).
 * @return The string without accents.
 */
std::string removeAccents(const std::string& input);

} // namespace utils
33 changes: 4 additions & 29 deletions lib/src/inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,11 @@

using namespace inverted_index;

/**
 * Splits s into tokens at every occurrence of delimiter.
 *
 * @param s         String to split. As before, it is modified: on return it
 *                  holds only the last token (unchanged if the delimiter is
 *                  empty).
 * @param delimiter Separator to split on.
 * @return The tokens in order of appearance; the text after the last
 *         delimiter is always the final element.
 */
vector_str inverted_index::split(str& s, const str& delimiter) {
  vector_str tokens;

  // An empty delimiter matches at position 0 without consuming anything, so
  // the search loop would never terminate. Return the whole string as one
  // token instead.
  if (delimiter.empty()) {
    tokens.push_back(s);
    return tokens;
  }

  // Scan forward with an offset instead of erasing the front of s after
  // every token, which made the function quadratic in the input length.
  std::string::size_type start = 0;
  std::string::size_type hit = 0;
  while ((hit = s.find(delimiter, start)) != std::string::npos) {
    tokens.push_back(s.substr(start, hit - start));
    start = hit + delimiter.length();
  }

  // Keep the documented side effect: s ends up holding only the last token.
  s.erase(0, start);
  tokens.push_back(s);

  return tokens;
}

void inverted_index::shrink_string(std::string* input) {
if (!input)
return; // Verifica se o ponteiro é válido

icu::UnicodeString ustr(input->c_str(), "UTF-8");
ustr.toLower();
std::string result;
ustr.toUTF8String(result);
*input = result;
}

map_str_docs inverted_index::add_doc(map_str_docs& mp,
const str& doc_name,
str& text) {
shrink_string(&text);
auto words = inverted_index::split(text, DELIMITER);
utils::shrink_string(&text);
auto words = utils::split(text, utils::DELIMITER);

for (const auto& word : words) {
docs target = {doc_name, 1};
Expand All @@ -61,8 +36,8 @@ list_docs inverted_index::find_answer(map_str_docs& mp, str& input) {
list_docs result;
set_docs unique_docs;

shrink_string(&input);
auto words = inverted_index::split(input, DELIMITER);
utils::shrink_string(&input);
auto words = utils::split(input, utils::DELIMITER);

for (auto& word : words) {
list_docs docs = inverted_index::find_doc(
Expand Down
55 changes: 3 additions & 52 deletions lib/src/preprocessing/stemmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,55 +164,6 @@ bool RSPL::endsWithS(const std::string& word) {
return false;
}

/**
 * Splits s into parts at every single space character.
 *
 * @param s String to split. As before, it is modified: on return it holds
 *          only the last part.
 * @return The parts in order of appearance; the text after the last space
 *         is always the final element.
 */
std::vector<std::string> RSPL::split(std::string& s) {
  std::vector<std::string> tokens;
  const std::string delimiter = " ";

  // Scan forward with an offset instead of erasing the front of s after
  // every token, which made the function quadratic in the input length.
  std::string::size_type start = 0;
  std::string::size_type hit = 0;
  while ((hit = s.find(delimiter, start)) != std::string::npos) {
    tokens.push_back(s.substr(start, hit - start));
    start = hit + delimiter.length();
  }

  // Keep the existing side effect: s ends up holding only the last part.
  s.erase(0, start);
  tokens.push_back(s);

  return tokens;
}

/**
 * Returns a copy of input in which every character that has an entry in
 * accentMap_ is replaced by its mapped value.
 *
 * The text is decoded from UTF-8 to wide characters, mapped character by
 * character, and encoded back to UTF-8.
 */
std::string RSPL::removeAccents(const std::string& input) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8<wchar_t>>;

  const std::wstring wide_input = Utf8Converter().from_bytes(input);

  std::wstring wide_output;
  wide_output.reserve(wide_input.size());  // Avoid repeated reallocations.

  for (const wchar_t symbol : wide_input) {
    const auto entry = accentMap_.find(symbol);
    // Mapped characters are replaced; everything else is copied through.
    wide_output.push_back(entry != accentMap_.end() ? entry->second : symbol);
  }

  // Convert back to a UTF-8 std::string.
  return Utf8Converter().to_bytes(wide_output);
}

void RSPL::shrinkString(std::string* input) {
try {
if (!input)
throw exceptions::invalid_pointer_exception();

icu::UnicodeString ustr(input->c_str(), "UTF-8");
ustr.toLower();
std::string result;
ustr.toUTF8String(result);
*input = result;
} catch (const std::exception& e) {
std::cerr << e.what() << '\n';
}
}

bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
for (const auto& rule : rules) {
// Check whether the word ends with the specified suffix
Expand All @@ -234,9 +185,9 @@ bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {

void RSPL::run(std::string* sentence) {
// Split the sentence into words
this->shrinkString(sentence);
utils::shrink_string(sentence);
// std::cout << *sentence << std::endl;
std::vector<std::string> words = this->split(*sentence);
std::vector<std::string> words = utils::split(*sentence, utils::DELIMITER);

for (std::string& word : words) {
// PLURAL REDUCTION
Expand Down Expand Up @@ -271,7 +222,7 @@ void RSPL::run(std::string* sentence) {
}

// Remove accents from the word
word = removeAccents(word);
word = utils::removeAccents(word);
// std::cout << word << std::endl;
}

Expand Down
49 changes: 49 additions & 0 deletions lib/src/utils/string_operations.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "utils/string_operations.h"

namespace utils {

void shrink_string(std::string* input) {
if (!input)
return; // Verifica se o ponteiro é válido

icu::UnicodeString ustr(input->c_str(), "UTF-8");
ustr.toLower();
std::string result;
ustr.toUTF8String(result);
*input = result;
}

/**
 * Splits s into tokens at every occurrence of delimiter.
 *
 * @param s         String to split. As before, it is modified: on return it
 *                  holds only the last token (unchanged if the delimiter is
 *                  empty).
 * @param delimiter Separator to split on.
 * @return The tokens in order of appearance; the text after the last
 *         delimiter is always the final element, so the result is never
 *         empty. Adjacent delimiters produce empty tokens.
 */
std::vector<std::string> split(std::string& s, const std::string& delimiter) {
  std::vector<std::string> tokens;

  // An empty delimiter matches at position 0 without consuming anything, so
  // the search loop would never terminate. Return the whole string as one
  // token instead.
  if (delimiter.empty()) {
    tokens.push_back(s);
    return tokens;
  }

  // Scan forward with an offset instead of erasing the front of s after
  // every token, which made the function quadratic in the input length.
  std::string::size_type start = 0;
  std::string::size_type hit = 0;
  while ((hit = s.find(delimiter, start)) != std::string::npos) {
    tokens.push_back(s.substr(start, hit - start));
    start = hit + delimiter.length();
  }

  // Keep the existing side effect: s ends up holding only the last token.
  s.erase(0, start);
  tokens.push_back(s);

  return tokens;
}

/**
 * Returns a copy of input in which every character that has an entry in
 * accentMap_ is replaced by its mapped value.
 *
 * The text is decoded from UTF-8 to wide characters, mapped character by
 * character, and encoded back to UTF-8.
 */
std::string removeAccents(const std::string& input) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8<wchar_t>>;

  const std::wstring wide_input = Utf8Converter().from_bytes(input);

  std::wstring wide_output;
  wide_output.reserve(wide_input.size());  // Avoid repeated reallocations.

  for (const wchar_t symbol : wide_input) {
    const auto entry = accentMap_.find(symbol);
    // Mapped characters are replaced; everything else is copied through.
    wide_output.push_back(entry != accentMap_.end() ? entry->second : symbol);
  }

  // Convert back to a UTF-8 std::string.
  return Utf8Converter().to_bytes(wide_output);
}

} // namespace utils
Loading