|
1 | 1 | #include "bm25.h"
|
2 | 2 |
|
3 |
| -// [[Rcpp::export]] |
4 |
| -Rcpp::NumericVector rcpp_bm25 (const Rcpp::DataFrame &idfs, const Rcpp::List &tokensList, Rcpp::DataFrame &these_tokens, const double ntoks_avg) { |
5 |
| - |
6 |
| - // Fixed parameters used in the BM25 function. See wikipedia reference for |
7 |
| - // these values. |
8 |
| - const double k = 1.2; |
9 |
| - const double b = 0.75; |
| 3 | +void make_idf_map ( |
| 4 | + const Rcpp::DataFrame &idfs, |
| 5 | + std::unordered_map <std::string, double> &idf_map) { |
10 | 6 |
|
11 |
| - // Set up primary 'idf_map' to map all tokens to their IDFs over whole corpus: |
12 |
| - std::unordered_map <std::string, double> idf_map; |
13 | 7 | const Rcpp::CharacterVector idf_tokens = idfs ["token"];
|
14 | 8 | const Rcpp::NumericVector idf_idf = idfs ["idf"];
|
15 | 9 | for (int i = 0; i < idfs.nrow (); i++) {
|
16 | 10 | std::string this_tok = static_cast<std::string> (idf_tokens [i]);
|
17 | 11 | idf_map.emplace (this_tok, idf_idf [i]);
|
18 | 12 | }
|
| 13 | +} |
19 | 14 |
|
20 |
| - const int ndocs = tokensList.size(); |
| 15 | +void make_these_tokens_map ( |
| 16 | + const Rcpp::DataFrame &these_tokens, |
| 17 | + std::unordered_map <std::string, int> &these_tokens_map) { |
21 | 18 |
|
22 |
| - // Then make a map of the input tokens and counts: |
23 |
| - std::unordered_map <std::string, int> these_tokens_map; |
24 | 19 | const Rcpp::CharacterVector these_tokens_str = these_tokens ["token"];
|
25 | 20 | const Rcpp::IntegerVector these_tokens_n = these_tokens ["np"];
|
26 | 21 |
|
27 | 22 | for (int i = 0; i < these_tokens.nrow (); i++) {
|
28 | 23 | const std::string this_string = static_cast <std::string> (these_tokens_str [i]);
|
29 | 24 | these_tokens_map.emplace (this_string, these_tokens_n [i]);
|
30 | 25 | }
|
| 26 | +} |
31 | 27 |
|
| 28 | +// [[Rcpp::export]] |
| 29 | +Rcpp::NumericVector rcpp_bm25 (const Rcpp::DataFrame &idfs, const Rcpp::List &tokensList, Rcpp::DataFrame &these_tokens, const double ntoks_avg) { |
| 30 | + |
| 31 | + // Fixed parameters used in the BM25 function. See wikipedia reference for |
| 32 | + // these values. |
| 33 | + const double k = 1.2; |
| 34 | + const double b = 0.75; |
| 35 | + |
| 36 | + // Set up primary 'idf_map' to map all tokens to their IDFs over whole corpus: |
| 37 | + std::unordered_map <std::string, double> idf_map; |
| 38 | + make_idf_map (idfs, idf_map); |
| 39 | + |
| 40 | + std::unordered_map <std::string, int> these_tokens_map; |
| 41 | + make_these_tokens_map (these_tokens, these_tokens_map); |
| 42 | + |
| 43 | + const int ndocs = tokensList.size(); |
32 | 44 | Rcpp::NumericVector bm25 (ndocs, 0.0);
|
33 | 45 |
|
34 | 46 | for (int i = 0; i < ndocs; i++) {
|
|
0 commit comments