Skip to content

Commit 62ec188

Browse files
msariyuce authored and andrewseidl committed
Renamed CsvParserUtils to DelimitedParserUtils. Add newline at eof of DelimitedParserUtils.h. Minor performance tweak for add_value().
1 parent e051cc8 commit 62ec188

5 files changed

+121
-121
lines changed

Import/CMakeLists.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ endif()
1010

1111
add_library(CsvImport Importer.cpp Importer.h ${S3Archive})
1212

13-
add_library(CsvParserUtils CsvParserUtils.cpp CsvParserUtils.h)
13+
add_library(DelimitedParserUtils DelimitedParserUtils.cpp DelimitedParserUtils.h)
1414

15-
target_link_libraries(CsvImport CsvParserUtils mapd_thrift Shared Catalog Chunk DataMgr StringDictionary ${GDAL_LIBRARIES} ${CMAKE_DL_LIBS}
15+
target_link_libraries(CsvImport DelimitedParserUtils mapd_thrift Shared Catalog Chunk DataMgr StringDictionary ${GDAL_LIBRARIES} ${CMAKE_DL_LIBS}
1616
${LibArchive_LIBRARIES} ${IMPORT_LIBRARIES} ${Arrow_LIBRARIES})
1717

1818
install(DIRECTORY ${CMAKE_SOURCE_DIR}/ThirdParty/gdal-data DESTINATION "ThirdParty")
@@ -22,7 +22,7 @@ install(DIRECTORY ${CMAKE_SOURCE_DIR}/ThirdParty/geo_samples DESTINATION "ThirdP
2222
add_custom_target(geo_samples ALL COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/ThirdParty/geo_samples" "${CMAKE_BINARY_DIR}/ThirdParty/geo_samples")
2323

2424
add_library(RowToColumn RowToColumnLoader.cpp RowToColumnLoader.h)
25-
target_link_libraries(RowToColumn ThriftClient CsvParserUtils)
25+
target_link_libraries(RowToColumn ThriftClient DelimitedParserUtils)
2626

2727
add_executable(StreamImporter StreamImporter.cpp)
2828
target_link_libraries(StreamImporter RowToColumn mapd_thrift Shared ${CMAKE_DL_LIBS} ${Boost_LIBRARIES} ${PROFILER_LIBS})

Import/CsvParserUtils.cpp Import/DelimitedParserUtils.cpp

+76-71
Original file line numberDiff line numberDiff line change
@@ -20,27 +20,45 @@
2020
* @brief Implementation of CsvParserUtils class.
2121
*/
2222

23-
#include "Import/CsvParserUtils.h"
23+
#include "Import/DelimitedParserUtils.h"
2424

2525
#include "Shared/Logger.h"
2626
#include "StringDictionary/StringDictionary.h"
2727

2828
namespace {
29-
static const bool is_eol(const char& p, const std::string& line_delims) {
30-
for (auto i : line_delims) {
31-
if (p == i) {
32-
return true;
33-
}
29+
inline bool is_eol(const char& c, const Importer_NS::CopyParams& copy_params) {
30+
return c == copy_params.line_delim || c == '\n' || c == '\r';
31+
}
32+
33+
inline void trim_space(const char*& field_begin, const char*& field_end) {
34+
while (field_begin < field_end && (*field_begin == ' ' || *field_begin == '\r')) {
35+
++field_begin;
36+
}
37+
while (field_begin < field_end &&
38+
(*(field_end - 1) == ' ' || *(field_end - 1) == '\r')) {
39+
--field_end;
40+
}
41+
}
42+
43+
inline void trim_quotes(const char*& field_begin,
44+
const char*& field_end,
45+
const Importer_NS::CopyParams& copy_params) {
46+
if (copy_params.quoted && field_end - field_begin > 0 &&
47+
*field_begin == copy_params.quote) {
48+
++field_begin;
49+
}
50+
if (copy_params.quoted && field_end - field_begin > 0 &&
51+
*(field_end - 1) == copy_params.quote) {
52+
--field_end;
3453
}
35-
return false;
3654
}
3755
} // namespace
3856

3957
namespace Importer_NS {
40-
size_t CsvParserUtils::find_beginning(const char* buffer,
41-
size_t begin,
42-
size_t end,
43-
const Importer_NS::CopyParams& copy_params) {
58+
size_t DelimitedParserUtils::find_beginning(const char* buffer,
59+
size_t begin,
60+
size_t end,
61+
const Importer_NS::CopyParams& copy_params) {
4462
// @TODO(wei) line_delim is in quotes not supported
4563
if (begin == 0 || (begin > 0 && buffer[begin - 1] == copy_params.line_delim)) {
4664
return 0;
@@ -55,17 +73,27 @@ size_t CsvParserUtils::find_beginning(const char* buffer,
5573
return i;
5674
}
5775

58-
size_t CsvParserUtils::find_end(const char* buffer,
59-
size_t size,
60-
const Importer_NS::CopyParams& copy_params,
61-
unsigned int& num_rows_this_buffer) {
76+
size_t DelimitedParserUtils::find_end(const char* buffer,
77+
size_t size,
78+
const Importer_NS::CopyParams& copy_params,
79+
unsigned int& num_rows_this_buffer) {
6280
size_t last_line_delim_pos = 0;
6381
if (copy_params.quoted) {
6482
const char* current = buffer;
65-
last_line_delim_pos = 0;
6683
bool in_quote = false;
6784

6885
while (current < buffer + size) {
86+
while (!in_quote && current < buffer + size) {
87+
// We are outside of quotes. We have to find the last possible line delimiter.
88+
if (*current == copy_params.line_delim) {
89+
last_line_delim_pos = current - buffer;
90+
++num_rows_this_buffer;
91+
} else if (*current == copy_params.quote) {
92+
in_quote = true;
93+
}
94+
++current;
95+
}
96+
6997
while (in_quote && current < buffer + size) {
7098
// We are in a quoted field. We have to find the ending quote.
7199
if ((*current == copy_params.escape) && (current < buffer + size - 1) &&
@@ -76,17 +104,6 @@ size_t CsvParserUtils::find_end(const char* buffer,
76104
}
77105
++current;
78106
}
79-
80-
// We are outside of quotes. We have to find the last possible line delimiter.
81-
while (!in_quote && current < buffer + size) {
82-
if (*current == copy_params.line_delim) {
83-
last_line_delim_pos = current - buffer;
84-
++num_rows_this_buffer;
85-
} else if (*current == copy_params.quote) {
86-
in_quote = true;
87-
}
88-
++current;
89-
}
90107
}
91108
} else {
92109
const char* current = buffer;
@@ -110,22 +127,21 @@ size_t CsvParserUtils::find_end(const char* buffer,
110127
return last_line_delim_pos + 1;
111128
}
112129

113-
const char* CsvParserUtils::get_row(const char* buf,
114-
const char* buf_end,
115-
const char* entire_buf_end,
116-
const Importer_NS::CopyParams& copy_params,
117-
const bool* is_array,
118-
std::vector<std::string>& row,
119-
bool& try_single_thread) {
130+
const char* DelimitedParserUtils::get_row(const char* buf,
131+
const char* buf_end,
132+
const char* entire_buf_end,
133+
const Importer_NS::CopyParams& copy_params,
134+
const bool* is_array,
135+
std::vector<std::string>& row,
136+
bool& try_single_thread) {
120137
const char* field = buf;
121138
const char* p;
122139
bool in_quote = false;
123140
bool in_array = false;
124141
bool has_escape = false;
125142
bool strip_quotes = false;
126143
try_single_thread = false;
127-
std::string line_endings({copy_params.line_delim, '\r', '\n'});
128-
for (p = buf; p < entire_buf_end; p++) {
144+
for (p = buf; p < entire_buf_end; ++p) {
129145
if (*p == copy_params.escape && p < entire_buf_end - 1 &&
130146
*(p + 1) == copy_params.quote) {
131147
p++;
@@ -138,15 +154,19 @@ const char* CsvParserUtils::get_row(const char* buf,
138154
} else if (!in_quote && is_array != nullptr && *p == copy_params.array_begin &&
139155
is_array[row.size()]) {
140156
in_array = true;
141-
} else if (!in_quote && is_array != nullptr && *p == copy_params.array_end &&
142-
is_array[row.size()]) {
143-
in_array = false;
144-
} else if (*p == copy_params.delimiter || is_eol(*p, line_endings)) {
145-
if (!in_quote && !in_array) {
146-
if ((!has_escape && !strip_quotes) ||
147-
(is_array != nullptr && is_array[row.size()])) {
148-
std::string s = trim_space(field, p - field);
149-
row.push_back(s);
157+
while (p < entire_buf_end - 1) { // Array type will be parsed separately.
158+
++p;
159+
if (*p == copy_params.array_end) {
160+
in_array = false;
161+
break;
162+
}
163+
}
164+
} else if (*p == copy_params.delimiter || is_eol(*p, copy_params)) {
165+
if (!in_quote) {
166+
if (!has_escape && !strip_quotes) {
167+
const char* field_end = p;
168+
trim_space(field, field_end);
169+
row.emplace_back(field, field_end - field);
150170
} else {
151171
auto field_buf = std::make_unique<char[]>(p - field + 1);
152172
int j = 0, i = 0;
@@ -159,22 +179,19 @@ const char* CsvParserUtils::get_row(const char* buf,
159179
field_buf[j] = field[i];
160180
}
161181
}
162-
std::string s = trim_space(field_buf.get(), j);
163-
if (copy_params.quoted && s.size() > 0 && s.front() == copy_params.quote) {
164-
s.erase(0, 1);
165-
}
166-
if (copy_params.quoted && s.size() > 0 && s.back() == copy_params.quote) {
167-
s.pop_back();
168-
}
169-
row.push_back(s);
182+
const char* field_begin = field_buf.get();
183+
const char* field_end = field_buf.get() + j;
184+
trim_space(field_begin, field_end);
185+
trim_quotes(field_begin, field_end, copy_params);
186+
row.emplace_back(field_begin, field_end - field_begin);
170187
}
171188
field = p + 1;
172189
has_escape = false;
173190
strip_quotes = false;
174191

175-
if (is_eol(*p, line_endings)) {
192+
if (is_eol(*p, copy_params)) {
176193
// We are at the end of the row. Skip the line endings now.
177-
while (p + 1 < buf_end && is_eol(*(p + 1), line_endings)) {
194+
while (p + 1 < buf_end && is_eol(*(p + 1), copy_params)) {
178195
p++;
179196
}
180197
break;
@@ -196,9 +213,9 @@ const char* CsvParserUtils::get_row(const char* buf,
196213
return p;
197214
}
198215

199-
void CsvParserUtils::parseStringArray(const std::string& s,
200-
const Importer_NS::CopyParams& copy_params,
201-
std::vector<std::string>& string_vec) {
216+
void DelimitedParserUtils::parseStringArray(const std::string& s,
217+
const Importer_NS::CopyParams& copy_params,
218+
std::vector<std::string>& string_vec) {
202219
if (s == copy_params.null_str || s == "NULL" || s.size() < 1 || s.empty()) {
203220
// TODO: should not convert NULL, empty arrays to {"NULL"},
204221
// need to support NULL, empty properly
@@ -230,16 +247,4 @@ void CsvParserUtils::parseStringArray(const std::string& s,
230247
}
231248
}
232249
}
233-
234-
const std::string CsvParserUtils::trim_space(const char* field, const size_t len) {
235-
size_t i = 0;
236-
size_t j = len;
237-
while (i < j && (field[i] == ' ' || field[i] == '\r')) {
238-
i++;
239-
}
240-
while (i < j && (field[j - 1] == ' ' || field[j - 1] == '\r')) {
241-
j--;
242-
}
243-
return std::string(field + i, j - i);
244-
}
245-
} // namespace Importer_NS
250+
} // namespace Importer_NS

Import/CsvParserUtils.h Import/DelimitedParserUtils.h

+1-11
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
#include "Import/CopyParams.h"
2929

3030
namespace Importer_NS {
31-
class CsvParserUtils {
31+
class DelimitedParserUtils {
3232
public:
3333
/**
3434
* @brief Finds the closest possible row beginning in the given buffer.
@@ -94,15 +94,5 @@ class CsvParserUtils {
9494
static void parseStringArray(const std::string& s,
9595
const Importer_NS::CopyParams& copy_params,
9696
std::vector<std::string>& string_vec);
97-
98-
/**
99-
* @brief Trims the "space" and "\r" from the both sides of the given buffer.
100-
*
101-
* @param field Given buffer. (NOT OWN)
102-
* @param len Length of the buffer.
103-
*
104-
* @return Trimmed string.
105-
*/
106-
static const std::string trim_space(const char* field, const size_t len);
10797
};
10898
} // namespace Importer_NS

0 commit comments

Comments
 (0)