20
20
* @brief Implementation of CsvParserUtils class.
21
21
*/
22
22
23
- #include " Import/CsvParserUtils .h"
23
+ #include " Import/DelimitedParserUtils .h"
24
24
25
25
#include " Shared/Logger.h"
26
26
#include " StringDictionary/StringDictionary.h"
27
27
28
28
namespace {
29
- static const bool is_eol (const char & p, const std::string& line_delims) {
30
- for (auto i : line_delims) {
31
- if (p == i) {
32
- return true ;
33
- }
29
+ inline bool is_eol (const char & c, const Importer_NS::CopyParams& copy_params) {
30
+ return c == copy_params.line_delim || c == ' \n ' || c == ' \r ' ;
31
+ }
32
+
33
+ inline void trim_space (const char *& field_begin, const char *& field_end) {
34
+ while (field_begin < field_end && (*field_begin == ' ' || *field_begin == ' \r ' )) {
35
+ ++field_begin;
36
+ }
37
+ while (field_begin < field_end &&
38
+ (*(field_end - 1 ) == ' ' || *(field_end - 1 ) == ' \r ' )) {
39
+ --field_end;
40
+ }
41
+ }
42
+
43
+ inline void trim_quotes (const char *& field_begin,
44
+ const char *& field_end,
45
+ const Importer_NS::CopyParams& copy_params) {
46
+ if (copy_params.quoted && field_end - field_begin > 0 &&
47
+ *field_begin == copy_params.quote ) {
48
+ ++field_begin;
49
+ }
50
+ if (copy_params.quoted && field_end - field_begin > 0 &&
51
+ *(field_end - 1 ) == copy_params.quote ) {
52
+ --field_end;
34
53
}
35
- return false ;
36
54
}
37
55
} // namespace
38
56
39
57
namespace Importer_NS {
40
- size_t CsvParserUtils ::find_beginning (const char * buffer,
41
- size_t begin,
42
- size_t end,
43
- const Importer_NS::CopyParams& copy_params) {
58
+ size_t DelimitedParserUtils ::find_beginning (const char * buffer,
59
+ size_t begin,
60
+ size_t end,
61
+ const Importer_NS::CopyParams& copy_params) {
44
62
// @TODO(wei) line_delim inside quotes is not supported
45
63
if (begin == 0 || (begin > 0 && buffer[begin - 1 ] == copy_params.line_delim )) {
46
64
return 0 ;
@@ -55,17 +73,27 @@ size_t CsvParserUtils::find_beginning(const char* buffer,
55
73
return i;
56
74
}
57
75
58
- size_t CsvParserUtils ::find_end (const char * buffer,
59
- size_t size,
60
- const Importer_NS::CopyParams& copy_params,
61
- unsigned int & num_rows_this_buffer) {
76
+ size_t DelimitedParserUtils ::find_end (const char * buffer,
77
+ size_t size,
78
+ const Importer_NS::CopyParams& copy_params,
79
+ unsigned int & num_rows_this_buffer) {
62
80
size_t last_line_delim_pos = 0 ;
63
81
if (copy_params.quoted ) {
64
82
const char * current = buffer;
65
- last_line_delim_pos = 0 ;
66
83
bool in_quote = false ;
67
84
68
85
while (current < buffer + size) {
86
+ while (!in_quote && current < buffer + size) {
87
+ // We are outside of quotes. We have to find the last possible line delimiter.
88
+ if (*current == copy_params.line_delim ) {
89
+ last_line_delim_pos = current - buffer;
90
+ ++num_rows_this_buffer;
91
+ } else if (*current == copy_params.quote ) {
92
+ in_quote = true ;
93
+ }
94
+ ++current;
95
+ }
96
+
69
97
while (in_quote && current < buffer + size) {
70
98
// We are in a quoted field. We have to find the ending quote.
71
99
if ((*current == copy_params.escape ) && (current < buffer + size - 1 ) &&
@@ -76,17 +104,6 @@ size_t CsvParserUtils::find_end(const char* buffer,
76
104
}
77
105
++current;
78
106
}
79
-
80
- // We are outside of quotes. We have to find the last possible line delimiter.
81
- while (!in_quote && current < buffer + size) {
82
- if (*current == copy_params.line_delim ) {
83
- last_line_delim_pos = current - buffer;
84
- ++num_rows_this_buffer;
85
- } else if (*current == copy_params.quote ) {
86
- in_quote = true ;
87
- }
88
- ++current;
89
- }
90
107
}
91
108
} else {
92
109
const char * current = buffer;
@@ -110,22 +127,21 @@ size_t CsvParserUtils::find_end(const char* buffer,
110
127
return last_line_delim_pos + 1 ;
111
128
}
112
129
113
- const char * CsvParserUtils ::get_row (const char * buf,
114
- const char * buf_end,
115
- const char * entire_buf_end,
116
- const Importer_NS::CopyParams& copy_params,
117
- const bool * is_array,
118
- std::vector<std::string>& row,
119
- bool & try_single_thread) {
130
+ const char * DelimitedParserUtils ::get_row (const char * buf,
131
+ const char * buf_end,
132
+ const char * entire_buf_end,
133
+ const Importer_NS::CopyParams& copy_params,
134
+ const bool * is_array,
135
+ std::vector<std::string>& row,
136
+ bool & try_single_thread) {
120
137
const char * field = buf;
121
138
const char * p;
122
139
bool in_quote = false ;
123
140
bool in_array = false ;
124
141
bool has_escape = false ;
125
142
bool strip_quotes = false ;
126
143
try_single_thread = false ;
127
- std::string line_endings ({copy_params.line_delim , ' \r ' , ' \n ' });
128
- for (p = buf; p < entire_buf_end; p++) {
144
+ for (p = buf; p < entire_buf_end; ++p) {
129
145
if (*p == copy_params.escape && p < entire_buf_end - 1 &&
130
146
*(p + 1 ) == copy_params.quote ) {
131
147
p++;
@@ -138,15 +154,19 @@ const char* CsvParserUtils::get_row(const char* buf,
138
154
} else if (!in_quote && is_array != nullptr && *p == copy_params.array_begin &&
139
155
is_array[row.size ()]) {
140
156
in_array = true ;
141
- } else if (!in_quote && is_array != nullptr && *p == copy_params.array_end &&
142
- is_array[row.size ()]) {
143
- in_array = false ;
144
- } else if (*p == copy_params.delimiter || is_eol (*p, line_endings)) {
145
- if (!in_quote && !in_array) {
146
- if ((!has_escape && !strip_quotes) ||
147
- (is_array != nullptr && is_array[row.size ()])) {
148
- std::string s = trim_space (field, p - field);
149
- row.push_back (s);
157
+ while (p < entire_buf_end - 1 ) { // Array type will be parsed separately.
158
+ ++p;
159
+ if (*p == copy_params.array_end ) {
160
+ in_array = false ;
161
+ break ;
162
+ }
163
+ }
164
+ } else if (*p == copy_params.delimiter || is_eol (*p, copy_params)) {
165
+ if (!in_quote) {
166
+ if (!has_escape && !strip_quotes) {
167
+ const char * field_end = p;
168
+ trim_space (field, field_end);
169
+ row.emplace_back (field, field_end - field);
150
170
} else {
151
171
auto field_buf = std::make_unique<char []>(p - field + 1 );
152
172
int j = 0 , i = 0 ;
@@ -159,22 +179,19 @@ const char* CsvParserUtils::get_row(const char* buf,
159
179
field_buf[j] = field[i];
160
180
}
161
181
}
162
- std::string s = trim_space (field_buf.get (), j);
163
- if (copy_params.quoted && s.size () > 0 && s.front () == copy_params.quote ) {
164
- s.erase (0 , 1 );
165
- }
166
- if (copy_params.quoted && s.size () > 0 && s.back () == copy_params.quote ) {
167
- s.pop_back ();
168
- }
169
- row.push_back (s);
182
+ const char * field_begin = field_buf.get ();
183
+ const char * field_end = field_buf.get () + j;
184
+ trim_space (field_begin, field_end);
185
+ trim_quotes (field_begin, field_end, copy_params);
186
+ row.emplace_back (field_begin, field_end - field_begin);
170
187
}
171
188
field = p + 1 ;
172
189
has_escape = false ;
173
190
strip_quotes = false ;
174
191
175
- if (is_eol (*p, line_endings )) {
192
+ if (is_eol (*p, copy_params )) {
176
193
// We are at the end of the row. Skip the line endings now.
177
- while (p + 1 < buf_end && is_eol (*(p + 1 ), line_endings )) {
194
+ while (p + 1 < buf_end && is_eol (*(p + 1 ), copy_params )) {
178
195
p++;
179
196
}
180
197
break ;
@@ -196,9 +213,9 @@ const char* CsvParserUtils::get_row(const char* buf,
196
213
return p;
197
214
}
198
215
199
- void CsvParserUtils ::parseStringArray (const std::string& s,
200
- const Importer_NS::CopyParams& copy_params,
201
- std::vector<std::string>& string_vec) {
216
+ void DelimitedParserUtils ::parseStringArray (const std::string& s,
217
+ const Importer_NS::CopyParams& copy_params,
218
+ std::vector<std::string>& string_vec) {
202
219
if (s == copy_params.null_str || s == " NULL" || s.size () < 1 || s.empty ()) {
203
220
// TODO: should not convert NULL, empty arrays to {"NULL"},
204
221
// need to support NULL, empty properly
@@ -230,16 +247,4 @@ void CsvParserUtils::parseStringArray(const std::string& s,
230
247
}
231
248
}
232
249
}
233
-
234
- const std::string CsvParserUtils::trim_space (const char * field, const size_t len) {
235
- size_t i = 0 ;
236
- size_t j = len;
237
- while (i < j && (field[i] == ' ' || field[i] == ' \r ' )) {
238
- i++;
239
- }
240
- while (i < j && (field[j - 1 ] == ' ' || field[j - 1 ] == ' \r ' )) {
241
- j--;
242
- }
243
- return std::string (field + i, j - i);
244
- }
245
- } // namespace Importer_NS
250
+ } // namespace Importer_NS
0 commit comments