Skip to content

Commit e4b2dd0

Browse files
committed
feat(csvclean): Add --join-short-rows and --separator options. BREAKING: Fixes are opt-in. Default separator is newline, not space.
1 parent 2f1ac26 commit e4b2dd0

File tree

6 files changed

+78
-20
lines changed

6 files changed

+78
-20
lines changed

CHANGELOG.rst

+8-2
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,19 @@
44
**BACKWARDS-INCOMPATIBLE CHANGES:**
55

66
* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
7+
* :doc:`/scripts/csvclean` no longer fixes errors by default. Opt in using the :code:`--join-short-rows` option.
8+
* :doc:`/scripts/csvclean` joins short rows using a newline by default, instead of a space.
79

810
Other changes:
911

10-
* feat: :doc:`/scripts/csvclean` adds a :code:`--header-normalize-space` option to strip leading and trailing whitespace and replace sequences of whitespace characters by a single space in the header.
12+
* feat: :doc:`/scripts/csvclean` adds the options:
13+
14+
* :code:`--header-normalize-space`, to strip leading and trailing whitespace and replace sequences of whitespace characters by a single space in the header
15+
* :code:`--separator`, to change the string with which to join short rows
16+
1117
* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
1218
* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
13-
* fix: :doc:`/scripts/csvclean` no longer reports length mismatch errors that were fixed.
19+
* fix: :doc:`/scripts/csvclean`: The :code:`--join-short-rows` option no longer reports length mismatch errors that were fixed.
1420
* fix: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option works with 2 (`csv.QUOTE_NONNUMERIC <https://docs.python.org/3/library/csv.html#csv.QUOTE_NONNUMERIC>`__). Use the :code:`--locale` option to set the locale of any formatted numbers.
1521

1622
1.5.0 - March 28, 2024

csvkit/cleanup.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
from csvkit.exceptions import CSVTestException, LengthMismatchError
44

55

6-
def join_rows(rows, joiner=' '):
6+
def join_rows(rows, separator):
77
"""
88
Given a series of rows, return them as a single row where the inner edge cells are merged.
99
10-
:param joiner:
11-
The separator between cells, a single space by default.
10+
:param separator:
11+
The string with which to join the cells.
1212
"""
1313
rows = list(rows)
1414
fixed_row = rows[0][:]
@@ -17,7 +17,7 @@ def join_rows(rows, joiner=' '):
1717
if len(row) == 0:
1818
row = ['']
1919

20-
fixed_row[-1] += f"{joiner}{row[0]}"
20+
fixed_row[-1] += f"{separator}{row[0]}"
2121
fixed_row.extend(row[1:])
2222

2323
return fixed_row
@@ -28,8 +28,10 @@ class RowChecker:
2828
Iterate over rows of a CSV producing cleaned rows and storing error rows.
2929
"""
3030

31-
def __init__(self, reader, header_normalize_space=False):
31+
def __init__(self, reader, header_normalize_space=False, join_short_rows=False, separator='\n'):
3232
self.reader = reader
33+
self.join_short_rows = join_short_rows
34+
self.separator = separator
3335

3436
try:
3537
self.column_names = next(reader)
@@ -64,12 +66,15 @@ def checked_rows(self):
6466
joinable_row_errors = []
6567
continue
6668

69+
if not self.join_short_rows:
70+
continue
71+
6772
joinable_row_errors.append(length_mismatch_error)
6873
if len(joinable_row_errors) == 1:
6974
continue
7075

7176
while joinable_row_errors:
72-
fixed_row = join_rows([error.row for error in joinable_row_errors], joiner=' ')
77+
fixed_row = join_rows([error.row for error in joinable_row_errors], separator=self.separator)
7378

7479
if len(fixed_row) < length:
7580
break

csvkit/utilities/csvclean.py

+8
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ def add_arguments(self):
1717
'--header-normalize-space', dest='header_normalize_space', action='store_true',
1818
help='Strip leading and trailing whitespace and replace sequences of whitespace characters by a single '
1919
'space in the header.')
20+
self.argparser.add_argument(
21+
'--join-short-rows', dest='join_short_rows', action='store_true',
22+
help='Merges short rows into a single row.')
23+
self.argparser.add_argument(
24+
'--separator', dest='separator', default='\n',
25+
help='The string with which to join short rows. Defaults to a newline.')
2026

2127
def main(self):
2228
if self.additional_input_expected():
@@ -27,6 +33,8 @@ def main(self):
2733
checker = RowChecker(
2834
reader,
2935
header_normalize_space=self.args.header_normalize_space,
36+
join_short_rows=self.args.join_short_rows,
37+
separator=self.args.separator,
3038
)
3139

3240
output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)

docs/scripts/csvclean.rst

+37-9
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,46 @@ Description
77

88
Cleans a CSV file of common syntax errors:
99

10-
* reports rows that have a different number of columns than the header row
11-
* attempts to correct the CSV by merging short rows into a single row
10+
- reports rows that have a different number of columns than the header row
11+
- If a CSV has unquoted cells that contain line breaks, like:
1212

13-
Note that every csvkit tool does the following:
13+
.. code-block:: none
1414
15-
* removes optional quote characters, unless the `--quoting` (`-u`) option is set to change this behavior
16-
* changes the field delimiter to a comma, if the input delimiter is set with the `--delimiter` (`-d`) or `--tabs` (`-t`) options
17-
* changes the record delimiter to a line feed (LF or ``\n``)
18-
* changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option
19-
* changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option
15+
id,address,country
16+
1,1 Main St
17+
Springfield,US
18+
2,123 Acadia Avenue
19+
London,GB
2020
21-
All valid rows are written to standard output, and all error rows along with line numbers and descriptions are written to standard error. If there are error rows, the exit code will be 1::
21+
Use :code:`--join-short-rows` to attempt to correct the errors by merging short rows into a single row:
22+
23+
.. code-block:: none
24+
25+
id,address,country
26+
1,"1 Main St
27+
Springfield",US
28+
2,"123 Acadia Avenue
29+
London",GB
30+
31+
To change the string used to join the lines, use :code:`--separator`. For example, with :code:`--separator ", "`:
32+
33+
.. code-block:: none
34+
35+
id,address,country
36+
1,"1 Main St, Springfield",US
37+
2,"123 Acadia Avenue, London",GB
38+
39+
All valid rows are written to standard output, and all error rows along with line numbers and descriptions are written to standard error. If there are error rows, the exit code will be 1.
40+
41+
.. note::
42+
43+
Every csvkit tool does the following:
44+
45+
- Removes optional quote characters, unless the `--quoting` (`-u`) option is set to change this behavior
46+
- Changes the field delimiter to a comma, if the input delimiter is set with the `--delimiter` (`-d`) or `--tabs` (`-t`) options
47+
- Changes the record delimiter to a line feed (LF or ``\n``)
48+
- Changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option
49+
- Changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option
2250

2351
.. code-block:: none
2452

examples/test_join_short_rows.csv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
a,b,c
2+
1,cat
3+
dog,3
4+
3,b,c

tests/test_utilities/test_csvclean.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,17 @@ def test_header_normalize_space(self):
8383
['d', 'e', 'f'],
8484
], [])
8585

86-
def test_merge_short_rows(self):
87-
self.assertCleaned(['examples/test_merge_short_rows.csv'], [
86+
def test_join_short_rows(self):
87+
self.assertCleaned(['--join-short-rows', 'examples/test_join_short_rows.csv'], [
8888
['a', 'b', 'c'],
89-
['1', 'cat dog', '3'],
89+
['1', 'cat\ndog', '3'],
90+
['3', 'b', 'c'],
91+
])
92+
93+
def test_join_short_rows_separator(self):
94+
self.assertCleaned(['--join-short-rows', '--separator', 'XYZ', 'examples/test_join_short_rows.csv'], [
95+
['a', 'b', 'c'],
96+
['1', 'catXYZdog', '3'],
9097
['3', 'b', 'c'],
9198
])
9299

0 commit comments

Comments
 (0)