Skip to content

Commit b8d152b

Browse files
committed
feat(csvclean): Add --empty-columns option, closes #426
1 parent d05f414 commit b8d152b

File tree

7 files changed

+108
-45
lines changed

7 files changed

+108
-45
lines changed

CHANGELOG.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33

44
**BACKWARDS-INCOMPATIBLE CHANGES:**
55

6-
* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
7-
* :doc:`/scripts/csvclean` no longer fixes errors by default. Opt in using the :code:`--join-short-rows` option.
8-
* :doc:`/scripts/csvclean` joins short rows using a newline by default, instead of a space.
6+
* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` option to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
7+
* :doc:`/scripts/csvclean` no longer fixes errors by default. Opt in to the original behavior using the :code:`--join-short-rows` option.
8+
* :doc:`/scripts/csvclean` joins short rows using a newline by default, instead of a space. Restore the original behavior using the :code:`--separator " "` option.
99

1010
Other changes:
1111

@@ -15,6 +15,7 @@ Other changes:
1515
* :code:`--separator`, to change the string with which to join short rows
1616
* :code:`--fill-short-rows`, to fill short rows with the missing cells
1717
* :code:`--fillvalue`, to change the value with which to fill short rows
18+
* :code:`--empty-columns`, to report empty columns as errors
1819

1920
* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
2021
* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.

csvkit/cleanup.py

+44-12
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
#!/usr/bin/env python
2+
from dataclasses import dataclass
23

3-
from csvkit.exceptions import CSVTestException, LengthMismatchError
4+
5+
@dataclass
6+
class Error:
7+
line_number: int
8+
row: int
9+
msg: str
410

511

612
def join_rows(rows, separator):
@@ -36,12 +42,16 @@ def __init__(
3642
separator='\n',
3743
fill_short_rows=False,
3844
fillvalue=None,
45+
empty_columns=False,
46+
zero_based=False,
3947
):
4048
self.reader = reader
4149
self.join_short_rows = join_short_rows
4250
self.separator = separator
4351
self.fill_short_rows = fill_short_rows
4452
self.fillvalue = fillvalue
53+
self.empty_columns = empty_columns
54+
self.zero_based = zero_based
4555

4656
try:
4757
self.column_names = next(reader)
@@ -56,13 +66,23 @@ def checked_rows(self):
5666
"""
5767
A generator which yields rows which are ready to write to output.
5868
"""
59-
length = len(self.column_names)
69+
len_column_names = len(self.column_names)
6070
joinable_row_errors = []
6171

72+
row_count = 0
73+
empty_counts = [0 for _ in range(len_column_names)]
74+
6275
for row in self.reader:
63-
row_length = len(row)
76+
line_number = self.reader.line_num - 1
77+
row_count += 1
78+
len_row = len(row)
6479

65-
if row_length == length:
80+
if self.empty_columns:
81+
for i, value in enumerate(row):
82+
if value == '':
83+
empty_counts[i] += 1
84+
85+
if len_row == len_column_names:
6686
yield row
6787

6888
if self.join_short_rows:
@@ -71,32 +91,32 @@ def checked_rows(self):
7191

7292
continue
7393

74-
if self.fill_short_rows and row_length < length:
75-
yield row + [self.fillvalue] * (length - row_length)
94+
if self.fill_short_rows and len_row < len_column_names:
95+
yield row + [self.fillvalue] * (len_column_names - len_row)
7696

7797
continue
7898

79-
length_mismatch_error = LengthMismatchError(self.reader.line_num - 1, row, length)
99+
length_error = Error(line_number, row, f'Expected {len_column_names} columns, found {len_row} columns')
80100

81-
self.errors.append(length_mismatch_error)
101+
self.errors.append(length_error)
82102

83103
if self.join_short_rows:
84-
if row_length > length:
104+
if len_row > len_column_names:
85105
# Don't join with long rows.
86106
joinable_row_errors = []
87107
continue
88108

89-
joinable_row_errors.append(length_mismatch_error)
109+
joinable_row_errors.append(length_error)
90110
if len(joinable_row_errors) == 1:
91111
continue
92112

93113
while joinable_row_errors:
94114
fixed_row = join_rows([error.row for error in joinable_row_errors], separator=self.separator)
95115

96-
if len(fixed_row) < length:
116+
if len(fixed_row) < len_column_names:
97117
break
98118

99-
if len(fixed_row) == length:
119+
if len(fixed_row) == len_column_names:
100120
yield fixed_row
101121

102122
for fixed in joinable_row_errors:
@@ -107,3 +127,15 @@ def checked_rows(self):
107127

108128
# keep trying in case we're too long because of a straggler
109129
joinable_row_errors = joinable_row_errors[1:]
130+
131+
if row_count:
132+
if empty_columns := [i for i, count in enumerate(empty_counts) if count == row_count]:
133+
offset = 0 if self.zero_based else 1
134+
self.errors.append(
135+
Error(
136+
1,
137+
["" for _ in range(len_column_names)],
138+
f"Empty columns named {', '.join(repr(self.column_names[i]) for i in empty_columns)}! "
139+
f"Try: csvcut -C {','.join(str(i + offset) for i in empty_columns)}",
140+
)
141+
)

csvkit/exceptions.py

-26
Original file line numberDiff line numberDiff line change
@@ -23,32 +23,6 @@ class ColumnIdentifierError(CustomException):
2323
pass
2424

2525

26-
class CSVTestException(CustomException):
27-
"""
28-
Superclass for all row-test-failed exceptions.
29-
All must have a line number, the problematic row, and a text explanation.
30-
"""
31-
32-
def __init__(self, line_number, row, msg):
33-
super().__init__(msg)
34-
self.line_number = line_number
35-
self.row = row
36-
37-
38-
class LengthMismatchError(CSVTestException):
39-
"""
40-
Encapsulate information about a row which has the wrong length.
41-
"""
42-
43-
def __init__(self, line_number, row, expected_length):
44-
msg = 'Expected %i columns, found %i columns' % (expected_length, len(row))
45-
super().__init__(line_number, row, msg)
46-
47-
@property
48-
def length(self):
49-
return len(self.row)
50-
51-
5226
class InvalidValueForTypeException(CustomException):
5327
"""
5428
Exception raised when a value can not be normalized to a specified type.

csvkit/utilities/csvclean.py

+5
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ def add_arguments(self):
2929
self.argparser.add_argument(
3030
'--fillvalue', dest='fillvalue',
3131
help='The value with which to fill short rows. Defaults to none.')
32+
self.argparser.add_argument(
33+
'--empty-columns', dest='empty_columns', action='store_true',
34+
help='Report empty columns as errors.')
3235

3336
def main(self):
3437
if self.additional_input_expected():
@@ -46,6 +49,8 @@ def main(self):
4649
separator=self.args.separator,
4750
fill_short_rows=self.args.fill_short_rows,
4851
fillvalue=self.args.fillvalue,
52+
empty_columns=self.args.empty_columns,
53+
zero_based=self.args.zero_based,
4954
)
5055

5156
output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)

docs/scripts/csvclean.rst

+27-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ Description
77

88
Cleans a CSV file of common syntax errors:
99

10-
- reports rows that have a different number of columns than the header row
10+
- Reports rows that have a different number of columns than the header row.
11+
- Reports columns that are empty, if the :code:`--empty-columns` option is set.
1112
- If a CSV has unquoted cells that contain line breaks, like:
1213

1314
.. code-block:: none
@@ -103,13 +104,14 @@ All valid rows are written to standard output, and all error rows along with lin
103104
--fillvalue FILLVALUE
104105
The value with which to fill short rows. Defaults to
105106
none.
107+
--empty-columns Report empty columns as errors.
106108
107109
See also: :doc:`../common_arguments`.
108110

109111
Examples
110112
========
111113

112-
Test a file with known bad rows:
114+
Test a file with data rows that are shorter and longer than the header row:
113115

114116
.. code-block:: console
115117
@@ -125,6 +127,29 @@ Test a file with known bad rows:
125127

126128
If any data rows are longer than the header row, you need to add columns manually: for example, by adding one or more delimiters (``,``) to the end of the header row. :code:`csvclean` can't do this, because it is designed to work with standard input, and correcting an error at the start of the CSV data based on an observation later in the CSV data would require holding all the CSV data in memory – which is not an option for large files.
127129

130+
Test a file with empty columns:
131+
132+
.. code-block:: console
133+
134+
$ csvclean --empty-columns examples/test_empty_columns.csv 2> errors.csv
135+
a,b,c,,
136+
a,,,,
137+
,,c,,
138+
,,,,
139+
$ cat errors.csv
140+
line_number,msg,a,b,c,,
141+
1,"Empty columns named 'b', '', ''! Try: csvcut -C 2,4,5",,,,,
142+
143+
Use :doc:`csvcut` to exclude the empty columns:
144+
145+
.. code-block:: console
146+
147+
$ csvcut -C 2,4,5 examples/test_empty_columns.csv
148+
a,c
149+
a,
150+
,c
151+
,
152+
128153
To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use:
129154

130155
.. code-block:: bash

examples/test_empty_columns.csv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
a,b,c,,
2+
a,,,,
3+
,,c,,
4+
,,,,

tests/test_utilities/test_csvclean.py

+24-2
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,13 @@ def test_simple(self):
7575
def test_no_header_row(self):
7676
self.assertCleaned(['examples/no_header_row.csv'], [
7777
['1', '2', '3'],
78-
], [])
78+
])
7979

8080
def test_header_normalize_space(self):
8181
self.assertCleaned(['--header-normalize-space', 'examples/test_header_newline.csv'], [
8282
['start end', 'b', 'c'],
8383
['d', 'e', 'f'],
84-
], [])
84+
])
8585

8686
def test_join_short_rows(self):
8787
self.assertCleaned(['--join-short-rows', 'examples/test_join_short_rows.csv'], [
@@ -113,6 +113,28 @@ def test_fill_short_rows_separator(self):
113113
['3', 'b', 'c'],
114114
])
115115

116+
def test_empty_columns(self):
117+
self.assertCleaned(['--empty-columns', 'examples/test_empty_columns.csv'], [
118+
['a', 'b', 'c', '', ''],
119+
['a', '', '', '', ''],
120+
['', '', 'c', '', ''],
121+
['', '', '', '', ''],
122+
], [
123+
['line_number', 'msg', 'a', 'b', 'c', '', ''],
124+
['1', "Empty columns named 'b', '', ''! Try: csvcut -C 2,4,5", '', '', '', '', ''],
125+
])
126+
127+
def test_empty_columns_zero(self):
128+
self.assertCleaned(['--empty-columns', '--zero', 'examples/test_empty_columns.csv'], [
129+
['a', 'b', 'c', '', ''],
130+
['a', '', '', '', ''],
131+
['', '', 'c', '', ''],
132+
['', '', '', '', ''],
133+
], [
134+
['line_number', 'msg', 'a', 'b', 'c', '', ''],
135+
['1', "Empty columns named 'b', '', ''! Try: csvcut -C 1,3,4", '', '', '', '', ''],
136+
])
137+
116138
def test_removes_optional_quote_characters(self):
117139
self.assertCleaned(['examples/optional_quote_characters.csv'], [
118140
['a', 'b', 'c'],

0 commit comments

Comments
 (0)