Skip to content

Commit c6265f4

Browse files
committed
feat(csvclean): Add --fill-short-rows and --fillvalue options
1 parent e4b2dd0 commit c6265f4

File tree

6 files changed

+111
-29
lines changed

6 files changed

+111
-29
lines changed

CHANGELOG.rst

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ Other changes:
1313

1414
* :code:`--header-normalize-space`, to strip leading and trailing whitespace and replace sequences of whitespace characters by a single space in the header
1515
* :code:`--separator`, to change the string with which to join short rows
16+
* :code:`--fill-short-rows`, to fill short rows with the missing cells
17+
* :code:`--fillvalue`, to change the value with which to fill short rows
1618

1719
* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
1820
* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.

csvkit/cleanup.py

+43-26
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,20 @@ class RowChecker:
2828
Iterate over rows of a CSV producing cleaned rows and storing error rows.
2929
"""
3030

31-
def __init__(self, reader, header_normalize_space=False, join_short_rows=False, separator='\n'):
31+
def __init__(
32+
self,
33+
reader,
34+
header_normalize_space=False,
35+
join_short_rows=False,
36+
separator='\n',
37+
fill_short_rows=False,
38+
fillvalue=None,
39+
):
3240
self.reader = reader
3341
self.join_short_rows = join_short_rows
3442
self.separator = separator
43+
self.fill_short_rows = fill_short_rows
44+
self.fillvalue = fillvalue
3545

3646
try:
3747
self.column_names = next(reader)
@@ -50,43 +60,50 @@ def checked_rows(self):
5060
joinable_row_errors = []
5161

5262
for row in self.reader:
53-
if len(row) == length:
63+
row_length = len(row)
64+
65+
if row_length == length:
5466
yield row
5567

56-
# Don't join rows across valid rows.
57-
joinable_row_errors = []
68+
if self.join_short_rows:
69+
# Don't join rows across valid rows.
70+
joinable_row_errors = []
71+
72+
continue
73+
74+
if self.fill_short_rows and row_length < length:
75+
yield row + [self.fillvalue] * (length - row_length)
76+
5877
continue
5978

6079
length_mismatch_error = LengthMismatchError(self.reader.line_num - 1, row, length)
6180

6281
self.errors.append(length_mismatch_error)
6382

64-
if len(row) > length:
65-
# Don't join with long rows.
66-
joinable_row_errors = []
67-
continue
68-
69-
if not self.join_short_rows:
70-
continue
83+
if self.join_short_rows:
84+
if row_length > length:
85+
# Don't join with long rows.
86+
joinable_row_errors = []
87+
continue
7188

72-
joinable_row_errors.append(length_mismatch_error)
73-
if len(joinable_row_errors) == 1:
74-
continue
89+
joinable_row_errors.append(length_mismatch_error)
90+
if len(joinable_row_errors) == 1:
91+
continue
7592

76-
while joinable_row_errors:
77-
fixed_row = join_rows([error.row for error in joinable_row_errors], separator=self.separator)
93+
while joinable_row_errors:
94+
fixed_row = join_rows([error.row for error in joinable_row_errors], separator=self.separator)
7895

79-
if len(fixed_row) < length:
80-
break
96+
if len(fixed_row) < length:
97+
break
8198

82-
if len(fixed_row) == length:
83-
yield fixed_row
99+
if len(fixed_row) == length:
100+
yield fixed_row
84101

85-
for fixed in joinable_row_errors:
86-
self.errors.remove(fixed)
102+
for fixed in joinable_row_errors:
103+
self.errors.remove(fixed)
87104

88-
joinable_row_errors = []
89-
break
105+
joinable_row_errors = []
106+
break
90107

91-
# keep trying in case we're too long because of a straggler
92-
joinable_row_errors = joinable_row_errors[1:]
108+
# keep trying in case we're too long because of a straggler
109+
joinable_row_errors = joinable_row_errors[1:]

csvkit/utilities/csvclean.py

+11
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,29 @@ def add_arguments(self):
2323
self.argparser.add_argument(
2424
'--separator', dest='separator', default='\n',
2525
help='The string with which to join short rows. Defaults to a newline.')
26+
self.argparser.add_argument(
27+
'--fill-short-rows', dest='fill_short_rows', action='store_true',
28+
help='Fill short rows with the missing cells.')
29+
self.argparser.add_argument(
30+
'--fillvalue', dest='fillvalue',
31+
help='The value with which to fill short rows. Defaults to none.')
2632

2733
def main(self):
2834
if self.additional_input_expected():
2935
sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')
3036

37+
if self.args.join_short_rows and self.args.fill_short_rows:
38+
self.argparser.error('The --join-short-rows and --fill-short-rows options are mutually exclusive.')
39+
3140
reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
3241

3342
checker = RowChecker(
3443
reader,
3544
header_normalize_space=self.args.header_normalize_space,
3645
join_short_rows=self.args.join_short_rows,
3746
separator=self.args.separator,
47+
fill_short_rows=self.args.fill_short_rows,
48+
fillvalue=self.args.fillvalue,
3849
)
3950

4051
output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)

docs/scripts/csvclean.rst

+36
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,30 @@ Cleans a CSV file of common syntax errors:
3636
1,"1 Main St, Springfield",US
3737
2,"123 Acadia Avenue, London",GB
3838
39+
- If a CSV has short rows (rows with fewer cells than the header row, because trailing delimiters are missing), like:
40+
41+
.. code-block:: none
42+
43+
id,name,country
44+
1,Alice
45+
2,Bob,CA
46+
47+
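You can pad each short row with trailing cells, up to the length of the header row, with :code:`--fill-short-rows` (this option cannot be combined with :code:`--join-short-rows`):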
48+
49+
.. code-block:: none
50+
51+
id,name,country
52+
1,Alice,
53+
2,Bob,CA
54+
55+
By default, short rows are filled with empty cells. To change the value used to fill short rows, use :code:`--fillvalue`. For example, with :code:`--fillvalue "US"`:
56+
57+
.. code-block:: none
58+
59+
id,name,country
60+
1,Alice,US
61+
2,Bob,CA
62+
3963
All valid rows are written to standard output, and all error rows along with line numbers and descriptions are written to standard error. If there are error rows, the exit code will be 1.
4064

4165
.. note::
@@ -63,6 +87,18 @@ All valid rows are written to standard output, and all error rows along with lin
6387
6488
optional arguments:
6589
-h, --help show this help message and exit
90+
--header-normalize-space
91+
Strip leading and trailing whitespace and replace
92+
sequences of whitespace characters by a single space
93+
in the header.
94+
--join-short-rows Merges short rows into a single row.
95+
--separator SEPARATOR
96+
The string with which to join short rows. Defaults to
97+
a newline.
98+
--fill-short-rows Fill short rows with the missing cells.
99+
--fillvalue FILLVALUE
100+
The value with which to fill short rows. Defaults to
101+
none.
66102
67103
See also: :doc:`../common_arguments`.
68104

examples/test_join_short_rows.csv

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
a,b,c
22
1,cat
3-
dog,3
3+
dog,c
44
3,b,c

tests/test_utilities/test_csvclean.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,30 @@ def test_header_normalize_space(self):
8686
def test_join_short_rows(self):
8787
self.assertCleaned(['--join-short-rows', 'examples/test_join_short_rows.csv'], [
8888
['a', 'b', 'c'],
89-
['1', 'cat\ndog', '3'],
89+
['1', 'cat\ndog', 'c'],
9090
['3', 'b', 'c'],
9191
])
9292

9393
def test_join_short_rows_separator(self):
9494
self.assertCleaned(['--join-short-rows', '--separator', 'XYZ', 'examples/test_join_short_rows.csv'], [
9595
['a', 'b', 'c'],
96-
['1', 'catXYZdog', '3'],
96+
['1', 'catXYZdog', 'c'],
97+
['3', 'b', 'c'],
98+
])
99+
100+
def test_fill_short_rows(self):
101+
self.assertCleaned(['--fill-short-rows', 'examples/test_join_short_rows.csv'], [
102+
['a', 'b', 'c'],
103+
['1', 'cat', ''],
104+
['dog', 'c', ''],
105+
['3', 'b', 'c'],
106+
])
107+
108+
def test_fill_short_rows_separator(self):
109+
self.assertCleaned(['--fill-short-rows', '--fillvalue', 'XYZ', 'examples/test_join_short_rows.csv'], [
110+
['a', 'b', 'c'],
111+
['1', 'cat', 'XYZ'],
112+
['dog', 'c', 'XYZ'],
97113
['3', 'b', 'c'],
98114
])
99115

0 commit comments

Comments
 (0)