Skip to content

Commit 8d700e8

Browse files
committed
csvclean: Use standard output and standard error, and use exit code 1 if errors #781 #195
1 parent d00ea20 commit 8d700e8

File tree

6 files changed

+103
-145
lines changed

6 files changed

+103
-145
lines changed

CHANGELOG.rst

+7
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
2.0.0 - Unreleased
2+
------------------
3+
4+
**BACKWARDS-INCOMPATIBLE CHANGES**
5+
6+
* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
7+
18
1.5.0 - March 28, 2024
29
----------------------
310

csvkit/cleanup.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55

66
def join_rows(rows, joiner=' '):
77
"""
8-
Given a series of rows, return them as a single row where the inner edge cells are merged. By default joins with a
9-
single space character, but you can specify new-line, empty string, or anything else with the 'joiner' kwarg.
8+
Given a series of rows, return them as a single row where the inner edge cells are merged.
9+
10+
:param joiner:
11+
The separator between cells, a single space by default.
1012
"""
1113
rows = list(rows)
1214
fixed_row = rows[0][:]
@@ -33,8 +35,6 @@ def __init__(self, reader):
3335
except StopIteration:
3436
self.column_names = []
3537
self.errors = []
36-
self.rows_joined = 0
37-
self.joins = 0
3838

3939
def checked_rows(self):
4040
"""
@@ -69,9 +69,6 @@ def checked_rows(self):
6969
break
7070

7171
if len(fixed_row) == length:
72-
self.rows_joined += len(joinable_row_errors)
73-
self.joins += 1
74-
7572
yield fixed_row
7673

7774
for fixed in joinable_row_errors:

csvkit/cli.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -68,19 +68,26 @@ class CSVKitUtility:
6868
epilog = ''
6969
override_flags = ''
7070

71-
def __init__(self, args=None, output_file=None):
71+
def __init__(self, args=None, output_file=None, error_file=None):
7272
"""
7373
Perform argument processing and other setup for a CSVKitUtility.
7474
"""
7575
self._init_common_parser()
7676
self.add_arguments()
7777
self.args = self.argparser.parse_args(args)
78+
7879
# Output file is only set during testing.
7980
if output_file is None:
8081
self.output_file = sys.stdout
8182
else:
8283
self.output_file = output_file
8384

85+
# Error file is only set during testing.
86+
if error_file is None:
87+
self.error_file = sys.stderr
88+
else:
89+
self.error_file = error_file
90+
8491
self.reader_kwargs = self._extract_csv_reader_kwargs()
8592
self.writer_kwargs = self._extract_csv_writer_kwargs()
8693

csvkit/utilities/csvclean.py

+12-60
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#!/usr/bin/env python
22

33
import sys
4-
from os.path import splitext
54

65
import agate
76

@@ -14,75 +13,28 @@ class CSVClean(CSVKitUtility):
1413
override_flags = ['L', 'blanks', 'date-format', 'datetime-format']
1514

1615
def add_arguments(self):
17-
self.argparser.add_argument(
18-
'-n', '--dry-run', dest='dryrun', action='store_true',
19-
help='Do not create output files. Information about what would have been done will be printed to STDERR.')
16+
pass
2017

2118
def main(self):
2219
if self.additional_input_expected():
2320
sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')
2421

2522
reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
2623

27-
if self.args.dryrun:
28-
checker = RowChecker(reader)
24+
checker = RowChecker(reader)
2925

30-
for _row in checker.checked_rows():
31-
pass
26+
output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
27+
output_writer.writerow(checker.column_names)
28+
for row in checker.checked_rows():
29+
output_writer.writerow(row)
3230

33-
if checker.errors:
34-
for e in checker.errors:
35-
self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
36-
else:
37-
self.output_file.write('No errors.\n')
31+
if checker.errors:
32+
error_writer = agate.csv.writer(self.error_file, **self.writer_kwargs)
33+
error_writer.writerow(['line_number', 'msg'] + checker.column_names)
34+
for error in checker.errors:
35+
error_writer.writerow([error.line_number, error.msg] + error.row)
3836

39-
if checker.joins:
40-
self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected '
41-
'internal line breaks.\n' % (checker.rows_joined, checker.joins))
42-
else:
43-
if self.input_file == sys.stdin:
44-
base = 'stdin' # "<stdin>_out.csv" is invalid on Windows
45-
else:
46-
base = splitext(self.input_file.name)[0]
47-
48-
with open(f'{base}_out.csv', 'w') as f:
49-
clean_writer = agate.csv.writer(f, **self.writer_kwargs)
50-
51-
checker = RowChecker(reader)
52-
clean_writer.writerow(checker.column_names)
53-
54-
for row in checker.checked_rows():
55-
clean_writer.writerow(row)
56-
57-
if checker.errors:
58-
error_filename = f'{base}_err.csv'
59-
60-
with open(error_filename, 'w') as f:
61-
error_writer = agate.csv.writer(f, **self.writer_kwargs)
62-
63-
error_header = ['line_number', 'msg']
64-
error_header.extend(checker.column_names)
65-
error_writer.writerow(error_header)
66-
67-
error_count = len(checker.errors)
68-
69-
for e in checker.errors:
70-
error_writer.writerow(self._format_error_row(e))
71-
72-
self.output_file.write('%i error%s logged to %s\n' % (
73-
error_count, '' if error_count == 1 else 's', error_filename))
74-
else:
75-
self.output_file.write('No errors.\n')
76-
77-
if checker.joins:
78-
self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal '
79-
'line breaks.\n' % (checker.rows_joined, checker.joins))
80-
81-
def _format_error_row(self, error):
82-
row = [error.line_number, error.msg]
83-
row.extend(error.row)
84-
85-
return row
37+
sys.exit(1)
8638

8739

8840
def launch_new_instance():

docs/scripts/csvclean.rst

+9-7
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ Note that every csvkit tool does the following:
1818
* changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option
1919
* changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option
2020

21-
Outputs [basename]_out.csv and [basename]_err.csv, the former containing all valid rows and the latter containing all error rows along with line numbers and descriptions:
21+
All valid rows are written to standard output, and all error rows along with line numbers and descriptions are written to standard error. If there are error rows, the exit code will be 1:
2222

2323
.. code-block:: none
2424
2525
usage: csvclean [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b]
2626
[-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H]
27-
[-K SKIP_LINES] [-v] [-l] [--zero] [-V] [-n]
27+
[-K SKIP_LINES] [-v] [-l] [--zero] [-V]
2828
[FILE]
2929
3030
Fix common errors in a CSV file.
@@ -35,8 +35,6 @@ Outputs [basename]_out.csv and [basename]_err.csv, the former containing all val
3535
3636
optional arguments:
3737
-h, --help show this help message and exit
38-
-n, --dry-run Do not create output files. Information about what
39-
would have been done will be printed to STDERR.
4038
4139
See also: :doc:`../common_arguments`.
4240

@@ -47,9 +45,13 @@ Test a file with known bad rows:
4745

4846
.. code-block:: console
4947
50-
$ csvclean -n examples/bad.csv
51-
Line 1: Expected 3 columns, found 4 columns
52-
Line 2: Expected 3 columns, found 2 columns
48+
$ csvclean examples/bad.csv 2> errors.csv
49+
column_a,column_b,column_c
50+
0,mixed types.... uh oh,17
51+
$ cat errors.csv
52+
line_number,msg,column_a,column_b,column_c
53+
1,"Expected 3 columns, found 4 columns",1,27,,I'm too long!
54+
2,"Expected 3 columns, found 2 columns",,I'm too short!
5355
5456
To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use:
5557

tests/test_utilities/test_csvclean.py

+63-70
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import sys
44
from unittest.mock import patch
55

6+
import agate
7+
68
from csvkit.utilities.csvclean import CSVClean, launch_new_instance
79
from tests.utils import CSVKitTestCase, EmptyFileTests
810

@@ -15,98 +17,89 @@ def tearDown(self):
1517
if os.path.isfile(output_file):
1618
os.remove(output_file)
1719

18-
def assertCleaned(self, basename, output_lines, error_lines, additional_args=[]):
19-
args = [f'examples/{basename}.csv'] + additional_args
20+
def assertCleaned(self, args, output_rows, error_rows=[]):
2021
output_file = io.StringIO()
22+
error_file = io.StringIO()
2123

22-
utility = CSVClean(args, output_file)
23-
utility.run()
24+
utility = CSVClean(args, output_file, error_file)
2425

25-
output_file.close()
26+
if error_rows:
27+
with self.assertRaises(SystemExit) as e:
28+
utility.run()
29+
30+
self.assertEqual(e.exception.code, 1)
31+
else:
32+
utility.run()
33+
34+
output_file.seek(0)
35+
error_file.seek(0)
2636

27-
output_file = f'examples/{basename}_out.csv'
28-
error_file = f'examples/{basename}_err.csv'
29-
30-
self.assertEqual(os.path.exists(output_file), bool(output_lines))
31-
self.assertEqual(os.path.exists(error_file), bool(error_lines))
32-
33-
try:
34-
if output_lines:
35-
with open(output_file) as f:
36-
for line in output_lines:
37-
self.assertEqual(next(f), line)
38-
self.assertRaises(StopIteration, next, f)
39-
if error_lines:
40-
with open(error_file) as f:
41-
for line in error_lines:
42-
self.assertEqual(next(f), line)
43-
self.assertRaises(StopIteration, next, f)
44-
finally:
45-
if output_lines:
46-
os.remove(output_file)
47-
if error_lines:
48-
os.remove(error_file)
37+
if output_rows:
38+
reader = agate.csv.reader(output_file)
39+
for row in output_rows:
40+
self.assertEqual(next(reader), row)
41+
self.assertRaises(StopIteration, next, reader)
42+
if error_rows:
43+
reader = agate.csv.reader(error_file)
44+
for row in error_rows:
45+
self.assertEqual(next(reader), row)
46+
self.assertRaises(StopIteration, next, reader)
47+
48+
output_file.close()
49+
error_file.close()
4950

5051
def test_launch_new_instance(self):
51-
with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/bad.csv']):
52+
with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/dummy.csv']):
5253
launch_new_instance()
5354

5455
def test_skip_lines(self):
55-
self.assertCleaned('bad_skip_lines', [
56-
'column_a,column_b,column_c\n',
57-
'0,mixed types.... uh oh,17\n',
56+
self.assertCleaned(['--skip-lines', '3', 'examples/bad_skip_lines.csv'], [
57+
['column_a', 'column_b', 'column_c'],
58+
['0', 'mixed types.... uh oh', '17'],
5859
], [
59-
'line_number,msg,column_a,column_b,column_c\n',
60-
'1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n',
61-
'2,"Expected 3 columns, found 2 columns",,I\'m too short!\n',
62-
], ['--skip-lines', '3'])
60+
['line_number', 'msg', 'column_a', 'column_b', 'column_c'],
61+
['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"],
62+
['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"],
63+
])
6364

6465
def test_simple(self):
65-
self.assertCleaned('bad', [
66-
'column_a,column_b,column_c\n',
67-
'0,mixed types.... uh oh,17\n',
66+
self.assertCleaned(['examples/bad.csv'], [
67+
['column_a', 'column_b', 'column_c'],
68+
['0', 'mixed types.... uh oh', '17'],
6869
], [
69-
'line_number,msg,column_a,column_b,column_c\n',
70-
'1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n',
71-
'2,"Expected 3 columns, found 2 columns",,I\'m too short!\n',
70+
['line_number', 'msg', 'column_a', 'column_b', 'column_c'],
71+
['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"],
72+
['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"],
7273
])
7374

7475
def test_no_header_row(self):
75-
self.assertCleaned('no_header_row', [
76-
'1,2,3\n',
76+
self.assertCleaned(['examples/no_header_row.csv'], [
77+
['1', '2', '3'],
7778
], [])
7879

7980
def test_removes_optional_quote_characters(self):
80-
self.assertCleaned('optional_quote_characters', [
81-
'a,b,c\n',
82-
'1,2,3\n',
83-
], [])
81+
self.assertCleaned(['examples/optional_quote_characters.csv'], [
82+
['a', 'b', 'c'],
83+
['1', '2', '3'],
84+
])
8485

8586
def test_changes_line_endings(self):
86-
self.assertCleaned('mac_newlines', [
87-
'a,b,c\n',
88-
'1,2,3\n',
89-
'"Once upon\n',
90-
'a time",5,6\n',
91-
], [])
87+
self.assertCleaned(['examples/mac_newlines.csv'], [
88+
['a', 'b', 'c'],
89+
['1', '2', '3'],
90+
['Once upon\na time', '5', '6'],
91+
])
9292

9393
def test_changes_character_encoding(self):
94-
self.assertCleaned('test_latin1', [
95-
'a,b,c\n',
96-
'1,2,3\n',
97-
'4,5,©\n',
98-
], [], ['-e', 'latin1'])
94+
self.assertCleaned(['-e', 'latin1', 'examples/test_latin1.csv'], [
95+
['a', 'b', 'c'],
96+
['1', '2', '3'],
97+
['4', '5', u'©'],
98+
])
9999

100100
def test_removes_bom(self):
101-
self.assertCleaned('test_utf8_bom', [
102-
'foo,bar,baz\n',
103-
'1,2,3\n',
104-
'4,5,ʤ\n',
105-
], [], [])
106-
107-
def test_dry_run(self):
108-
output = self.get_output_as_io(['-n', 'examples/bad.csv'])
109-
self.assertFalse(os.path.exists('examples/bad_err.csv'))
110-
self.assertFalse(os.path.exists('examples/bad_out.csv'))
111-
self.assertEqual(next(output)[:6], 'Line 1')
112-
self.assertEqual(next(output)[:6], 'Line 2')
101+
self.assertCleaned(['examples/test_utf8_bom.csv'], [
102+
['foo', 'bar', 'baz'],
103+
['1', '2', '3'],
104+
['4', '5', 'ʤ'],
105+
])

0 commit comments

Comments
 (0)