Skip to content

Commit 95dc26d

Browse files
committed
fix: csvformat supports --out-quoting 2. --quoting (and --out-quoting) support options from Python 3.12.
1 parent b3f68a3 commit 95dc26d

File tree

8 files changed

+159
-43
lines changed

8 files changed

+159
-43
lines changed

CHANGELOG.rst

+9-3
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
2.0.0 - Unreleased
22
------------------
33

4-
**BACKWARDS-INCOMPATIBLE CHANGES**
4+
**BACKWARDS-INCOMPATIBLE CHANGES:**
55

66
* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
77

8+
Other changes:
9+
10+
* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12 and later.
11+
* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12 and later.
12+
* fix: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option works with 2 (`csv.QUOTE_NONNUMERIC <https://docs.python.org/3/library/csv.html#csv.QUOTE_NONNUMERIC>`__). Use the :code:`--locale` option to set the locale of any formatted numbers.
13+
814
1.5.0 - March 28, 2024
915
----------------------
1016

@@ -21,7 +27,7 @@
2127
* :code:`--sniff-limit`
2228
* :code:`--no-inference`
2329

24-
* feat: :doc:`/scripts/csvpy` removes the ``--linenumbers`` and ``--zero`` output options, which had no effect.
30+
* feat: :doc:`/scripts/csvpy` removes the :code:`--linenumbers` and :code:`--zero` output options, which had no effect.
2531
* feat: :doc:`/scripts/in2csv` adds a :code:`--reset-dimensions` option to `recalculate <https://openpyxl.readthedocs.io/en/stable/optimized.html#worksheet-dimensions>`_ the dimensions of an XLSX file, instead of trusting the file's metadata. csvkit's dependency `agate-excel <https://agate-excel.readthedocs.io/en/latest/>`_ 0.4.0 automatically recalculates the dimensions if the file's metadata expresses dimensions of "A1:A1" (a single cell).
2632
* fix: :doc:`/scripts/csvlook` only reads up to :code:`--max-rows` rows instead of the entire file.
2733
* fix: :doc:`/scripts/csvpy` supports the existing input options:
@@ -61,7 +67,7 @@
6167
1.2.0 - October 4, 2023
6268
-----------------------
6369

64-
* fix: :doc:`/scripts/csvjoin` uses the correct columns when performing a ``--right`` join.
70+
* fix: :doc:`/scripts/csvjoin` uses the correct columns when performing a :code:`--right` join.
6571
* Add SQLAlchemy 2 support.
6672
* Drop Python 3.7 support (end-of-life was June 5, 2023).
6773

csvkit/cli.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
32
import argparse
43
import bz2
54
import csv
@@ -22,6 +21,8 @@
2221
except ImportError:
2322
zstandard = None
2423

24+
QUOTING_CHOICES = sorted(getattr(csv, name) for name in dir(csv) if name.startswith('QUOTE_'))
25+
2526

2627
class LazyFile:
2728
"""
@@ -170,17 +171,17 @@ def _init_common_parser(self):
170171
help='Character used to quote strings in the input CSV file.')
171172
if 'u' not in self.override_flags:
172173
self.argparser.add_argument(
173-
'-u', '--quoting', dest='quoting', type=int, choices=[0, 1, 2, 3],
174-
help='Quoting style used in the input CSV file. 0 = Quote Minimal, 1 = Quote All, '
175-
'2 = Quote Non-numeric, 3 = Quote None.')
174+
'-u', '--quoting', dest='quoting', type=int, choices=QUOTING_CHOICES,
175+
help='Quoting style used in the input CSV file: 0 quote minimal, 1 quote all, '
176+
'2 quote non-numeric, 3 quote none.')
176177
if 'b' not in self.override_flags:
177178
self.argparser.add_argument(
178179
'-b', '--no-doublequote', dest='doublequote', action='store_false',
179180
help='Whether or not double quotes are doubled in the input CSV file.')
180181
if 'p' not in self.override_flags:
181182
self.argparser.add_argument(
182183
'-p', '--escapechar', dest='escapechar',
183-
help='Character used to escape the delimiter if --quoting 3 ("Quote None") is specified and to escape '
184+
help='Character used to escape the delimiter if --quoting 3 ("quote none") is specified and to escape '
184185
'the QUOTECHAR if --no-doublequote is specified.')
185186
if 'z' not in self.override_flags:
186187
self.argparser.add_argument(
@@ -337,12 +338,13 @@ def get_column_types(self):
337338
type_kwargs['null_values'].append(null_value)
338339

339340
text_type = agate.Text(**type_kwargs)
341+
number_type = agate.Number(locale=self.args.locale, **type_kwargs)
340342

341-
if self.args.no_inference:
343+
if getattr(self.args, 'no_inference', None):
342344
types = [text_type]
345+
elif getattr(self.args, 'out_quoting', None) == 2:
346+
types = [number_type, text_type]
343347
else:
344-
number_type = agate.Number(locale=self.args.locale, **type_kwargs)
345-
346348
# See the order in the `agate.TypeTester` class.
347349
types = [
348350
agate.Boolean(**type_kwargs),

csvkit/utilities/csvformat.py

+30-15
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44

55
import agate
66

7-
from csvkit.cli import CSVKitUtility, make_default_headers
7+
from csvkit.cli import QUOTING_CHOICES, CSVKitUtility, make_default_headers
88

99

1010
class CSVFormat(CSVKitUtility):
1111
description = 'Convert a CSV file to a custom output format.'
12-
override_flags = ['L', 'blanks', 'date-format', 'datetime-format']
12+
override_flags = ['blanks', 'date-format', 'datetime-format']
1313

1414
def add_arguments(self):
1515
self.argparser.add_argument(
@@ -29,9 +29,9 @@ def add_arguments(self):
2929
'-Q', '--out-quotechar', dest='out_quotechar',
3030
help='Character used to quote strings in the output file.')
3131
self.argparser.add_argument(
32-
'-U', '--out-quoting', dest='out_quoting', type=int, choices=[0, 1, 2, 3],
33-
help='Quoting style used in the output file. 0 = Quote Minimal, 1 = Quote All, '
34-
'2 = Quote Non-numeric, 3 = Quote None.')
32+
'-U', '--out-quoting', dest='out_quoting', type=int, choices=QUOTING_CHOICES,
33+
help='Quoting style used in the output file: 0 quote minimal, 1 quote all, '
34+
'2 quote non-numeric, 3 quote none.')
3535
self.argparser.add_argument(
3636
'-B', '--out-no-doublequote', dest='out_doublequote', action='store_false',
3737
help='Whether or not double quotes are doubled in the output file.')
@@ -72,18 +72,33 @@ def main(self):
7272
if self.additional_input_expected():
7373
sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')
7474

75-
reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
7675
writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
77-
if self.args.no_header_row:
78-
# Peek at a row to get the number of columns.
79-
_row = next(reader)
80-
headers = make_default_headers(len(_row))
81-
reader = itertools.chain([headers, _row], reader)
8276

83-
if self.args.skip_header:
84-
next(reader)
85-
86-
writer.writerows(reader)
77+
if self.args.out_quoting == 2:
78+
table = agate.Table.from_csv(
79+
self.input_file,
80+
skip_lines=self.args.skip_lines,
81+
column_types=self.get_column_types(),
82+
**self.reader_kwargs,
83+
)
84+
85+
# table.to_csv() has no option to omit the column names.
86+
if not self.args.skip_header:
87+
writer.writerow(table.column_names)
88+
89+
writer.writerows(table.rows)
90+
else:
91+
reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
92+
if self.args.no_header_row:
93+
# Peek at a row to get the number of columns.
94+
_row = next(reader)
95+
headers = make_default_headers(len(_row))
96+
reader = itertools.chain([headers, _row], reader)
97+
98+
if self.args.skip_header:
99+
next(reader)
100+
101+
writer.writerows(reader)
87102

88103

89104
def launch_new_instance():

docs/common_arguments.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ csvkit's tools share a set of common command-line arguments. Not every argument
1313
-q QUOTECHAR, --quotechar QUOTECHAR
1414
Character used to quote strings in the input CSV file.
1515
-u {0,1,2,3}, --quoting {0,1,2,3}
16-
Quoting style used in the input CSV file. 0 = Quote
17-
Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 =
18-
Quote None.
16+
Quoting style used in the input CSV file: 0 quote
17+
minimal, 1 quote all, 2 quote non-numeric, 3 quote
18+
none.
1919
-b, --no-doublequote Whether or not double quotes are doubled in the input
2020
CSV file.
2121
-p ESCAPECHAR, --escapechar ESCAPECHAR
2222
Character used to escape the delimiter if --quoting 3
23-
("Quote None") is specified and to escape the
23+
("quote none") is specified and to escape the
2424
QUOTECHAR if --no-doublequote is specified.
2525
-z FIELD_SIZE_LIMIT, --maxfieldsize FIELD_SIZE_LIMIT
2626
Maximum length of a single field in the input CSV

docs/contributing.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ Currently, the following tools stream:
7070

7171
* :doc:`/scripts/csvclean`
7272
* :doc:`/scripts/csvcut`
73-
* :doc:`/scripts/csvformat`
73+
* :doc:`/scripts/csvformat` unless :code:`--out-quoting 2` is set
7474
* :doc:`/scripts/csvgrep`
7575
* :doc:`/scripts/csvstack`
7676
* :doc:`/scripts/sql2csv`

docs/release.rst

-6
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,6 @@
22
Release process
33
===============
44

5-
.. admonition:: One-time setup
6-
7-
.. code-block:: bash
8-
9-
pip install --upgrade build twine
10-
115
#. All tests pass on continuous integration
126
#. The changelog is up-to-date and dated
137
#. If new options are added, regenerate the usage information in the documentation with, for example:

docs/scripts/csvformat.rst

+6-6
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ Convert a CSV file to a custom output format.:
1010
.. code-block:: none
1111
1212
usage: csvformat [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b]
13-
[-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H]
14-
[-K SKIP_LINES] [-v] [-l] [--zero] [-V] [-E]
15-
[-D OUT_DELIMITER] [-T] [-A] [-Q OUT_QUOTECHAR]
13+
[-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING]
14+
[-L LOCALE] [-S] [-H] [-K SKIP_LINES] [-v] [-l] [--zero] [-V]
15+
[-E] [-D OUT_DELIMITER] [-T] [-A] [-Q OUT_QUOTECHAR]
1616
[-U {0,1,2,3}] [-B] [-P OUT_ESCAPECHAR]
1717
[-M OUT_LINETERMINATOR]
1818
[FILE]
@@ -36,9 +36,9 @@ Convert a CSV file to a custom output format.:
3636
-Q OUT_QUOTECHAR, --out-quotechar OUT_QUOTECHAR
3737
Character used to quote strings in the output file.
3838
-U {0,1,2,3}, --out-quoting {0,1,2,3}
39-
Quoting style used in the output file. 0 = Quote
40-
Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 =
41-
Quote None.
39+
Quoting style used in the output file: 0 quote
40+
minimal, 1 quote all, 2 quote non-numeric, 3 quote
41+
none.
4242
-B, --out-no-doublequote
4343
Whether or not double quotes are doubled in the output
4444
CSV file.

tests/test_utilities/test_csvformat.py

+99
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,102 @@ def test_lineterminator(self):
9595
self.assertLines(['-M', 'XYZ', 'examples/dummy.csv'], [
9696
'a,b,cXYZ1,2,3XYZ',
9797
], newline_at_eof=False)
98+
99+
100+
class TestCSVFormatQuoteNonNumeric(CSVKitTestCase, EmptyFileTests):
101+
Utility = CSVFormat
102+
103+
# New test compared to TestCSVFormat.
104+
def test_locale(self):
105+
self.assertLines(['-U', '2', '--locale', 'de_DE', 'examples/test_locale.csv'], [
106+
'"a","b","c"',
107+
'1.7,200000000,""',
108+
])
109+
110+
111+
def test_launch_new_instance(self):
112+
with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/dummy.csv']):
113+
launch_new_instance()
114+
115+
def test_skip_lines(self):
116+
self.assertLines(['-U', '2', '--skip-lines', '3', '-D', '|', 'examples/test_skip_lines.csv'], [
117+
'"a"|"b"|"c"',
118+
'1|2|3',
119+
])
120+
121+
def test_skip_header(self):
122+
self.assertLines(['-U', '2', '--skip-header', 'examples/dummy.csv'], [
123+
'1,2,3',
124+
])
125+
126+
def test_skip_header_no_header_row(self):
127+
self.assertLines(['-U', '2', '--no-header-row', '--skip-header', 'examples/no_header_row.csv'], [
128+
'1,2,3',
129+
])
130+
131+
def test_no_header_row(self):
132+
self.assertLines(['-U', '2', '--no-header-row', 'examples/no_header_row.csv'], [
133+
'"a","b","c"',
134+
'1,2,3',
135+
])
136+
137+
def test_linenumbers(self):
138+
self.assertLines(['-U', '2', '--linenumbers', 'examples/dummy.csv'], [
139+
'"line_number","a","b","c"',
140+
'1,1,2,3',
141+
])
142+
143+
def test_delimiter(self):
144+
self.assertLines(['-U', '2', '-D', '|', 'examples/dummy.csv'], [
145+
'"a"|"b"|"c"',
146+
'1|2|3',
147+
])
148+
149+
def test_tabs(self):
150+
self.assertLines(['-U', '2', '-T', 'examples/dummy.csv'], [
151+
'"a"\t"b"\t"c"',
152+
'1\t2\t3',
153+
])
154+
155+
def test_asv(self):
156+
self.assertLines(['-U', '2', '-A', 'examples/dummy.csv'], [
157+
'"a"\x1f"b"\x1f"c"\x1e1\x1f2\x1f3\x1e',
158+
], newline_at_eof=False)
159+
160+
def test_quotechar(self):
161+
input_file = io.BytesIO(b'a,b,c\n1*2,3,4\n')
162+
163+
with stdin_as_string(input_file):
164+
self.assertLines(['-U', '2', '-Q', '*'], [
165+
'*a*,*b*,*c*',
166+
'*1**2*,3,4',
167+
])
168+
169+
input_file.close()
170+
171+
def test_doublequote(self):
172+
input_file = io.BytesIO(b'a\n"a ""quoted"" string"')
173+
174+
with stdin_as_string(input_file):
175+
self.assertLines(['-U', '2', '-P', '#', '-B'], [
176+
'"a"',
177+
'"a #"quoted#" string"',
178+
])
179+
180+
input_file.close()
181+
182+
def test_escapechar(self):
183+
input_file = io.BytesIO(b'a,b,c\n1"2,3,4\n')
184+
185+
with stdin_as_string(input_file):
186+
self.assertLines(['-U', '2', '-P', '#', '-U', '3'], [
187+
'a,b,c',
188+
'1#"2,3,4',
189+
])
190+
191+
input_file.close()
192+
193+
def test_lineterminator(self):
194+
self.assertLines(['-U', '2', '-M', 'XYZ', 'examples/dummy.csv'], [
195+
'"a","b","c"XYZ1,2,3XYZ',
196+
], newline_at_eof=False)

0 commit comments

Comments
 (0)