Skip to content

Commit

Permalink
feat: Add lossy decoding to read_csv for non-utf8 encodings (#21433)
Browse files Browse the repository at this point in the history
  • Loading branch information
ghuls authored Mar 2, 2025
1 parent 7332717 commit 2ae7287
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 7 deletions.
31 changes: 25 additions & 6 deletions py-polars/polars/io/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ def prepare_file_arg(
When `encoding` is not `utf8` or `utf8-lossy`, the whole file is
first read in Python and decoded using the specified encoding and
returned as a `BytesIO` (for usage with `read_csv`).
returned as a `BytesIO` (for usage with `read_csv`). If encoding
ends with "-lossy", characters that can't be decoded are replaced
with `�`.
A `bytes` file is returned as a `BytesIO` if `use_pyarrow=True`.
Expand All @@ -157,14 +159,19 @@ def managed_file(file: Any) -> Iterator[Any]:
encoding in {"utf8", "utf8-lossy"} if encoding else True
)
encoding_str = encoding if encoding else "utf8"
encoding_str, encoding_errors = (
(encoding_str[:-6], "replace")
if encoding_str.endswith("-lossy")
else (encoding_str, "strict")
)

# PyArrow allows directories, so we only check that something is not
# a dir if we are not using PyArrow
check_not_dir = not use_pyarrow

if isinstance(file, bytes):
if not has_utf8_utf8_lossy_encoding:
file = file.decode(encoding_str).encode("utf8")
file = file.decode(encoding_str, errors=encoding_errors).encode("utf8")
return _check_empty(
BytesIO(file), context="bytes", raise_if_empty=raise_if_empty
)
Expand All @@ -180,7 +187,11 @@ def managed_file(file: Any) -> Iterator[Any]:
if isinstance(file, BytesIO):
if not has_utf8_utf8_lossy_encoding:
return _check_empty(
BytesIO(file.read().decode(encoding_str).encode("utf8")),
BytesIO(
file.read()
.decode(encoding_str, errors=encoding_errors)
.encode("utf8")
),
context="BytesIO",
read_position=file.tell(),
raise_if_empty=raise_if_empty,
Expand All @@ -197,7 +208,11 @@ def managed_file(file: Any) -> Iterator[Any]:
if isinstance(file, Path):
if not has_utf8_utf8_lossy_encoding:
return _check_empty(
BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")),
BytesIO(
file.read_bytes()
.decode(encoding_str, errors=encoding_errors)
.encode("utf8")
),
context=f"Path ({file!r})",
raise_if_empty=raise_if_empty,
)
Expand All @@ -220,13 +235,16 @@ def managed_file(file: Any) -> Iterator[Any]:
normalize_filepath(file, check_not_directory=check_not_dir)
)
# decode first
with Path(file).open(encoding=encoding_str) as f:
with Path(file).open(
encoding=encoding_str, errors=encoding_errors
) as f:
return _check_empty(
BytesIO(f.read().encode("utf8")),
context=f"{file!r}",
raise_if_empty=raise_if_empty,
)
storage_options["encoding"] = encoding
storage_options["errors"] = encoding_errors
return fsspec.open(file, **storage_options)

if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file):
Expand All @@ -242,12 +260,13 @@ def managed_file(file: Any) -> Iterator[Any]:
]
)
storage_options["encoding"] = encoding
storage_options["errors"] = encoding_errors
return fsspec.open_files(file, **storage_options)

if isinstance(file, str):
file = normalize_filepath(file, check_not_directory=check_not_dir)
if not has_utf8_utf8_lossy_encoding:
with Path(file).open(encoding=encoding_str) as f:
with Path(file).open(encoding=encoding_str, errors=encoding_errors) as f:
return _check_empty(
BytesIO(f.read().encode("utf8")),
context=f"{file!r}",
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/io/csv/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def read_csv(
Stop reading from CSV file after reading `n_rows`.
During multi-threaded parsing, an upper bound of `n_rows`
rows cannot be guaranteed.
encoding : {'utf8', 'utf8-lossy', ...}
encoding : {'utf8', 'utf8-lossy', 'windows-1252', 'windows-1252-lossy', ...}
Lossy means that invalid utf8 values are replaced with `�`
characters. When using other encodings than `utf8` or
`utf8-lossy`, the input is first decoded in memory with
Expand Down
30 changes: 30 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,36 @@ def test_read_csv_encoding(tmp_path: Path) -> None:
)


@pytest.mark.may_fail_auto_streaming  # read->scan_csv dispatch
@pytest.mark.write_disk
def test_read_csv_encoding_lossy(tmp_path: Path) -> None:
    """Reading with a ``*-lossy`` encoding replaces undecodable bytes with U+FFFD.

    The same windows-1251 payload is fed through every accepted source kind
    (Path, str path, raw bytes, BytesIO) and must decode identically.
    """
    tmp_path.mkdir(exist_ok=True)

    # CSV payload encoded as windows-1251; the \x98 byte has no mapping there.
    raw = (
        b"\xc8\xec\xff,\xc2\xee\xe7\xf0\xe0\xf1\xf2,\xc3\xee\xf0\xee\xe4\n"
        b"\xc8\xe2\xe0\xed,25,\xcc\xee\xf1\xea\xe2\xe0\n"
        # \x98 is not supported in "windows-1251".
        b"\xce\xeb\xfc\xe3\xe0,30,\xd1\xe0\xed\xea\xf2-\x98\xcf\xe5\xf2\xe5\xf0\xe1\xf3\xf0\xe3\n"
    )

    csv_path = tmp_path / "encoding_lossy.csv"
    csv_path.write_bytes(raw)

    buffer = io.BytesIO(raw)
    buffer.seek(0)

    expected = pl.Series("Город", ["Москва", "Санкт-�Петербург"])

    for source in (csv_path, str(csv_path), raw, buffer):
        result = pl.read_csv(
            source,  # type: ignore[arg-type]
            encoding="windows-1251-lossy",
            use_pyarrow=False,
        ).get_column("Город")
        assert_series_equal(result, expected)


@pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch
def test_column_rename_and_schema_overrides() -> None:
csv = textwrap.dedent(
Expand Down

0 comments on commit 2ae7287

Please sign in to comment.