Skip to content

Commit

Permalink
feat: Add lossy decoding to read_csv for non-utf8 encodings (#21433)
Browse files Browse the repository at this point in the history
  • Loading branch information
ghuls authored Mar 2, 2025
1 parent 7332717 commit 2ae7287
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 7 deletions.
31 changes: 25 additions & 6 deletions py-polars/polars/io/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ def prepare_file_arg(
When `encoding` is not `utf8` or `utf8-lossy`, the whole file is
first read in Python and decoded using the specified encoding and
returned as a `BytesIO` (for usage with `read_csv`).
returned as a `BytesIO` (for usage with `read_csv`). If encoding
ends with "-lossy", characters that can't be decoded are replaced
with `�`.
A `bytes` file is returned as a `BytesIO` if `use_pyarrow=True`.
Expand All @@ -157,14 +159,19 @@ def managed_file(file: Any) -> Iterator[Any]:
encoding in {"utf8", "utf8-lossy"} if encoding else True
)
encoding_str = encoding if encoding else "utf8"
encoding_str, encoding_errors = (
(encoding_str[:-6], "replace")
if encoding_str.endswith("-lossy")
else (encoding_str, "strict")
)

# PyArrow allows directories, so we only check that something is not
# a dir if we are not using PyArrow
check_not_dir = not use_pyarrow

if isinstance(file, bytes):
if not has_utf8_utf8_lossy_encoding:
file = file.decode(encoding_str).encode("utf8")
file = file.decode(encoding_str, errors=encoding_errors).encode("utf8")
return _check_empty(
BytesIO(file), context="bytes", raise_if_empty=raise_if_empty
)
Expand All @@ -180,7 +187,11 @@ def managed_file(file: Any) -> Iterator[Any]:
if isinstance(file, BytesIO):
if not has_utf8_utf8_lossy_encoding:
return _check_empty(
BytesIO(file.read().decode(encoding_str).encode("utf8")),
BytesIO(
file.read()
.decode(encoding_str, errors=encoding_errors)
.encode("utf8")
),
context="BytesIO",
read_position=file.tell(),
raise_if_empty=raise_if_empty,
Expand All @@ -197,7 +208,11 @@ def managed_file(file: Any) -> Iterator[Any]:
if isinstance(file, Path):
if not has_utf8_utf8_lossy_encoding:
return _check_empty(
BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")),
BytesIO(
file.read_bytes()
.decode(encoding_str, errors=encoding_errors)
.encode("utf8")
),
context=f"Path ({file!r})",
raise_if_empty=raise_if_empty,
)
Expand All @@ -220,13 +235,16 @@ def managed_file(file: Any) -> Iterator[Any]:
normalize_filepath(file, check_not_directory=check_not_dir)
)
# decode first
with Path(file).open(encoding=encoding_str) as f:
with Path(file).open(
encoding=encoding_str, errors=encoding_errors
) as f:
return _check_empty(
BytesIO(f.read().encode("utf8")),
context=f"{file!r}",
raise_if_empty=raise_if_empty,
)
storage_options["encoding"] = encoding
storage_options["errors"] = encoding_errors
return fsspec.open(file, **storage_options)

if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file):
Expand All @@ -242,12 +260,13 @@ def managed_file(file: Any) -> Iterator[Any]:
]
)
storage_options["encoding"] = encoding
storage_options["errors"] = encoding_errors
return fsspec.open_files(file, **storage_options)

if isinstance(file, str):
file = normalize_filepath(file, check_not_directory=check_not_dir)
if not has_utf8_utf8_lossy_encoding:
with Path(file).open(encoding=encoding_str) as f:
with Path(file).open(encoding=encoding_str, errors=encoding_errors) as f:
return _check_empty(
BytesIO(f.read().encode("utf8")),
context=f"{file!r}",
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/io/csv/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def read_csv(
Stop reading from CSV file after reading `n_rows`.
During multi-threaded parsing, an upper bound of `n_rows`
rows cannot be guaranteed.
encoding : {'utf8', 'utf8-lossy', ...}
encoding : {'utf8', 'utf8-lossy', 'windows-1252', 'windows-1252-lossy', ...}
Lossy means that invalid utf8 values are replaced with `�`
characters. When using other encodings than `utf8` or
`utf8-lossy`, the input is first decoded in memory with
Expand Down
30 changes: 30 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,36 @@ def test_read_csv_encoding(tmp_path: Path) -> None:
)


@pytest.mark.may_fail_auto_streaming  # read->scan_csv dispatch
@pytest.mark.write_disk
def test_read_csv_encoding_lossy(tmp_path: Path) -> None:
    """Reading with a ``*-lossy`` encoding replaces undecodable bytes with U+FFFD.

    The same windows-1251 payload is fed through every accepted source kind
    (Path, str path, raw bytes, BytesIO) and must decode identically.
    """
    tmp_path.mkdir(exist_ok=True)

    # CSV payload encoded as windows-1251; the \x98 byte has no mapping there.
    raw = (
        b"\xc8\xec\xff,\xc2\xee\xe7\xf0\xe0\xf1\xf2,\xc3\xee\xf0\xee\xe4\n"
        b"\xc8\xe2\xe0\xed,25,\xcc\xee\xf1\xea\xe2\xe0\n"
        # \x98 is not supported in "windows-1251".
        b"\xce\xeb\xfc\xe3\xe0,30,\xd1\xe0\xed\xea\xf2-\x98\xcf\xe5\xf2\xe5\xf0\xe1\xf3\xf0\xe3\n"
    )

    csv_path = tmp_path / "encoding_lossy.csv"
    csv_path.write_bytes(raw)

    buffer = io.BytesIO(raw)
    buffer.seek(0)

    expected = pl.Series("Город", ["Москва", "Санкт-�Петербург"])

    for source in (csv_path, str(csv_path), raw, buffer):
        result = pl.read_csv(
            source,  # type: ignore[arg-type]
            encoding="windows-1251-lossy",
            use_pyarrow=False,
        ).get_column("Город")
        assert_series_equal(result, expected)


@pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch
def test_column_rename_and_schema_overrides() -> None:
csv = textwrap.dedent(
Expand Down

0 comments on commit 2ae7287

Please sign in to comment.