diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 6c069dbf003e..c6e293d13659 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -131,7 +131,9 @@ def prepare_file_arg( When `encoding` is not `utf8` or `utf8-lossy`, the whole file is first read in Python and decoded using the specified encoding and - returned as a `BytesIO` (for usage with `read_csv`). + returned as a `BytesIO` (for usage with `read_csv`). If encoding + ends with "-lossy", characters that can't be decoded are replaced + with `�`. A `bytes` file is returned as a `BytesIO` if `use_pyarrow=True`. @@ -157,6 +159,11 @@ def managed_file(file: Any) -> Iterator[Any]: encoding in {"utf8", "utf8-lossy"} if encoding else True ) encoding_str = encoding if encoding else "utf8" + encoding_str, encoding_errors = ( + (encoding_str[:-6], "replace") + if encoding_str.endswith("-lossy") + else (encoding_str, "strict") + ) # PyArrow allows directories, so we only check that something is not # a dir if we are not using PyArrow @@ -164,7 +171,7 @@ def managed_file(file: Any) -> Iterator[Any]: if isinstance(file, bytes): if not has_utf8_utf8_lossy_encoding: - file = file.decode(encoding_str).encode("utf8") + file = file.decode(encoding_str, errors=encoding_errors).encode("utf8") return _check_empty( BytesIO(file), context="bytes", raise_if_empty=raise_if_empty ) @@ -180,7 +187,11 @@ def managed_file(file: Any) -> Iterator[Any]: if isinstance(file, BytesIO): if not has_utf8_utf8_lossy_encoding: return _check_empty( - BytesIO(file.read().decode(encoding_str).encode("utf8")), + BytesIO( + file.read() + .decode(encoding_str, errors=encoding_errors) + .encode("utf8") + ), context="BytesIO", read_position=file.tell(), raise_if_empty=raise_if_empty, @@ -197,7 +208,11 @@ def managed_file(file: Any) -> Iterator[Any]: if isinstance(file, Path): if not has_utf8_utf8_lossy_encoding: return _check_empty( - BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")), + 
BytesIO( + file.read_bytes() + .decode(encoding_str, errors=encoding_errors) + .encode("utf8") + ), context=f"Path ({file!r})", raise_if_empty=raise_if_empty, ) @@ -220,13 +235,16 @@ def managed_file(file: Any) -> Iterator[Any]: normalize_filepath(file, check_not_directory=check_not_dir) ) # decode first - with Path(file).open(encoding=encoding_str) as f: + with Path(file).open( + encoding=encoding_str, errors=encoding_errors + ) as f: return _check_empty( BytesIO(f.read().encode("utf8")), context=f"{file!r}", raise_if_empty=raise_if_empty, ) storage_options["encoding"] = encoding + storage_options["errors"] = encoding_errors return fsspec.open(file, **storage_options) if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file): @@ -242,12 +260,13 @@ def managed_file(file: Any) -> Iterator[Any]: ] ) storage_options["encoding"] = encoding + storage_options["errors"] = encoding_errors return fsspec.open_files(file, **storage_options) if isinstance(file, str): file = normalize_filepath(file, check_not_directory=check_not_dir) if not has_utf8_utf8_lossy_encoding: - with Path(file).open(encoding=encoding_str) as f: + with Path(file).open(encoding=encoding_str, errors=encoding_errors) as f: return _check_empty( BytesIO(f.read().encode("utf8")), context=f"{file!r}", diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 0f1d32d668dd..162c1080f73c 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -171,7 +171,7 @@ def read_csv( Stop reading from CSV file after reading `n_rows`. During multi-threaded parsing, an upper bound of `n_rows` rows cannot be guaranteed. - encoding : {'utf8', 'utf8-lossy', ...} + encoding : {'utf8', 'utf8-lossy', 'windows-1252', 'windows-1252-lossy', ...} Lossy means that invalid utf8 values are replaced with `�` characters. 
When using other encodings than `utf8` or `utf8-lossy`, the input is first decoded in memory with diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 1450f2f03a76..dfde0ba7eddf 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -503,6 +503,36 @@ def test_read_csv_encoding(tmp_path: Path) -> None: ) +@pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch +@pytest.mark.write_disk +def test_read_csv_encoding_lossy(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + bts = ( + b"\xc8\xec\xff,\xc2\xee\xe7\xf0\xe0\xf1\xf2,\xc3\xee\xf0\xee\xe4\n" + b"\xc8\xe2\xe0\xed,25,\xcc\xee\xf1\xea\xe2\xe0\n" + # \x98 is not supported in "windows-1251". + b"\xce\xeb\xfc\xe3\xe0,30,\xd1\xe0\xed\xea\xf2-\x98\xcf\xe5\xf2\xe5\xf0\xe1\xf3\xf0\xe3\n" + ) + + file_path = tmp_path / "encoding_lossy.csv" + file_path.write_bytes(bts) + + file_str = str(file_path) + bytesio = io.BytesIO(bts) + bytesio.seek(0) + + for file in [file_path, file_str, bts, bytesio]: + assert_series_equal( + pl.read_csv( + file, # type: ignore[arg-type] + encoding="windows-1251-lossy", + use_pyarrow=False, + ).get_column("Город"), + pl.Series("Город", ["Москва", "Санкт-�Петербург"]), + ) + + @pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch def test_column_rename_and_schema_overrides() -> None: csv = textwrap.dedent(