Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: add utf8 string operation tests to highlight substr inconsistencies #3699

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ check-hidden = true
ignore-words-list = "crate,arithmetics,ser"
# Feel free to un-skip examples, and experimental, you will just need to
# work through many typos (--write-changes and --interactive will help)
skip = "tests/series/*,target,.git,.venv,venv,data,*.csv,*.csv.*,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb,*.tiktoken,*.sql,tests/table/utf8/*,tests/table/binary/*"
skip = "tests/series/*,tests/table/utf8/*,tests/table/binary/*,target,.git,.venv,venv,data,*.csv,*.csv.*,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb,*.tiktoken,*.sql"

[tool.maturin]
# "python" tells pyo3 we want to build an extension module (skips linking against libpython.so)
Expand Down
211 changes: 211 additions & 0 deletions tests/table/utf8/test_concat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
from __future__ import annotations

import pytest

from daft.expressions import col, lit
from daft.table import MicroPartition


@pytest.mark.parametrize(
    "input_a,input_b,expected_result",
    [
        # Plain ASCII pairs, including empty strings.
        (
            ["Hello", "Test", "", "End"],
            [" World", "ing", "Empty", "!"],
            ["Hello World", "Testing", "Empty", "End!"],
        ),
        # Multi-byte UTF-8 code points and embedded NUL characters.
        (
            [
                "☃",  # Snowman
                "😉",  # Winking face
                "🌈",  # Rainbow
                "Hello☃",  # ASCII with trailing UTF-8
                "Hello\u0000",  # ASCII with trailing NUL
            ],
            [
                "😉",  # Winking face
                "☃",  # Snowman
                "☃",  # Snowman
                "World",  # Plain ASCII
                "\u0000World",  # NUL then ASCII
            ],
            [
                "☃😉",  # Snowman + Winking face
                "😉☃",  # Winking face + Snowman
                "🌈☃",  # Rainbow + Snowman
                "Hello☃World",  # UTF-8 sandwiched between ASCII
                "Hello\u0000\u0000World",  # NULs preserved and adjacent
            ],
        ),
        # Null (missing) values: any null operand yields a null result.
        (
            ["Hello", None, "", "Test", None, "End", ""],
            [" World", "!", None, None, "ing", "", "Empty"],
            ["Hello World", None, None, None, None, "End", "Empty"],
        ),
        # Operands of increasing lengths.
        (
            ["a", "ab", "abc", "abcd"],
            ["1", "12", "123", "1234"],
            ["a1", "ab12", "abc123", "abcd1234"],
        ),
        # Empty strings on either or both sides.
        (
            ["", "", "Hello", "World", ""],
            ["", "Test", "", "", "!"],
            ["", "Test", "Hello", "World", "!"],
        ),
        # Emoji-only operands of varying lengths.
        (
            [
                "☃",  # Snowman
                "😉",  # Winking face
                "🌈",  # Rainbow
                "☃😉",  # Snowman + Winking face
            ],
            [
                "😉",  # Winking face
                "☃",  # Snowman
                "☃",  # Snowman
                "🌈",  # Rainbow
            ],
            [
                "☃😉",  # Snowman + Winking face
                "😉☃",  # Winking face + Snowman
                "🌈☃",  # Rainbow + Snowman
                "☃😉🌈",  # Snowman + Winking face + Rainbow
            ],
        ),
        # NUL characters at every position of both operands.
        (
            [
                "\u0000abc",  # Leading NUL
                "abc\u0000",  # Trailing NUL
                "ab\u0000c",  # Interior NUL
                "\u0000ab\u0000c\u0000",  # Several NULs
            ],
            [
                "def\u0000",  # Trailing NUL
                "\u0000def",  # Leading NUL
                "d\u0000ef",  # Interior NUL
                "\u0000de\u0000f\u0000",  # Several NULs
            ],
            [
                "\u0000abcdef\u0000",  # NULs kept at both ends
                "abc\u0000\u0000def",  # NULs meet at the seam
                "ab\u0000cd\u0000ef",  # NULs stay separated
                "\u0000ab\u0000c\u0000\u0000de\u0000f\u0000",  # All NULs survive
            ],
        ),
    ],
)
def test_utf8_concat(input_a: list[str | None], input_b: list[str | None], expected_result: list[str | None]) -> None:
    """Element-wise str.concat of two utf8 columns matches Python `+`, with null propagation."""
    mp = MicroPartition.from_pydict({"a": input_a, "b": input_b})
    concatenated = mp.eval_expression_list([col("a").str.concat(col("b"))])
    assert concatenated.to_pydict() == {"a": expected_result}


@pytest.mark.parametrize(
    "input_data,literal,expected_result",
    [
        # Basic broadcasting
        (
            ["Hello", "Goodbye", "Test"],
            " World!",
            ["Hello World!", "Goodbye World!", "Test World!"],
        ),
        # Broadcasting with nulls
        (
            ["Hello", None, "Test"],
            " World!",
            ["Hello World!", None, "Test World!"],
        ),
        # Broadcasting with UTF-8 sequences
        (
            ["Hello", "Test", "Goodbye"],
            "☃",  # Snowman
            ["Hello☃", "Test☃", "Goodbye☃"],
        ),
        # Broadcasting with null characters
        (
            ["Hello", "Test\u0000", "\u0000World"],
            "\u0000",
            ["Hello\u0000", "Test\u0000\u0000", "\u0000World\u0000"],
        ),
        # Broadcasting with empty strings
        (
            ["", "Test", ""],
            "☃",
            ["☃", "Test☃", "☃"],
        ),
        # Broadcasting with complex UTF-8
        (
            ["Hello", "Test", "Goodbye"],
            "☃😉🌈",  # Snowman + Winking face + Rainbow
            ["Hello☃😉🌈", "Test☃😉🌈", "Goodbye☃😉🌈"],
        ),
        # Broadcasting with literal None
        (
            ["Hello", None, "Test", ""],
            None,
            [None, None, None, None],  # Any concat with None should result in None
        ),
    ],
)
def test_utf8_concat_broadcast(
    input_data: list[str | None], literal: str | None, expected_result: list[str | None]
) -> None:
    """Broadcasting a scalar against a utf8 column on either side of str.concat.

    Checks both column-op-scalar and scalar-op-column; a None scalar must null
    out every row regardless of the column's values.
    """
    # Test right-side broadcasting: column ++ scalar.
    table = MicroPartition.from_pydict({"a": input_data})
    result = table.eval_expression_list([col("a").str.concat(literal)])
    assert result.to_pydict() == {"a": expected_result}

    # Test left-side broadcasting: scalar ++ column.
    table = MicroPartition.from_pydict({"b": input_data})
    result = table.eval_expression_list([lit(literal).str.concat(col("b"))])
    if literal is None:
        # When the scalar is None, every output row is None.
        assert result.to_pydict() == {"literal": [None] * len(input_data)}
    else:
        # Expected value is plain Python concatenation, with nulls propagated.
        # (Avoid naming the loop variable `lit` -- it would shadow daft's lit().)
        assert result.to_pydict() == {
            "literal": [literal + data if data is not None else None for data in input_data]
        }


def test_utf8_concat_edge_cases() -> None:
    """Spot-check str.concat on empties, embedded NULs, missing values, and emoji."""
    lhs = [
        "",  # Empty string
        "\u0000",  # Single NUL character
        "Hello",  # Ordinary ASCII
        None,  # Missing value
        "☃",  # Snowman
        "😉",  # Winking face
    ]
    rhs = [
        "",  # Empty + Empty
        "\u0000",  # NUL + NUL
        "",  # ASCII + Empty
        None,  # Missing + Missing
        "😉",  # Snowman + Winking face
        "☃",  # Winking face + Snowman
    ]
    expected = [
        "",  # Empty + Empty = Empty
        "\u0000\u0000",  # Both NULs preserved
        "Hello",  # Empty right side is a no-op
        None,  # Missing propagates
        "☃😉",  # Snowman + Winking face
        "😉☃",  # Winking face + Snowman
    ]

    table = MicroPartition.from_pydict({"a": lhs, "b": rhs})
    result = table.eval_expression_list([col("a").str.concat(col("b"))])
    assert result.to_pydict() == {"a": expected}
97 changes: 94 additions & 3 deletions tests/table/utf8/test_length.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,101 @@
from __future__ import annotations

import pytest

from daft.expressions import col
from daft.table import MicroPartition


@pytest.mark.parametrize(
    "input_data,expected_lengths",
    [
        # Plain ASCII strings, including empty.
        (
            ["Hello", "World!", "", "Test"],
            [5, 6, 0, 4],
        ),
        # Multi-byte UTF-8: length is counted in code points, not bytes.
        (
            [
                "☃",  # UTF-8 encoded snowman
                "😉",  # UTF-8 encoded winking face
                "🌈",  # UTF-8 encoded rainbow
                "Hello☃World",  # Mixed ASCII and UTF-8
                "☃😉🌈",  # Multiple UTF-8 characters
                "Hello\u0000World",  # NUL counts as one character
            ],
            [1, 1, 1, 11, 3, 11],
        ),
        # Missing values stay missing; empty strings have length 0.
        (
            ["Hello", None, "", "\u0000", None, "Test", ""],
            [5, None, 0, 1, None, 4, 0],
        ),
        # Large strings.
        (
            ["x" * 1000, "y" * 10000, "z" * 100000],
            [1000, 10000, 100000],
        ),
        # Increasing lengths, ASCII vs UTF-8.
        (
            [
                "a",  # Single character
                "ab",  # Two characters
                "abc",  # Three characters
                "☃",  # Single UTF-8 character
                "☃☃",  # Two UTF-8 characters
                "☃☃☃",  # Three UTF-8 characters
            ],
            [1, 2, 3, 1, 2, 3],
        ),
        # Repeated patterns.
        (
            [
                "\u0000" * 5,  # Repeated NUL characters
                "ab" * 5,  # Repeated ASCII pattern
                "☃" * 5,  # Repeated UTF-8 snowman
                "😉" * 5,  # Repeated UTF-8 winking face
            ],
            [5, 10, 5, 5],
        ),
        # Single characters across the control/ASCII/UTF-8 range.
        (
            [
                "\u0000",  # NUL
                "\u0001",  # Start of heading
                "\u001f",  # Unit separator
                " ",  # Space
                "\u007f",  # Delete
                "☃",  # Snowman
                "😉",  # Winking face
            ],
            [1, 1, 1, 1, 1, 1, 1],
        ),
        # Combinations of multi-byte code points.
        (
            [
                "☃",  # Snowman
                "😉",  # Winking face
                "☃😉",  # Snowman + Winking face
                "🌈",  # Rainbow
                "🌈☃",  # Rainbow + Snowman
                "☃🌈😉",  # Snowman + Rainbow + Winking face
            ],
            [1, 1, 2, 1, 2, 3],
        ),
        # Mixed ASCII / NUL / UTF-8 content.
        (
            [
                "Hello☃World",  # ASCII + UTF-8 + ASCII
                "\u0000Hello\u0000World\u0000",  # NUL-separated
                "☃Hello☃World☃",  # UTF-8-separated
                "Hello😉World",  # ASCII + UTF-8 + ASCII
            ],
            [11, 13, 13, 11],
        ),
    ],
)
def test_utf8_length(input_data: list[str | None], expected_lengths: list[int | None]) -> None:
    """str.length() returns the per-row code-point count, with nulls propagated."""
    mp = MicroPartition.from_pydict({"col": input_data})
    out = mp.eval_expression_list([col("col").str.length()])
    assert out.to_pydict() == {"col": expected_lengths}
Loading
Loading