
Commit

test: add utf8 string operation tests to highlight substr inconsistencies
f4t4nt committed Jan 17, 2025
1 parent 5549d16 commit fbea790
Showing 5 changed files with 1,121 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -55,7 +55,7 @@ check-hidden = true
ignore-words-list = "crate,arithmetics,ser"
# Feel free to un-skip examples, and experimental, you will just need to
# work through many typos (--write-changes and --interactive will help)
-skip = "tests/series/*,target,.git,.venv,venv,data,*.csv,*.csv.*,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb,*.tiktoken,*.sql,tests/table/utf8/*,tests/table/binary/*"
+skip = "tests/series/*,tests/table/utf8/*,tests/table/binary/*,target,.git,.venv,venv,data,*.csv,*.csv.*,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb,*.tiktoken,*.sql"

[tool.maturin]
# "python" tells pyo3 we want to build an extension module (skips linking against libpython.so)
211 changes: 211 additions & 0 deletions tests/table/utf8/test_concat.py
@@ -0,0 +1,211 @@
from __future__ import annotations

import pytest

from daft.expressions import col, lit
from daft.table import MicroPartition


@pytest.mark.parametrize(
"input_a,input_b,expected_result",
[
# Basic ASCII concatenation
(
["Hello", "Test", "", "End"],
[" World", "ing", "Empty", "!"],
["Hello World", "Testing", "Empty", "End!"],
),
# Special UTF-8 sequences
(
[
"☃", # Snowman
"😉", # Winking face
"🌈", # Rainbow
"Hello☃", # String with UTF-8
"Hello\u0000", # String with null
],
[
"😉", # Winking face
"☃", # Snowman
"☃", # Snowman
"World", # ASCII
"\u0000World", # Null and string
],
[
"☃😉", # Snowman + Winking face
"😉☃", # Winking face + Snowman
"🌈☃", # Rainbow + Snowman
"Hello☃World", # String with UTF-8 + ASCII
"Hello\u0000\u0000World", # String with multiple nulls
],
),
# Nulls and empty strings
(
["Hello", None, "", "Test", None, "End", ""],
[" World", "!", None, None, "ing", "", "Empty"],
["Hello World", None, None, None, None, "End", "Empty"],
),
# Mixed length concatenation
(
["a", "ab", "abc", "abcd"],
["1", "12", "123", "1234"],
["a1", "ab12", "abc123", "abcd1234"],
),
# Empty string combinations
(
["", "", "Hello", "World", ""],
["", "Test", "", "", "!"],
["", "Test", "Hello", "World", "!"],
),
# Complex UTF-8 sequences
(
[
"☃", # Snowman
"😉", # Winking face
"🌈", # Rainbow
"☃😉", # Snowman + Winking face
],
[
"😉", # Winking face
"☃", # Snowman
"☃", # Snowman
"🌈", # Rainbow
],
[
"☃😉", # Snowman + Winking face
"😉☃", # Winking face + Snowman
"🌈☃", # Rainbow + Snowman
"☃😉🌈", # Snowman + Winking face + Rainbow
],
),
# Null characters in different positions
(
[
"\u0000abc", # Leading null
"abc\u0000", # Trailing null
"ab\u0000c", # Middle null
"\u0000ab\u0000c\u0000", # Multiple nulls
],
[
"def\u0000", # Trailing null
"\u0000def", # Leading null
"d\u0000ef", # Middle null
"\u0000de\u0000f\u0000", # Multiple nulls
],
[
"\u0000abcdef\u0000", # Nulls at ends
"abc\u0000\u0000def", # Adjacent nulls
"ab\u0000cd\u0000ef", # Separated nulls
"\u0000ab\u0000c\u0000\u0000de\u0000f\u0000", # Many nulls
],
),
],
)
def test_utf8_concat(input_a: list[str | None], input_b: list[str | None], expected_result: list[str | None]) -> None:
table = MicroPartition.from_pydict({"a": input_a, "b": input_b})
result = table.eval_expression_list([col("a").str.concat(col("b"))])
assert result.to_pydict() == {"a": expected_result}


@pytest.mark.parametrize(
"input_data,literal,expected_result",
[
# Basic broadcasting
(
["Hello", "Goodbye", "Test"],
" World!",
["Hello World!", "Goodbye World!", "Test World!"],
),
# Broadcasting with nulls
(
["Hello", None, "Test"],
" World!",
["Hello World!", None, "Test World!"],
),
# Broadcasting with UTF-8 sequences
(
["Hello", "Test", "Goodbye"],
"☃", # Snowman
["Hello☃", "Test☃", "Goodbye☃"],
),
# Broadcasting with null characters
(
["Hello", "Test\u0000", "\u0000World"],
"\u0000",
["Hello\u0000", "Test\u0000\u0000", "\u0000World\u0000"],
),
# Broadcasting with empty strings
(
["", "Test", ""],
"☃",
["☃", "Test☃", "☃"],
),
# Broadcasting with complex UTF-8
(
["Hello", "Test", "Goodbye"],
"☃😉🌈", # Snowman + Winking face + Rainbow
["Hello☃😉🌈", "Test☃😉🌈", "Goodbye☃😉🌈"],
),
# Broadcasting with literal None
(
["Hello", None, "Test", ""],
None,
[None, None, None, None], # Any concat with None should result in None
),
],
)
def test_utf8_concat_broadcast(
input_data: list[str | None], literal: str | None, expected_result: list[str | None]
) -> None:
# Test right-side broadcasting
table = MicroPartition.from_pydict({"a": input_data})
result = table.eval_expression_list([col("a").str.concat(literal)])
assert result.to_pydict() == {"a": expected_result}

# Test left-side broadcasting
table = MicroPartition.from_pydict({"b": input_data})
result = table.eval_expression_list([lit(literal).str.concat(col("b"))])
if literal is None:
# When literal is None, all results should be None
assert result.to_pydict() == {"literal": [None] * len(input_data)}
else:
        assert result.to_pydict() == {
            "literal": [literal + data if data is not None else None for data in input_data]
        }


def test_utf8_concat_edge_cases() -> None:
# Test various edge cases
table = MicroPartition.from_pydict(
{
"a": [
"", # Empty string
"\u0000", # Single null character
"Hello", # Normal string
None, # Null value
"☃", # UTF-8 sequence
"😉", # Another UTF-8 sequence
],
"b": [
"", # Empty + Empty
"\u0000", # Null + Null
"", # Normal + Empty
None, # Null + Null
"😉", # UTF-8 + UTF-8
"☃", # UTF-8 + UTF-8
],
}
)
result = table.eval_expression_list([col("a").str.concat(col("b"))])
assert result.to_pydict() == {
"a": [
"", # Empty + Empty = Empty
"\u0000\u0000", # Null + Null = Two nulls
"Hello", # Normal + Empty = Normal
None, # Null + Null = Null
"☃😉", # Snowman + Winking face
"😉☃", # Winking face + Snowman
]
}
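
For readers skimming the diff, here is a minimal standalone sketch of the API these tests exercise; it uses only calls that already appear above and assumes a local Daft install. It restates the null-propagation rule the tests assert: concatenating with a null value yields null rather than treating the null as an empty string.

# Minimal sketch of the concat behavior asserted above (assumes Daft is installed).
from daft.expressions import col
from daft.table import MicroPartition

mp = MicroPartition.from_pydict({"a": ["Hello", None], "b": [" World", "!"]})
out = mp.eval_expression_list([col("a").str.concat(col("b"))])
# Null propagates: the second row is None rather than "!" or "".
assert out.to_pydict() == {"a": ["Hello World", None]}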
97 changes: 94 additions & 3 deletions tests/table/utf8/test_length.py
@@ -1,10 +1,101 @@
from __future__ import annotations

import pytest

from daft.expressions import col
from daft.table import MicroPartition


-def test_utf8_length():
-    table = MicroPartition.from_pydict({"col": ["foo", None, "barbaz", "quux", "😉test", ""]})
@pytest.mark.parametrize(
"input_data,expected_lengths",
[
# Basic ASCII strings
(
["Hello", "World!", "", "Test"],
[5, 6, 0, 4],
),
# Special UTF-8 sequences
(
[
"☃", # UTF-8 encoded snowman
"😉", # UTF-8 encoded winking face
"🌈", # UTF-8 encoded rainbow
"Hello☃World", # Mixed ASCII and UTF-8
"☃😉🌈", # Multiple UTF-8 characters
"Hello\u0000World", # String with null character
],
[1, 1, 1, 11, 3, 11],
),
# Nulls and empty strings
(
["Hello", None, "", "\u0000", None, "Test", ""],
[5, None, 0, 1, None, 4, 0],
),
# Large strings
(
["x" * 1000, "y" * 10000, "z" * 100000],
[1000, 10000, 100000],
),
# Mixed strings with different sizes
(
[
"a", # Single character
"ab", # Two characters
"abc", # Three characters
"☃", # Single UTF-8 character
"☃☃", # Two UTF-8 characters
"☃☃☃", # Three UTF-8 characters
],
[1, 2, 3, 1, 2, 3],
),
# Strings with repeated patterns
(
[
"\u0000" * 5, # Repeated null characters
"ab" * 5, # Repeated ASCII pattern
"☃" * 5, # Repeated UTF-8 snowman
"😉" * 5, # Repeated UTF-8 winking face
],
[5, 10, 5, 5],
),
# Edge cases with single characters
(
[
"\u0000", # Null character
"\u0001", # Start of heading
"\u001f", # Unit separator
" ", # Space
"\u007f", # Delete
"☃", # Snowman
"😉", # Winking face
],
[1, 1, 1, 1, 1, 1, 1],
),
# Complex UTF-8 sequences
(
[
"☃", # Snowman
"😉", # Winking face
"☃😉", # Snowman + Winking face
"🌈", # Rainbow
"🌈☃", # Rainbow + Snowman
"☃🌈😉", # Snowman + Rainbow + Winking face
],
[1, 1, 2, 1, 2, 3],
),
# Mixed content lengths
(
[
"Hello☃World", # ASCII + UTF-8 + ASCII
"\u0000Hello\u0000World\u0000", # Null-separated
"☃Hello☃World☃", # UTF-8-separated
"Hello😉World", # ASCII + UTF-8 + ASCII
],
[11, 13, 13, 11],
),
],
)
def test_utf8_length(input_data: list[str | None], expected_lengths: list[int | None]) -> None:
table = MicroPartition.from_pydict({"col": input_data})
result = table.eval_expression_list([col("col").str.length()])
-    assert result.to_pydict() == {"col": [3, None, 6, 4, 5, 0]}
+    assert result.to_pydict() == {"col": expected_lengths}
(The remaining two changed files are not shown here.)

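The substr tests referenced in the commit title are not among the files shown above. Purely as an illustration of the kind of probing such a test might do, and not the committed file, a sketch in the same style follows. It assumes Daft exposes a str.substr(start, length) expression; the inputs and offsets are arbitrary, and results are printed rather than asserted, since the inconsistencies the commit title points at are exactly what is under test.

# Illustrative sketch only -- not part of this commit. Assumes an
# Expression.str.substr(start, length) method exists in Daft.
from daft.expressions import col
from daft.table import MicroPartition

mp = MicroPartition.from_pydict({"col": ["hello", "☃😉🌈", "", None]})
out = mp.eval_expression_list([col("col").str.substr(1, 2)])
# Inspect how empty strings, multi-byte UTF-8, and nulls are handled.
print(out.to_pydict())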