
Commit

test: add utf8 string operation tests to highlight substr inconsistencies
f4t4nt committed Jan 17, 2025
1 parent 5549d16 commit fbea790
Showing 5 changed files with 1,121 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -55,7 +55,7 @@ check-hidden = true
ignore-words-list = "crate,arithmetics,ser"
# Feel free to un-skip examples, and experimental, you will just need to
# work through many typos (--write-changes and --interactive will help)
-skip = "tests/series/*,target,.git,.venv,venv,data,*.csv,*.csv.*,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb,*.tiktoken,*.sql,tests/table/utf8/*,tests/table/binary/*"
+skip = "tests/series/*,tests/table/utf8/*,tests/table/binary/*,target,.git,.venv,venv,data,*.csv,*.csv.*,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb,*.tiktoken,*.sql"

[tool.maturin]
# "python" tells pyo3 we want to build an extension module (skips linking against libpython.so)
211 changes: 211 additions & 0 deletions tests/table/utf8/test_concat.py
@@ -0,0 +1,211 @@
from __future__ import annotations

import pytest

from daft.expressions import col, lit
from daft.table import MicroPartition


@pytest.mark.parametrize(
"input_a,input_b,expected_result",
[
# Basic ASCII concatenation
(
["Hello", "Test", "", "End"],
[" World", "ing", "Empty", "!"],
["Hello World", "Testing", "Empty", "End!"],
),
# Special UTF-8 sequences
(
[
"☃", # Snowman
"😉", # Winking face
"🌈", # Rainbow
"Hello☃", # String with UTF-8
"Hello\u0000", # String with null
],
[
"😉", # Winking face
"☃", # Snowman
"☃", # Snowman
"World", # ASCII
"\u0000World", # Null and string
],
[
"☃😉", # Snowman + Winking face
"😉☃", # Winking face + Snowman
"🌈☃", # Rainbow + Snowman
"Hello☃World", # String with UTF-8 + ASCII
"Hello\u0000\u0000World", # String with multiple nulls
],
),
# Nulls and empty strings
(
["Hello", None, "", "Test", None, "End", ""],
[" World", "!", None, None, "ing", "", "Empty"],
["Hello World", None, None, None, None, "End", "Empty"],
),
# Mixed length concatenation
(
["a", "ab", "abc", "abcd"],
["1", "12", "123", "1234"],
["a1", "ab12", "abc123", "abcd1234"],
),
# Empty string combinations
(
["", "", "Hello", "World", ""],
["", "Test", "", "", "!"],
["", "Test", "Hello", "World", "!"],
),
# Complex UTF-8 sequences
(
[
"☃", # Snowman
"😉", # Winking face
"🌈", # Rainbow
"☃😉", # Snowman + Winking face
],
[
"😉", # Winking face
"☃", # Snowman
"☃", # Snowman
"🌈", # Rainbow
],
[
"☃😉", # Snowman + Winking face
"😉☃", # Winking face + Snowman
"🌈☃", # Rainbow + Snowman
"☃😉🌈", # Snowman + Winking face + Rainbow
],
),
# Null characters in different positions
(
[
"\u0000abc", # Leading null
"abc\u0000", # Trailing null
"ab\u0000c", # Middle null
"\u0000ab\u0000c\u0000", # Multiple nulls
],
[
"def\u0000", # Trailing null
"\u0000def", # Leading null
"d\u0000ef", # Middle null
"\u0000de\u0000f\u0000", # Multiple nulls
],
[
"\u0000abcdef\u0000", # Nulls at ends
"abc\u0000\u0000def", # Adjacent nulls
"ab\u0000cd\u0000ef", # Separated nulls
"\u0000ab\u0000c\u0000\u0000de\u0000f\u0000", # Many nulls
],
),
],
)
def test_utf8_concat(input_a: list[str | None], input_b: list[str | None], expected_result: list[str | None]) -> None:
table = MicroPartition.from_pydict({"a": input_a, "b": input_b})
result = table.eval_expression_list([col("a").str.concat(col("b"))])
assert result.to_pydict() == {"a": expected_result}


@pytest.mark.parametrize(
"input_data,literal,expected_result",
[
# Basic broadcasting
(
["Hello", "Goodbye", "Test"],
" World!",
["Hello World!", "Goodbye World!", "Test World!"],
),
# Broadcasting with nulls
(
["Hello", None, "Test"],
" World!",
["Hello World!", None, "Test World!"],
),
# Broadcasting with UTF-8 sequences
(
["Hello", "Test", "Goodbye"],
"☃", # Snowman
["Hello☃", "Test☃", "Goodbye☃"],
),
# Broadcasting with null characters
(
["Hello", "Test\u0000", "\u0000World"],
"\u0000",
["Hello\u0000", "Test\u0000\u0000", "\u0000World\u0000"],
),
# Broadcasting with empty strings
(
["", "Test", ""],
"☃",
["☃", "Test☃", "☃"],
),
# Broadcasting with complex UTF-8
(
["Hello", "Test", "Goodbye"],
"☃😉🌈", # Snowman + Winking face + Rainbow
["Hello☃😉🌈", "Test☃😉🌈", "Goodbye☃😉🌈"],
),
# Broadcasting with literal None
(
["Hello", None, "Test", ""],
None,
[None, None, None, None], # Any concat with None should result in None
),
],
)
def test_utf8_concat_broadcast(
input_data: list[str | None], literal: str | None, expected_result: list[str | None]
) -> None:
# Test right-side broadcasting
table = MicroPartition.from_pydict({"a": input_data})
result = table.eval_expression_list([col("a").str.concat(literal)])
assert result.to_pydict() == {"a": expected_result}

# Test left-side broadcasting
table = MicroPartition.from_pydict({"b": input_data})
result = table.eval_expression_list([lit(literal).str.concat(col("b"))])
if literal is None:
# When literal is None, all results should be None
assert result.to_pydict() == {"literal": [None] * len(input_data)}
else:
        assert result.to_pydict() == {
            "literal": [literal + data if data is not None else None for data in input_data]
        }


def test_utf8_concat_edge_cases() -> None:
# Test various edge cases
table = MicroPartition.from_pydict(
{
"a": [
"", # Empty string
"\u0000", # Single null character
"Hello", # Normal string
None, # Null value
"☃", # UTF-8 sequence
"😉", # Another UTF-8 sequence
],
"b": [
"", # Empty + Empty
"\u0000", # Null + Null
"", # Normal + Empty
None, # Null + Null
"😉", # UTF-8 + UTF-8
"☃", # UTF-8 + UTF-8
],
}
)
result = table.eval_expression_list([col("a").str.concat(col("b"))])
assert result.to_pydict() == {
"a": [
"", # Empty + Empty = Empty
"\u0000\u0000", # Null + Null = Two nulls
"Hello", # Normal + Empty = Normal
None, # Null + Null = Null
"☃😉", # Snowman + Winking face
"😉☃", # Winking face + Snowman
]
}
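
For readers skimming the diff, here is a minimal standalone sketch of the API these tests exercise; it uses only calls that already appear above and assumes a local Daft install. It restates the null-propagation rule the tests assert: concatenating with a null value yields null rather than treating the null as an empty string.

# Minimal sketch of the concat behavior asserted above (assumes Daft is installed).
from daft.expressions import col
from daft.table import MicroPartition

mp = MicroPartition.from_pydict({"a": ["Hello", None], "b": [" World", "!"]})
out = mp.eval_expression_list([col("a").str.concat(col("b"))])
# Null propagates: the second row is None rather than "!" or "".
assert out.to_pydict() == {"a": ["Hello World", None]}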
97 changes: 94 additions & 3 deletions tests/table/utf8/test_length.py
@@ -1,10 +1,101 @@
from __future__ import annotations

import pytest

from daft.expressions import col
from daft.table import MicroPartition


-def test_utf8_length():
-    table = MicroPartition.from_pydict({"col": ["foo", None, "barbaz", "quux", "😉test", ""]})
@pytest.mark.parametrize(
"input_data,expected_lengths",
[
# Basic ASCII strings
(
["Hello", "World!", "", "Test"],
[5, 6, 0, 4],
),
# Special UTF-8 sequences
(
[
"☃", # UTF-8 encoded snowman
"😉", # UTF-8 encoded winking face
"🌈", # UTF-8 encoded rainbow
"Hello☃World", # Mixed ASCII and UTF-8
"☃😉🌈", # Multiple UTF-8 characters
"Hello\u0000World", # String with null character
],
[1, 1, 1, 11, 3, 11],
),
# Nulls and empty strings
(
["Hello", None, "", "\u0000", None, "Test", ""],
[5, None, 0, 1, None, 4, 0],
),
# Large strings
(
["x" * 1000, "y" * 10000, "z" * 100000],
[1000, 10000, 100000],
),
# Mixed strings with different sizes
(
[
"a", # Single character
"ab", # Two characters
"abc", # Three characters
"☃", # Single UTF-8 character
"☃☃", # Two UTF-8 characters
"☃☃☃", # Three UTF-8 characters
],
[1, 2, 3, 1, 2, 3],
),
# Strings with repeated patterns
(
[
"\u0000" * 5, # Repeated null characters
"ab" * 5, # Repeated ASCII pattern
"☃" * 5, # Repeated UTF-8 snowman
"😉" * 5, # Repeated UTF-8 winking face
],
[5, 10, 5, 5],
),
# Edge cases with single characters
(
[
"\u0000", # Null character
"\u0001", # Start of heading
"\u001f", # Unit separator
" ", # Space
"\u007f", # Delete
"☃", # Snowman
"😉", # Winking face
],
[1, 1, 1, 1, 1, 1, 1],
),
# Complex UTF-8 sequences
(
[
"☃", # Snowman
"😉", # Winking face
"☃😉", # Snowman + Winking face
"🌈", # Rainbow
"🌈☃", # Rainbow + Snowman
"☃🌈😉", # Snowman + Rainbow + Winking face
],
[1, 1, 2, 1, 2, 3],
),
# Mixed content lengths
(
[
"Hello☃World", # ASCII + UTF-8 + ASCII
"\u0000Hello\u0000World\u0000", # Null-separated
"☃Hello☃World☃", # UTF-8-separated
"Hello😉World", # ASCII + UTF-8 + ASCII
],
[11, 13, 13, 11],
),
],
)
def test_utf8_length(input_data: list[str | None], expected_lengths: list[int | None]) -> None:
table = MicroPartition.from_pydict({"col": input_data})
result = table.eval_expression_list([col("col").str.length()])
-    assert result.to_pydict() == {"col": [3, None, 6, 4, 5, 0]}
+    assert result.to_pydict() == {"col": expected_lengths}
(The remaining two changed files are not shown here.)

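The substr tests referenced in the commit title are not among the files shown above. Purely as an illustration of the kind of probing such a test might do, and not the committed file, a sketch in the same style follows. It assumes Daft exposes a str.substr(start, length) expression; the inputs and offsets are arbitrary, and results are printed rather than asserted, since the inconsistencies the commit title points at are exactly what is under test.

# Illustrative sketch only -- not part of this commit. Assumes an
# Expression.str.substr(start, length) method exists in Daft.
from daft.expressions import col
from daft.table import MicroPartition

mp = MicroPartition.from_pydict({"col": ["hello", "☃😉🌈", "", None]})
out = mp.eval_expression_list([col("col").str.substr(1, 2)])
# Inspect how empty strings, multi-byte UTF-8, and nulls are handled.
print(out.to_pydict())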