Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ability to sample seeds #11304

Merged
merged 3 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changes/unreleased/Features-20250212-173743.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Features
body: Allow for sampling of ref'd seeds
time: 2025-02-12T17:37:43.554156-06:00
custom:
Author: QMalcolm
Issue: "11300"
14 changes: 12 additions & 2 deletions core/dbt/context/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,13 @@
get_adapter_package_names,
get_adapter_type_names,
)
from dbt.artifacts.resources import NodeConfig, NodeVersion, RefArgs, SourceConfig
from dbt.artifacts.resources import (
NodeConfig,
NodeVersion,
RefArgs,
SeedConfig,
SourceConfig,
)
from dbt.clients.jinja import (
MacroGenerator,
MacroStack,
Expand Down Expand Up @@ -247,7 +253,11 @@ def resolve_event_time_filter(self, target: ManifestNode) -> Optional[EventTimeF

# Only do event time filtering if the base node has the necessary event time configs
if (
(isinstance(target.config, NodeConfig) or isinstance(target.config, SourceConfig))
(
isinstance(target.config, NodeConfig)
or isinstance(target.config, SourceConfig)
or isinstance(target.config, SeedConfig)
)
and target.config.event_time
and isinstance(self.model, ModelNode)
):
Expand Down
69 changes: 69 additions & 0 deletions tests/functional/sample_mode/test_sample_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,21 @@
select 6 as id, TIMESTAMP '2025-01-06 12:32:00-0' as event_time
"""

input_seed_csv = """id,event_time
1,'2020-01-01 01:25:00-0'
2,'2025-01-02 13:47:00-0'
3,'2025-01-03 01:32:00-0'
"""

seed_properties_yml = """
seeds:
- name: input_seed
config:
event_time: event_time
column_types:
event_time: timestamp
"""

sample_mode_model_sql = """
{{ config(materialized='table', event_time='event_time') }}

Expand All @@ -48,6 +63,12 @@
SELECT * FROM {{ ref("input_model") }}
"""

sample_input_seed_sql = """
{{ config(materialized='table') }}

SELECT * FROM {{ ref("input_seed") }}
"""

sample_microbatch_model_sql = """
{{ config(materialized='incremental', incremental_strategy='microbatch', event_time='event_time', batch_size='day', lookback=3, begin='2024-12-25', unique_key='id')}}

Expand Down Expand Up @@ -368,3 +389,51 @@ def test_incremental_model_sample(
relation_name="sample_incremental_merge",
expected_row_count=expected_rows,
)


class TestSampleSeedRefs(BaseSampleMode):
    """Functional test for sampling a model that selects from a ref'd seed.

    The seed declares an ``event_time`` config through its properties file;
    the model under test is a plain table that does ``SELECT *`` from it.
    """

    @pytest.fixture(scope="class")
    def seeds(self):
        """Provide the seed CSV together with the properties file configuring it."""
        seed_files = {}
        seed_files["input_seed.csv"] = input_seed_csv
        seed_files["properties.yml"] = seed_properties_yml
        return seed_files

    @pytest.fixture(scope="class")
    def models(self):
        """Provide the single model that refs the seed."""
        return {"sample_input_seed.sql": sample_input_seed_sql}

    # Per the cases below, only the combination "sample mode available AND
    # --sample passed" is expected to drop a row; every other combination
    # expects all three seed rows.
    @pytest.mark.parametrize(
        "sample_mode_available,run_sample_mode,expected_row_count",
        [
            (True, True, 2),
            (True, False, 3),
            (False, True, 3),
            (False, False, 3),
        ],
    )
    @freezegun.freeze_time("2025-01-03T02:03:0Z")
    def test_sample_mode(
        self,
        project,
        mocker: MockerFixture,
        sample_mode_available: bool,
        run_sample_mode: bool,
        expected_row_count: int,
    ):
        """Seed, run (optionally with ``--sample``), then check the model's row count."""
        run_args = ["run", "--sample=1 day"] if run_sample_mode else ["run"]

        # The experimental env var is only set for the "available" cases and is
        # patched through the mocker fixture.
        if sample_mode_available:
            mocker.patch.dict(os.environ, {"DBT_EXPERIMENTAL_SAMPLE_MODE": "1"})

        # The seed has to be loaded before the model that refs it is run.
        run_dbt(["seed"])
        run_dbt(run_args)

        self.assert_row_count(
            project=project,
            relation_name="sample_input_seed",
            expected_row_count=expected_row_count,
        )
44 changes: 39 additions & 5 deletions tests/unit/context/test_providers.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import os
from argparse import Namespace
from datetime import datetime
from typing import Optional
from typing import Any, Optional
from unittest import mock

import pytest
import pytz
from pytest_mock import MockerFixture

from dbt.adapters.base import BaseRelation
from dbt.artifacts.resources import NodeConfig, Quoting
from dbt.artifacts.resources import NodeConfig, Quoting, SeedConfig
from dbt.artifacts.resources.types import BatchSize
from dbt.context.providers import (
BaseResolver,
Expand Down Expand Up @@ -46,7 +46,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
assert resolver.resolve_limit == expected_resolve_limit

@pytest.mark.parametrize(
"use_microbatch_batches,materialized,incremental_strategy,sample_mode_available,sample,resolver_model_node,expect_filter",
"use_microbatch_batches,materialized,incremental_strategy,sample_mode_available,sample,resolver_model_node,target_type,expect_filter",
[
# Microbatch model without sample
(
Expand All @@ -56,6 +56,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
True,
None,
True,
NodeConfig,
True,
),
# Microbatch model with sample
Expand All @@ -69,6 +70,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
end=datetime(2025, 1, 1, tzinfo=pytz.UTC),
),
True,
NodeConfig,
True,
),
# Normal model with sample
Expand All @@ -82,6 +84,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
end=datetime(2025, 1, 1, tzinfo=pytz.UTC),
),
True,
NodeConfig,
True,
),
# Incremental merge model with sample
Expand All @@ -95,6 +98,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
end=datetime(2025, 1, 1, tzinfo=pytz.UTC),
),
True,
NodeConfig,
True,
),
# Normal model with sample, but sample mode not available
Expand All @@ -108,6 +112,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
end=datetime(2025, 1, 1, tzinfo=pytz.UTC),
),
True,
NodeConfig,
False,
),
# Sample, but not model node
Expand All @@ -121,6 +126,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
end=datetime(2025, 1, 1, tzinfo=pytz.UTC),
),
False,
NodeConfig,
False,
),
# Microbatch, but not model node
Expand All @@ -131,6 +137,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
False,
None,
False,
NodeConfig,
False,
),
# Microbatch model, but not using batches
Expand All @@ -141,6 +148,7 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
False,
None,
True,
NodeConfig,
False,
),
# Non microbatch model, but supposed to use batches
Expand All @@ -151,10 +159,35 @@ def test_resolve_limit(self, resolver, empty, expected_resolve_limit):
False,
None,
True,
NodeConfig,
False,
),
# Incremental merge
(True, "incremental", "merge", False, None, True, False),
(True, "incremental", "merge", False, None, True, NodeConfig, False),
# Target seed node, with sample
(
False,
"table",
None,
True,
SampleWindow.from_relative_string("2 days"),
True,
SeedConfig,
True,
),
# Target seed node, with sample, but sample mode not available
(
False,
"table",
None,
False,
SampleWindow.from_relative_string("2 days"),
True,
SeedConfig,
False,
),
# Target seed node, without sample, but sample mode available
(False, "table", None, True, None, True, SeedConfig, False),
],
)
def test_resolve_event_time_filter(
Expand All @@ -167,11 +200,12 @@ def test_resolve_event_time_filter(
sample_mode_available: bool,
sample: Optional[SampleWindow],
resolver_model_node: bool,
target_type: Any,
expect_filter: bool,
) -> None:
# Target mocking
target = mock.Mock()
target.config = mock.MagicMock(NodeConfig)
target.config = mock.MagicMock(target_type)
target.config.event_time = "created_at"

# Declare whether sample mode is available
Expand Down
Loading