Commit

Merge branch 'master' into providers-page

jwlee64 committed Feb 28, 2025
2 parents 471b7b1 + 1d702d4 commit 60ddf55
Showing 74 changed files with 3,309 additions and 1,410 deletions.
9 changes: 6 additions & 3 deletions .github/CODEOWNERS
@@ -1,5 +1,8 @@
* @wandb/weave-team
/docs/ @wandb/docs-team @wandb/weave-team
weave-js/src/common @wandb/fe-infra-reviewers
weave-js/src/components @wandb/fe-infra-reviewers @wandb/weave-team
weave-js/src/assets @wandb/fe-infra-reviewers @wandb/weave-team
/weave-js/src/common @wandb/fe-infra-reviewers
/weave-js/src/components @wandb/fe-infra-reviewers @wandb/weave-team
/weave-js/src/assets @wandb/fe-infra-reviewers @wandb/weave-team
/weave-js/src/components/Panel2 @wandb/query-engine-reviewers @wandb/weave-team
/weave-js/src/core @wandb/query-engine-reviewers @wandb/weave-team
/weave_query/ @wandb/query-engine-reviewers @wandb/weave-team
6 changes: 6 additions & 0 deletions .github/workflows/test.yaml
@@ -275,6 +275,10 @@ jobs:
          --health-interval=5s
          --health-timeout=3s
          --health-start-period=10s
      azurite:
        image: mcr.microsoft.com/azure-storage/azurite
        ports:
          - "10000:10000"
      weave_clickhouse:
        image: clickhouse/clickhouse-server
        env:
@@ -307,6 +311,7 @@ jobs:
          WB_SERVER_HOST: http://wandbservice
          WF_CLICKHOUSE_HOST: localhost
          WEAVE_SERVER_DISABLE_ECOSYSTEM: 1
          DD_TRACE_ENABLED: false
        run: |
          nox -e "tests-${{ matrix.python-version-major }}.${{ matrix.python-version-minor }}(shard='${{ matrix.nox-shard }}')" -- \
            -m "weave_client and not skip_clickhouse_client" \
@@ -324,6 +329,7 @@ jobs:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          DD_TRACE_ENABLED: false
        run: |
          nox -e "tests-${{ matrix.python-version-major }}.${{ matrix.python-version-minor }}(shard='${{ matrix.nox-shard }}')"
  trace-tests-matrix-check: # This job does nothing and is only used for the branch protection
86 changes: 0 additions & 86 deletions docs/docs/guides/core-types/models.md
@@ -76,92 +76,6 @@ A `Model` is a combination of data (which can include configuration, trained mod
model.predict('world')
```

## Pairwise evaluation of models

When [scoring](../evaluation/scorers.md) models in a Weave [evaluation](../core-types/evaluations.md), absolute value metrics (e.g. `9/10` for Model A and `8/10` for Model B) are typically harder to assign than relative ones (e.g. Model A performs better than Model B). _Pairwise evaluation_ allows you to compare the outputs of two models by ranking them relative to each other. This approach is particularly useful when you want to determine which model performs better for subjective tasks such as text generation, summarization, or question answering. With pairwise evaluation, you can obtain a relative preference ranking that reveals which model is best for specific inputs.

The following code sample demonstrates how to implement a pairwise evaluation in Weave by creating a [class-based scorer](../evaluation/scorers.md#class-based-scorers) called `PreferenceScorer`. The `PreferenceScorer` compares two models, `ModelA` and `ModelB`, and returns a relative score of the model outputs based on explicit hints in the input text.

```python
from weave import Model, Evaluation, Scorer, Dataset
from weave.flow.model import ApplyModelError, apply_model_async

class ModelA(Model):
    @weave.op
    def predict(self, input_text: str):
        if "Prefer model A" in input_text:
            return {"response": "This is a great answer from Model A"}
        return {"response": "Meh, whatever"}

class ModelB(Model):
    @weave.op
    def predict(self, input_text: str):
        if "Prefer model B" in input_text:
            return {"response": "This is a thoughtful answer from Model B"}
        return {"response": "I don't know"}

class PreferenceScorer(Scorer):
    @weave.op
    async def _get_other_model_output(self, example: dict) -> Any:
        """Get output from the other model for comparison.
        Args:
            example: The input example data to run through the other model
        Returns:
            The output from the other model
        """

        other_model_result = await apply_model_async(
            self.other_model,
            example,
            None,
        )

        if isinstance(other_model_result, ApplyModelError):
            return None

        return other_model_result.model_output

    @weave.op
    async def score(self, output: dict, input_text: str) -> dict:
        """Compare the output of the primary model with the other model.
        Args:
            output (dict): The output from the primary model.
            other_output (dict): The output from the other model being compared.
            inputs (str): The input text used to generate the outputs.
        Returns:
            dict: A flat dictionary containing the comparison result and reason.
        """
        other_output = await self._get_other_model_output(
            {"input_text": inputs}
        )
        if other_output is None:
            return {"primary_is_better": False, "reason": "Other model failed"}

        if "Prefer model A" in input_text:
            primary_is_better = True
            reason = "Model A gave a great answer"
        else:
            primary_is_better = False
            reason = "Model B is preferred for this type of question"

        return {"primary_is_better": primary_is_better, "reason": reason}

dataset = Dataset(
    rows=[
        {"input_text": "Prefer model A: Question 1"},  # Model A wins
        {"input_text": "Prefer model A: Question 2"},  # Model A wins
        {"input_text": "Prefer model B: Question 3"},  # Model B wins
        {"input_text": "Prefer model B: Question 4"},  # Model B wins
    ]
)

model_a = ModelA()
model_b = ModelB()
pref_scorer = PreferenceScorer(other_model=model_b)
evaluation = Evaluation(dataset=dataset, scorers=[pref_scorer])
evaluation.evaluate(model_a)
```
</TabItem>
<TabItem value="typescript" label="TypeScript">
```plaintext
89 changes: 89 additions & 0 deletions docs/docs/guides/tracking/faqs.md
@@ -51,3 +51,92 @@ When your program is exiting it may appear to pause while any remaining enqueued
## How is Weave data ingestion calculated?

We define ingested bytes as bytes that we receive, process, and store on your behalf. This includes trace metadata, LLM inputs/outputs, and any other information you explicitly log to Weave, but does not include communication overhead (e.g., HTTP headers) or any other data that is not placed in long-term storage. We count bytes as "ingested" only once at the time they are received and stored.
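
If you want a rough, client-side feel for how much a given payload contributes, you can measure its serialized size before logging it. The snippet below is only an approximation under the assumption of a JSON-serializable payload and UTF-8 encoding; the authoritative count is what the server receives and stores, which also includes trace metadata.

```python
import json

def approx_payload_bytes(payload: dict) -> int:
    """Rough client-side estimate of a payload's serialized size in bytes.

    This is an approximation only: actual ingested bytes are counted when the
    data is received and stored, and also include trace metadata.
    """
    return len(json.dumps(payload).encode("utf-8"))

# Example: estimate the contribution of one logged input/output pair.
print(approx_payload_bytes({"prompt": "Hello", "response": "Hi there!"}))
```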

## What is pairwise evaluation and how do I do it?

When [scoring](../evaluation/scorers.md) models in a Weave [evaluation](../core-types/evaluations.md), absolute value metrics (e.g. `9/10` for Model A and `8/10` for Model B) are typically harder to assign than relative ones (e.g. Model A performs better than Model B). _Pairwise evaluation_ allows you to compare the outputs of two models by ranking them relative to each other. This approach is particularly useful when you want to determine which model performs better for subjective tasks such as text generation, summarization, or question answering. With pairwise evaluation, you can obtain a relative preference ranking that reveals which model is best for specific inputs.

:::important
This approach is a workaround and may change in future releases. We are actively working on a more robust API to support pairwise evaluations. Stay tuned for updates!
:::

The following code sample demonstrates how to implement a pairwise evaluation in Weave by creating a [class-based scorer](../evaluation/scorers.md#class-based-scorers) called `PreferenceScorer`. The `PreferenceScorer` compares two models, `ModelA` and `ModelB`, and returns a relative score of the model outputs based on explicit hints in the input text.

```python
import asyncio
from typing import Any

import weave
from weave import Model, Evaluation, Scorer, Dataset
from weave.flow.model import ApplyModelError, apply_model_async

class ModelA(Model):
    @weave.op
    def predict(self, input_text: str):
        if "Prefer model A" in input_text:
            return {"response": "This is a great answer from Model A"}
        return {"response": "Meh, whatever"}

class ModelB(Model):
    @weave.op
    def predict(self, input_text: str):
        if "Prefer model B" in input_text:
            return {"response": "This is a thoughtful answer from Model B"}
        return {"response": "I don't know"}

class PreferenceScorer(Scorer):
    # The model to compare against, declared as a field so it can be passed
    # to the scorer's constructor.
    other_model: Model

    @weave.op
    async def _get_other_model_output(self, example: dict) -> Any:
        """Get output from the other model for comparison.
        Args:
            example: The input example data to run through the other model
        Returns:
            The output from the other model
        """

        other_model_result = await apply_model_async(
            self.other_model,
            example,
            None,
        )

        if isinstance(other_model_result, ApplyModelError):
            return None

        return other_model_result.model_output

    @weave.op
    async def score(self, output: dict, input_text: str) -> dict:
        """Compare the output of the primary model with the other model.
        Args:
            output (dict): The output from the primary model.
            input_text (str): The input text used to generate the outputs.
        Returns:
            dict: A flat dictionary containing the comparison result and reason.
        """
        other_output = await self._get_other_model_output(
            {"input_text": input_text}
        )
        if other_output is None:
            return {"primary_is_better": False, "reason": "Other model failed"}

        if "Prefer model A" in input_text:
            primary_is_better = True
            reason = "Model A gave a great answer"
        else:
            primary_is_better = False
            reason = "Model B is preferred for this type of question"

        return {"primary_is_better": primary_is_better, "reason": reason}

dataset = Dataset(
    rows=[
        {"input_text": "Prefer model A: Question 1"},  # Model A wins
        {"input_text": "Prefer model A: Question 2"},  # Model A wins
        {"input_text": "Prefer model B: Question 3"},  # Model B wins
        {"input_text": "Prefer model B: Question 4"},  # Model B wins
    ]
)

model_a = ModelA()
model_b = ModelB()
pref_scorer = PreferenceScorer(other_model=model_b)
evaluation = Evaluation(dataset=dataset, scorers=[pref_scorer])
asyncio.run(evaluation.evaluate(model_a))
```
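
`Evaluation.evaluate` returns a summary of the scorer results, so you can capture it if you want to inspect the aggregated preference counts programmatically. This is a small sketch; the exact keys in the summary depend on the scorer and the Weave version.

```python
# Capture the aggregated results instead of discarding them.
# The exact structure of the summary depends on the scorer and Weave version.
summary = asyncio.run(evaluation.evaluate(model_a))
print(summary)
```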
20 changes: 1 addition & 19 deletions docs/docs/guides/tracking/tracing.mdx
@@ -592,7 +592,7 @@ The easiest way to get started is to construct a view in the UI, then learn more

<Tabs groupId="client-layer">
<TabItem value="python_sdk" label="Python">
To fetch calls using the Python API, you can use the [`client.calls`](../../reference/python-sdk/weave/trace/weave.trace.weave_client.md#method-calls) method:
To fetch calls using the Python API, you can use the [`client.get_calls`](../../reference/python-sdk/weave/trace/weave.trace.weave_client.md#method-get_calls) method:

```python
import weave
@@ -604,24 +604,6 @@
calls = client.get_calls(filter=...)
```
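
The returned object can be iterated like any other Python iterable. As a small sketch (assuming the returned `Call` objects expose `id` and `op_name` attributes, as described in the client reference), you could print basic metadata for each call:

```python
# Iterate over the fetched calls and print basic metadata.
# Assumes each Call exposes `id` and `op_name` attributes.
for call in calls:
    print(call.id, call.op_name)
```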

:::info[Notice: Evolving APIs]
Currently, it is easier to use the lower-level [`calls_query_stream`](../../reference/python-sdk/weave/trace_server_bindings/weave.trace_server_bindings.remote_http_trace_server#method-calls_query_stream) API as it is more flexible and powerful.
In the near future, we will move all functionality to the above client API.

```python
import weave

# Initialize the client
client = weave.init("your-project-name")

calls = client.server.calls_query_stream({
"project_id": "",
"filter": {},
"query": {},
"sort_by": [],
})
```
:::
</TabItem>
<TabItem value="typescript" label="TypeScript">
To fetch calls using the TypeScript API, you can use the [`client.getCalls`](../../reference/typescript-sdk/weave/classes/WeaveClient#getcalls) method.
1 change: 1 addition & 0 deletions noxfile.py
@@ -72,6 +72,7 @@ def tests(session, shard):
"WB_SERVER_HOST",
"WF_CLICKHOUSE_HOST",
"WEAVE_SERVER_DISABLE_ECOSYSTEM",
"DD_TRACE_ENABLED",
]
}
# Add the GOOGLE_API_KEY environment variable for the "google" shard
26 changes: 21 additions & 5 deletions pyproject.toml
@@ -49,6 +49,22 @@ dependencies = [
]

[project.optional-dependencies]
# `trace_server` is the dependency list of the trace server itself. We eventually will extract
# this to a separate package. Note, when that happens, we will need to pull along some of the
# default dependencies as well.
trace_server = [
    "ddtrace>=2.7.0",
    # BYOB - S3
    "boto3>=1.34.0",
    # BYOB - Azure
    "azure-storage-blob>=12.24.0",
    # BYOB - GCP
    "google-cloud-storage>=2.7.0",
]
trace_server_tests = [
    # BYOB - S3
    "moto[s3]>=5.0.0",
]
docs = ["playwright", "lazydocs", "nbformat", "nbconvert"]
anthropic = ["anthropic>=0.18.0"]
cerebras = ["cerebras-cloud-sdk"]
@@ -74,7 +90,7 @@ langchain_nvidia_ai_endpoints = [
"langchain-nvidia-ai-endpoints",
]
litellm = ["litellm>=1.36.1"]
llamaindex = ["llama-index>=0.10.35"]
llamaindex = ["llama-index>=0.10.35,<0.12.0"] # temporary max pin b/c 0.12.0 changes call structure and therefore tests
mistral0 = ["mistralai>=0.1.8,<1.0.0"]
mistral1 = ["mistralai>=1.0.0"]
scorers = [
@@ -118,11 +134,11 @@ test = [
"pillow",
"filelock",
"httpx",

"weave[trace_server]",
"weave[trace_server_tests]",
]

[project.scripts]
weave = "weave.trace.cli:cli"

[project.urls]
Company = "https://wandb.com"
Documentation = "https://docs.wandb.com/"
@@ -237,7 +253,7 @@ module = "weave_query.*"
ignore_errors = true

[tool.bumpversion]
current_version = "0.51.35-dev0"
current_version = "0.51.36-dev0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
47 changes: 47 additions & 0 deletions tests/conftest.py
@@ -32,6 +32,53 @@
os.environ["WANDB_ERROR_REPORTING"] = "false"


@pytest.fixture(autouse=True)
def disable_datadog():
    """
    Disables Datadog logging and tracing for tests.
    This prevents Datadog from polluting test logs with messages like
    'failed to send, dropping 1 traces to intake at...'
    """
    # Save original values to restore later
    original_dd_env = os.environ.get("DD_ENV")
    original_dd_trace = os.environ.get("DD_TRACE_ENABLED")

    # Disable Datadog
    os.environ["DD_ENV"] = "none"
    os.environ["DD_TRACE_ENABLED"] = "false"

    # Silence Datadog loggers
    dd_loggers = [
        "ddtrace",
        "ddtrace.writer",
        "ddtrace.api",
        "ddtrace.internal",
        "datadog",
        "datadog.dogstatsd",
        "datadog.api",
    ]

    original_levels = {}
    for logger_name in dd_loggers:
        logger = logging.getLogger(logger_name)
        original_levels[logger_name] = logger.level
        logger.setLevel(logging.CRITICAL)  # Only show critical errors

    yield

    # Restore original values
    if original_dd_env is not None:
        os.environ["DD_ENV"] = original_dd_env
    elif "DD_ENV" in os.environ:
        del os.environ["DD_ENV"]

    if original_dd_trace is not None:
        os.environ["DD_TRACE_ENABLED"] = original_dd_trace
    elif "DD_TRACE_ENABLED" in os.environ:
        del os.environ["DD_TRACE_ENABLED"]


def pytest_addoption(parser):
    parser.addoption(
        "--weave-server",