Skip to content

Dataset pipelines#384

Open
Ankur Goyal (ankrgyl) wants to merge 8 commits into
mainfrom
dataset-pipeline
Open

Dataset pipelines#384
Ankur Goyal (ankrgyl) wants to merge 8 commits into
mainfrom
dataset-pipeline

Conversation

@ankrgyl
Copy link
Copy Markdown
Contributor

No description provided.

Comment thread py/src/braintrust/dataset_pipeline.py Outdated
@@ -0,0 +1,161 @@
from __future__ import annotations
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can remove this

Suggested change
from __future__ import annotations

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add __all__ to the module so not everything here is exposed as public API.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

DatasetPipelineTargetLike: TypeAlias = DatasetPipelineTarget | PipelineTarget


def _drop_none(values: dict[str, Any]) -> dict[str, Any]:
Copy link
Copy Markdown
Member

@AbhiPrasad Abhijeet Prasad (AbhiPrasad) May 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have a util method for this declared in the utils file that we should use instead

def clean_nones(obj: dict[str, Any]) -> dict[str, Any]:

I'll clean up the agent skills as a follow up to make sure they do this less.



@dataclass(frozen=True)
class PipelineSource:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer if we avoid the duplication here between the typed dict and dataclass and just have the SDK API accept one. I can't really see an advantage of accepting a dataclass when this is basically only used for json-like config.

diff --git a/py/src/braintrust/dataset_pipeline.py b/py/src/braintrust/dataset_pipeline.py
index c8be53f4..b6399d68 100644
--- a/py/src/braintrust/dataset_pipeline.py
+++ b/py/src/braintrust/dataset_pipeline.py
@@ -19,8 +19,6 @@ __all__ = [
     "DatasetPipelineTransform",
     "DatasetPipelineTransformArgs",
     "DatasetPipelineTransformResult",
-    "PipelineSource",
-    "PipelineTarget",
 ]
 
 
@@ -35,26 +33,6 @@ class DatasetPipelineSource(TypedDict, total=False):
     scope: DatasetPipelineScope
 
 
-@dataclass(frozen=True)
-class PipelineSource:
-    filter: str | None = None
-    scope: DatasetPipelineScope | None = None
-    project_name: str | None = None
-    project_id: str | None = None
-    org_name: str | None = None
-
-    def as_dict(self) -> DatasetPipelineSource:
-        return _drop_none(
-            {
-                "project_id": self.project_id,
-                "project_name": self.project_name,
-                "org_name": self.org_name,
-                "filter": self.filter,
-                "scope": self.scope,
-            }
-        )
-
-
 class DatasetPipelineTarget(TypedDict):
     dataset_name: str
     project_id: NotRequired[str]
@@ -64,28 +42,6 @@ class DatasetPipelineTarget(TypedDict):
     metadata: NotRequired[Metadata]
 
 
-@dataclass(frozen=True)
-class PipelineTarget:
-    dataset_name: str
-    project_name: str | None = None
-    project_id: str | None = None
-    org_name: str | None = None
-    description: str | None = None
-    metadata: Metadata | None = None
-
-    def as_dict(self) -> DatasetPipelineTarget:
-        return _drop_none(
-            {
-                "project_id": self.project_id,
-                "project_name": self.project_name,
-                "org_name": self.org_name,
-                "dataset_name": self.dataset_name,
-                "description": self.description,
-                "metadata": self.metadata,
-            }
-        )
-
-
 class DatasetPipelineRow(TypedDict, total=False):
     id: str
     input: Any | None
@@ -107,24 +63,6 @@ class DatasetPipelineTransformArgs(TypedDict, total=False):
 
 
 DatasetPipelineTransformResult: TypeAlias = Row | Sequence[Row] | None
-DatasetPipelineSourceLike: TypeAlias = DatasetPipelineSource | PipelineSource
-DatasetPipelineTargetLike: TypeAlias = DatasetPipelineTarget | PipelineTarget
-
-
-def _drop_none(values: dict[str, Any]) -> dict[str, Any]:
-    return {key: value for key, value in values.items() if value is not None}
-
-
-def _normalize_source(source: DatasetPipelineSourceLike) -> DatasetPipelineSource:
-    if isinstance(source, PipelineSource):
-        return source.as_dict()
-    return dict(source)
-
-
-def _normalize_target(target: DatasetPipelineTargetLike) -> DatasetPipelineTarget:
-    if isinstance(target, PipelineTarget):
-        return target.as_dict()
-    return dict(target)
 
 
 class DatasetPipelineTransform(Protocol[Row]):
@@ -160,15 +98,15 @@ def is_dataset_pipeline_definition(value: object) -> bool:
 def DatasetPipeline(
     name: str | None = None,
     *,
-    source: DatasetPipelineSourceLike,
+    source: DatasetPipelineSource,
     transform: DatasetPipelineTransform[DatasetPipelineRow],
-    target: DatasetPipelineTargetLike,
+    target: DatasetPipelineTarget,
 ) -> DatasetPipelineDefinition[DatasetPipelineRow]:
     definition = DatasetPipelineDefinition(
         name=name,
-        source=_normalize_source(source),
+        source=dict(source),
         transform=transform,
-        target=_normalize_target(target),
+        target=dict(target),
     )
     _DATASET_PIPELINES.append(definition)
     return definition

Comment on lines +149 to +157
_DATASET_PIPELINES: list[DatasetPipelineDefinition[Any]] = []


def get_registered_dataset_pipelines() -> list[DatasetPipelineDefinition[Any]]:
return list(_DATASET_PIPELINES)


def is_dataset_pipeline_definition(value: object) -> bool:
return isinstance(value, DatasetPipelineDefinition)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is this logic used? Can we remove?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants