Dataset pipelines#384
Open
Ankur Goyal (ankrgyl) wants to merge 8 commits into
Open
Conversation
| @@ -0,0 +1,161 @@ | |||
| from __future__ import annotations | |||
There was a problem hiding this comment.
we can remove this
Suggested change
| from __future__ import annotations |
There was a problem hiding this comment.
We should add __all__ to the module so not everything here is exposed as public API.
| DatasetPipelineTargetLike: TypeAlias = DatasetPipelineTarget | PipelineTarget | ||
|
|
||
|
|
||
| def _drop_none(values: dict[str, Any]) -> dict[str, Any]: |
Member
There was a problem hiding this comment.
we have a util method for this declared in the utils file that we should use instead
I'll clean up the agent skills as a follow up to make sure they do this less.
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class PipelineSource: |
There was a problem hiding this comment.
I would prefer if we avoid the duplication here between the typed dict and dataclass and just have the SDK API accept one. I can't really see an advantage of accepting a dataclass when this is basically only used for json-like config.
diff --git a/py/src/braintrust/dataset_pipeline.py b/py/src/braintrust/dataset_pipeline.py
index c8be53f4..b6399d68 100644
--- a/py/src/braintrust/dataset_pipeline.py
+++ b/py/src/braintrust/dataset_pipeline.py
@@ -19,8 +19,6 @@ __all__ = [
"DatasetPipelineTransform",
"DatasetPipelineTransformArgs",
"DatasetPipelineTransformResult",
- "PipelineSource",
- "PipelineTarget",
]
@@ -35,26 +33,6 @@ class DatasetPipelineSource(TypedDict, total=False):
scope: DatasetPipelineScope
-@dataclass(frozen=True)
-class PipelineSource:
- filter: str | None = None
- scope: DatasetPipelineScope | None = None
- project_name: str | None = None
- project_id: str | None = None
- org_name: str | None = None
-
- def as_dict(self) -> DatasetPipelineSource:
- return _drop_none(
- {
- "project_id": self.project_id,
- "project_name": self.project_name,
- "org_name": self.org_name,
- "filter": self.filter,
- "scope": self.scope,
- }
- )
-
-
class DatasetPipelineTarget(TypedDict):
dataset_name: str
project_id: NotRequired[str]
@@ -64,28 +42,6 @@ class DatasetPipelineTarget(TypedDict):
metadata: NotRequired[Metadata]
-@dataclass(frozen=True)
-class PipelineTarget:
- dataset_name: str
- project_name: str | None = None
- project_id: str | None = None
- org_name: str | None = None
- description: str | None = None
- metadata: Metadata | None = None
-
- def as_dict(self) -> DatasetPipelineTarget:
- return _drop_none(
- {
- "project_id": self.project_id,
- "project_name": self.project_name,
- "org_name": self.org_name,
- "dataset_name": self.dataset_name,
- "description": self.description,
- "metadata": self.metadata,
- }
- )
-
-
class DatasetPipelineRow(TypedDict, total=False):
id: str
input: Any | None
@@ -107,24 +63,6 @@ class DatasetPipelineTransformArgs(TypedDict, total=False):
DatasetPipelineTransformResult: TypeAlias = Row | Sequence[Row] | None
-DatasetPipelineSourceLike: TypeAlias = DatasetPipelineSource | PipelineSource
-DatasetPipelineTargetLike: TypeAlias = DatasetPipelineTarget | PipelineTarget
-
-
-def _drop_none(values: dict[str, Any]) -> dict[str, Any]:
- return {key: value for key, value in values.items() if value is not None}
-
-
-def _normalize_source(source: DatasetPipelineSourceLike) -> DatasetPipelineSource:
- if isinstance(source, PipelineSource):
- return source.as_dict()
- return dict(source)
-
-
-def _normalize_target(target: DatasetPipelineTargetLike) -> DatasetPipelineTarget:
- if isinstance(target, PipelineTarget):
- return target.as_dict()
- return dict(target)
class DatasetPipelineTransform(Protocol[Row]):
@@ -160,15 +98,15 @@ def is_dataset_pipeline_definition(value: object) -> bool:
def DatasetPipeline(
name: str | None = None,
*,
- source: DatasetPipelineSourceLike,
+ source: DatasetPipelineSource,
transform: DatasetPipelineTransform[DatasetPipelineRow],
- target: DatasetPipelineTargetLike,
+ target: DatasetPipelineTarget,
) -> DatasetPipelineDefinition[DatasetPipelineRow]:
definition = DatasetPipelineDefinition(
name=name,
- source=_normalize_source(source),
+ source=dict(source),
transform=transform,
- target=_normalize_target(target),
+ target=dict(target),
)
_DATASET_PIPELINES.append(definition)
return definition
Comment on lines
+149
to
+157
| _DATASET_PIPELINES: list[DatasetPipelineDefinition[Any]] = [] | ||
|
|
||
|
|
||
| def get_registered_dataset_pipelines() -> list[DatasetPipelineDefinition[Any]]: | ||
| return list(_DATASET_PIPELINES) | ||
|
|
||
|
|
||
| def is_dataset_pipeline_definition(value: object) -> bool: | ||
| return isinstance(value, DatasetPipelineDefinition) |
There was a problem hiding this comment.
where is this logic used? Can we remove?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
No description provided.