Skip to content

guidellm.data

DataArgs

Bases: PydanticClassRegistryMixin['DataArgs'], ABC

Base class for data loading and processing argument models.

This class serves as a base for defining argument models related to data loading and processing. It inherits from PydanticClassRegistryMixin to enable automatic registration of subclasses, allowing for flexible and extensible data handling configurations.

Attributes:

Name Type Description
schema_discriminator str

Field name for polymorphic deserialization

Source code in src/guidellm/data/schemas/entrypoints.py
class DataArgs(
    PydanticClassRegistryMixin["DataArgs"],
    ABC,
):
    """Base class for data loading and processing argument models.

    This class serves as a base for defining argument models related to data loading
    and processing. It inherits from PydanticClassRegistryMixin to enable automatic
    registration of subclasses, allowing for flexible and extensible data handling
    configurations.

    :cvar schema_discriminator: Field name for polymorphic deserialization
    """

    model_config = standard_model_config()

    # Field name used for polymorphic deserialization; matches the `kind` field below.
    schema_discriminator: ClassVar[str] = "kind"

    @classmethod
    def __pydantic_schema_base_type__(cls) -> type[DataArgs]:
        """
        Return base type for polymorphic validation hierarchy.

        :return: Base DataArgs class for schema validation
        """
        # The base class is recognised by name rather than identity.
        # NOTE(review): presumably so the check works before the module-level
        # name ``DataArgs`` is bound — confirm against PydanticClassRegistryMixin.
        if cls.__name__ == "DataArgs":
            return cls

        return DataArgs

    kind: str = Field(
        description="Type identifier for the data arguments configuration.",
        examples=["text_file", "csv_file"],
    )
    load_kwargs: dict[str, Any] = Field(
        default_factory=dict,
        description=(
            "Additional arguments for data loading. These arguments are "
            "passed to the datasets library when loading the dataset."
        ),
        examples=[{"format": "csv"}],
    )

__pydantic_schema_base_type__() classmethod

Return base type for polymorphic validation hierarchy.

Returns:

Type Description
type[DataArgs]

Base DataArgs class for schema validation

Source code in src/guidellm/data/schemas/entrypoints.py
@classmethod
def __pydantic_schema_base_type__(cls) -> type[DataArgs]:
    """
    Resolve the root class of the DataArgs polymorphic validation hierarchy.

    :return: ``cls`` itself when it is named ``DataArgs``, otherwise ``DataArgs``
    """
    # Matched by class name; the module-level DataArgs is only looked up when
    # the caller is some other class.
    return cls if cls.__name__ == "DataArgs" else DataArgs

DataFinalizerArgs

Bases: PydanticClassRegistryMixin['DataFinalizerArgs'], ABC

Base class for data finalizer argument models.

This class serves as a base for defining arguments related to data finalization configurations. It inherits from PydanticClassRegistryMixin to enable automatic registration of subclasses, allowing for flexible and extensible data finalization configurations.

Attributes:

Name Type Description
schema_discriminator str

Field name for polymorphic deserialization

Source code in src/guidellm/data/schemas/entrypoints.py
class DataFinalizerArgs(
    PydanticClassRegistryMixin["DataFinalizerArgs"],
    ABC,
):
    """
    Base class for data finalizer argument models.

    This class serves as a base for defining arguments related to data finalization
    configurations. It inherits from PydanticClassRegistryMixin to enable automatic
    registration of subclasses, allowing for flexible and extensible data finalization
    configurations.

    :cvar schema_discriminator: Field name for polymorphic deserialization
    """

    model_config = standard_model_config()

    # Field name used for polymorphic deserialization; matches the `kind` field below.
    schema_discriminator: ClassVar[str] = "kind"

    @classmethod
    def __pydantic_schema_base_type__(cls) -> type[DataFinalizerArgs]:
        """
        Resolve the root class of the finalizer-args validation hierarchy.

        :return: ``cls`` itself when it is named ``DataFinalizerArgs``, otherwise
            ``DataFinalizerArgs``
        """
        # Matched by class name; the module-level name is only looked up when
        # the caller is some other class.
        return cls if cls.__name__ == "DataFinalizerArgs" else DataFinalizerArgs

    kind: str = Field(
        description="Type identifier for the data finalizer arguments.",
        examples=["generative"],
    )

__pydantic_schema_base_type__() classmethod

Return base type for polymorphic validation hierarchy.

Returns:

Type Description
type[DataFinalizerArgs]

Base DataFinalizerArgs class for schema validation

Source code in src/guidellm/data/schemas/entrypoints.py
@classmethod
def __pydantic_schema_base_type__(cls) -> type[DataFinalizerArgs]:
    """
    Resolve the root class of the finalizer-args validation hierarchy.

    :return: ``cls`` itself when it is named ``DataFinalizerArgs``, otherwise
        ``DataFinalizerArgs``
    """
    # Matched by class name; the module-level name is only looked up when
    # the caller is some other class.
    return cls if cls.__name__ == "DataFinalizerArgs" else DataFinalizerArgs

DataLoaderArgs

Bases: PydanticClassRegistryMixin['DataLoaderArgs'], ABC

Base class for data loader argument models.

This class serves as a base for defining argument models related to data loading configurations. It inherits from PydanticClassRegistryMixin to enable automatic registration of subclasses, allowing for flexible and extensible data loading configurations.

Attributes:

Name Type Description
schema_discriminator str

Field name for polymorphic deserialization

Source code in src/guidellm/data/schemas/entrypoints.py
class DataLoaderArgs(
    PydanticClassRegistryMixin["DataLoaderArgs"],
    ABC,
):
    """
    Base class for data loader argument models.

    This class serves as a base for defining argument models related to data loading
    configurations. It inherits from PydanticClassRegistryMixin to enable automatic
    registration of subclasses, allowing for flexible and extensible data loading
    configurations.

    :cvar schema_discriminator: Field name for polymorphic deserialization
    """

    model_config = standard_model_config()

    # Field name used for polymorphic deserialization; matches the `kind` field below.
    schema_discriminator: ClassVar[str] = "kind"

    @classmethod
    def __pydantic_schema_base_type__(cls) -> type[DataLoaderArgs]:
        """
        Resolve the root class of the loader-args validation hierarchy.

        :return: ``cls`` itself when it is named ``DataLoaderArgs``, otherwise
            ``DataLoaderArgs``
        """
        # Matched by class name; the module-level name is only looked up when
        # the caller is some other class.
        return cls if cls.__name__ == "DataLoaderArgs" else DataLoaderArgs

    kind: str = Field(
        description="Type identifier for the data loader configuration.",
    )
    # Defaults to -1; see the description text for what that value means.
    samples: int = Field(
        default=-1,
        description=(
            "Number of data samples to generate. If -1, the data loader will "
            "generate indefinitely until the dataset is exhausted."
        ),
    )

__pydantic_schema_base_type__() classmethod

Return base type for polymorphic validation hierarchy.

Returns:

Type Description
type[DataLoaderArgs]

Base DataLoaderArgs class for schema validation

Source code in src/guidellm/data/schemas/entrypoints.py
@classmethod
def __pydantic_schema_base_type__(cls) -> type[DataLoaderArgs]:
    """
    Resolve the root class of the loader-args validation hierarchy.

    :return: ``cls`` itself when it is named ``DataLoaderArgs``, otherwise
        ``DataLoaderArgs``
    """
    # Matched by class name; the module-level name is only looked up when
    # the caller is some other class.
    return cls if cls.__name__ == "DataLoaderArgs" else DataLoaderArgs

DataLoaderRegistry

Bases: Generic[DataT_co], RegistryMixin[type[DataLoader]]

Source code in src/guidellm/data/loaders/loader.py
class DataLoaderRegistry(Generic[DataT_co], RegistryMixin[type[DataLoader]]):
    """Registry of DataLoader classes with a factory that builds one from config."""

    @classmethod
    def create(
        cls,
        config: DataLoaderArgs,
        datasets: list[DatasetType],
        preprocessors: list[DatasetPreprocessor],
        finalizer: DatasetFinalizer[DataT_co],
        random_seed: int,
        **kwargs: Any,
    ) -> DataLoader[DataT_co]:
        """
        Factory method to create a DataLoader instance based on provided configuration.

        The loader class is looked up in the registry by ``config.kind`` and then
        constructed with all arguments forwarded as keywords.

        :param config: A DataLoaderArgs object containing the configuration.
        :param datasets: Datasets forwarded to the loader constructor.
        :param preprocessors: Preprocessors forwarded to the loader constructor.
        :param finalizer: Finalizer forwarded to the loader constructor.
        :param random_seed: Seed forwarded to the loader constructor.
        :param kwargs: Extra keyword arguments forwarded to the loader constructor.
        :return: The constructed DataLoader instance.
        :raises ValueError: If no loader class is registered under ``config.kind``.
        """
        kind = config.kind
        data_loader_cls = cls.get_registered_object(kind)

        if data_loader_cls is None:
            available = list(cls.registry.keys()) if cls.registry else []
            # Trailing space keeps the two sentences of the message apart.
            raise ValueError(
                f"DataLoader type '{kind}' is not registered. "
                f"Available types: {available}"
            )

        return data_loader_cls(
            config=config,
            datasets=datasets,
            preprocessors=preprocessors,
            finalizer=finalizer,
            random_seed=random_seed,
            **kwargs,
        )

create(config, datasets, preprocessors, finalizer, random_seed, **kwargs) classmethod

Factory method to create a DataLoader instance based on provided configuration.

Parameters:

Name Type Description Default
config DataLoaderArgs

A DataLoaderArgs object containing the configuration.

required
Source code in src/guidellm/data/loaders/loader.py
@classmethod
def create(
    cls,
    config: DataLoaderArgs,
    datasets: list[DatasetType],
    preprocessors: list[DatasetPreprocessor],
    finalizer: DatasetFinalizer[DataT_co],
    random_seed: int,
    **kwargs: Any,
) -> DataLoader[DataT_co]:
    """
    Factory method to create a DataLoader instance based on provided configuration.

    The loader class is looked up in the registry by ``config.kind`` and then
    constructed with all arguments forwarded as keywords.

    :param config: A DataLoaderArgs object containing the configuration.
    :param datasets: Datasets forwarded to the loader constructor.
    :param preprocessors: Preprocessors forwarded to the loader constructor.
    :param finalizer: Finalizer forwarded to the loader constructor.
    :param random_seed: Seed forwarded to the loader constructor.
    :param kwargs: Extra keyword arguments forwarded to the loader constructor.
    :return: The constructed DataLoader instance.
    :raises ValueError: If no loader class is registered under ``config.kind``.
    """
    kind = config.kind
    data_loader_cls = cls.get_registered_object(kind)

    if data_loader_cls is None:
        available = list(cls.registry.keys()) if cls.registry else []
        # Trailing space keeps the two sentences of the message apart.
        raise ValueError(
            f"DataLoader type '{kind}' is not registered. "
            f"Available types: {available}"
        )

    return data_loader_cls(
        config=config,
        datasets=datasets,
        preprocessors=preprocessors,
        finalizer=finalizer,
        random_seed=random_seed,
        **kwargs,
    )

DataNotSupportedError

Bases: Exception

Exception raised when the data format is not supported by deserializer or config.

Source code in src/guidellm/data/schemas/base.py
class DataNotSupportedError(Exception):
    """
    Exception raised when the data format is not supported by a deserializer or
    by the given configuration.
    """

DataPreprocessorArgs

Bases: PydanticClassRegistryMixin['DataPreprocessorArgs'], ABC

Base class for data preprocessor argument models.

This class serves as a base for defining arguments related to data preprocessing configurations. It inherits from PydanticClassRegistryMixin to enable automatic registration of subclasses, allowing for flexible and extensible data preprocessing configurations.

Attributes:

Name Type Description
schema_discriminator str

Field name for polymorphic deserialization

Source code in src/guidellm/data/schemas/entrypoints.py
class DataPreprocessorArgs(
    PydanticClassRegistryMixin["DataPreprocessorArgs"],
    ABC,
):
    """
    Base class for data preprocessor argument models.

    This class serves as a base for defining arguments related to data preprocessing
    configurations. It inherits from PydanticClassRegistryMixin to enable automatic
    registration of subclasses, allowing for flexible and extensible data preprocessing
    configurations.

    :cvar schema_discriminator: Field name for polymorphic deserialization
    """

    model_config = standard_model_config()

    # Field name used for polymorphic deserialization; matches the `kind` field below.
    schema_discriminator: ClassVar[str] = "kind"

    @classmethod
    def __pydantic_schema_base_type__(cls) -> type[DataPreprocessorArgs]:
        """
        Resolve the root class of the preprocessor-args validation hierarchy.

        :return: ``cls`` itself when it is named ``DataPreprocessorArgs``, otherwise
            ``DataPreprocessorArgs``
        """
        # Matched by class name; the module-level name is only looked up when
        # the caller is some other class.
        is_base = cls.__name__ == "DataPreprocessorArgs"
        return cls if is_base else DataPreprocessorArgs

    kind: str = Field(
        description="Type identifier for the data preprocessor arguments.",
        examples=["generative_column_mapper", "pooling_column_mapper"],
    )

__pydantic_schema_base_type__() classmethod

Return base type for polymorphic validation hierarchy.

Returns:

Type Description
type[DataPreprocessorArgs]

Base DataPreprocessorArgs class for schema validation

Source code in src/guidellm/data/schemas/entrypoints.py
@classmethod
def __pydantic_schema_base_type__(cls) -> type[DataPreprocessorArgs]:
    """
    Resolve the root class of the preprocessor-args validation hierarchy.

    :return: ``cls`` itself when it is named ``DataPreprocessorArgs``, otherwise
        ``DataPreprocessorArgs``
    """
    # Matched by class name; the module-level name is only looked up when
    # the caller is some other class.
    is_base = cls.__name__ == "DataPreprocessorArgs"
    return cls if is_base else DataPreprocessorArgs

DataTokenizerArgs

Bases: PydanticClassRegistryMixin['DataTokenizerArgs'], ABC

Base class for data tokenizer argument models.

This class serves as a base for defining arguments related to data tokenization configurations. It inherits from PydanticClassRegistryMixin to enable automatic registration of subclasses, allowing for flexible and extensible data tokenization configurations.

Attributes:

Name Type Description
schema_discriminator str

Field name for polymorphic deserialization

Source code in src/guidellm/data/schemas/entrypoints.py
class DataTokenizerArgs(
    PydanticClassRegistryMixin["DataTokenizerArgs"],
    ABC,
):
    """
    Base class for data tokenizer argument models.

    This class serves as a base for defining arguments related to data tokenization
    configurations. It inherits from PydanticClassRegistryMixin to enable automatic
    registration of subclasses, allowing for flexible and extensible data tokenization
    configurations.

    :cvar schema_discriminator: Field name for polymorphic deserialization
    """

    model_config = standard_model_config()

    # Field name used for polymorphic deserialization; matches the `kind` field below.
    schema_discriminator: ClassVar[str] = "kind"

    @classmethod
    def __pydantic_schema_base_type__(cls) -> type[DataTokenizerArgs]:
        """
        Resolve the root class of the tokenizer-args validation hierarchy.

        :return: ``cls`` itself when it is named ``DataTokenizerArgs``, otherwise
            ``DataTokenizerArgs``
        """
        # Matched by class name; the module-level name is only looked up when
        # the caller is some other class.
        return cls if cls.__name__ == "DataTokenizerArgs" else DataTokenizerArgs

    kind: str = Field(
        description="Type identifier for the data tokenizer arguments.",
        examples=["huggingface"],
    )
    # Optional: defaults to None when no model is specified.
    model: str | None = Field(
        default=None,
        description=(
            "Optional model name or path for the tokenizer. This field can be "
            "used by tokenizer implementations that require a model specification, "
            "such as HuggingFace tokenizers."
        ),
        examples=["gpt2"],
    )

__pydantic_schema_base_type__() classmethod

Return base type for polymorphic validation hierarchy.

Returns:

Type Description
type[DataTokenizerArgs]

Base DataTokenizerArgs class for schema validation

Source code in src/guidellm/data/schemas/entrypoints.py
@classmethod
def __pydantic_schema_base_type__(cls) -> type[DataTokenizerArgs]:
    """
    Resolve the root class of the tokenizer-args validation hierarchy.

    :return: ``cls`` itself when it is named ``DataTokenizerArgs``, otherwise
        ``DataTokenizerArgs``
    """
    # Matched by class name; the module-level name is only looked up when
    # the caller is some other class.
    return cls if cls.__name__ == "DataTokenizerArgs" else DataTokenizerArgs

DatasetFinalizer

Bases: Protocol[DataT_co]

Protocol for finalizing dataset rows into a desired data type.

Source code in src/guidellm/data/finalizers/finalizer.py
@runtime_checkable
class DatasetFinalizer(Protocol[DataT_co]):
    """
    Protocol for finalizing dataset rows into a desired data type.

    Implementations are constructed from a DataFinalizerArgs configuration and are
    callable on a list of row dictionaries, returning a value of type ``DataT_co``.
    """

    # Constructed from a finalizer configuration model.
    def __init__(self, config: DataFinalizerArgs) -> None: ...

    # Takes a list of row dictionaries and returns the finalized data.
    def __call__(self, items: list[dict[str, Any]]) -> DataT_co: ...

FinalizerRegistry

Bases: RegistryMixin[type[DatasetFinalizer]]

Source code in src/guidellm/data/finalizers/finalizer.py
class FinalizerRegistry(RegistryMixin[type[DatasetFinalizer]]):
    """Registry of DatasetFinalizer classes with a factory that builds one from config."""

    @classmethod
    def create(cls, config: DataFinalizerArgs) -> DatasetFinalizer:
        """
        Factory method to create a DatasetFinalizer instance based on configuration.

        The finalizer class is looked up in the registry by ``config.kind`` and
        constructed with ``config`` as its only argument.

        :param config: A DataFinalizerArgs object containing the configuration.
        :return: The constructed DatasetFinalizer instance.
        :raises ValueError: If no finalizer class is registered under ``config.kind``.
        """
        kind = config.kind
        finalizer_cls = cls.get_registered_object(kind)

        if finalizer_cls is None:
            available = list(cls.registry.keys()) if cls.registry else []
            # Trailing space keeps the two sentences of the message apart.
            raise ValueError(
                f"DatasetFinalizer type '{kind}' is not registered. "
                f"Available types: {available}"
            )

        return finalizer_cls(config)

create(config) classmethod

Factory method to create a DatasetFinalizer instance based on configuration.

Parameters:

Name Type Description Default
config DataFinalizerArgs

A DataFinalizerArgs object containing the configuration.

required
Source code in src/guidellm/data/finalizers/finalizer.py
@classmethod
def create(cls, config: DataFinalizerArgs) -> DatasetFinalizer:
    """
    Factory method to create a DatasetFinalizer instance based on configuration.

    The finalizer class is looked up in the registry by ``config.kind`` and
    constructed with ``config`` as its only argument.

    :param config: A DataFinalizerArgs object containing the configuration.
    :return: The constructed DatasetFinalizer instance.
    :raises ValueError: If no finalizer class is registered under ``config.kind``.
    """
    kind = config.kind
    finalizer_cls = cls.get_registered_object(kind)

    if finalizer_cls is None:
        available = list(cls.registry.keys()) if cls.registry else []
        # Trailing space keeps the two sentences of the message apart.
        raise ValueError(
            f"DatasetFinalizer type '{kind}' is not registered. "
            f"Available types: {available}"
        )

    return finalizer_cls(config)

GenerativeRequestFinalizer

Bases: DatasetFinalizer[Iterable[GenerationRequest]]

Finalizer that converts dataset rows into GenerationRequest objects, aggregating usage metrics from the provided columns.

Source code in src/guidellm/data/finalizers/generative.py
@FinalizerRegistry.register("generative")
class GenerativeRequestFinalizer(DatasetFinalizer[Iterable[GenerationRequest]]):
    """
    Finalizer that converts dataset rows into GenerationRequest objects,
    aggregating usage metrics from the provided columns.
    """

    def __init__(self, config: GenerativeRequestFinalizerArgs) -> None:
        self.config = config

    def __call__(self, items: list[dict[str, Any]]) -> list[GenerationRequest]:
        # One request per row, in input order.
        return list(map(self.finalize_turn, items))

    def finalize_turn(  # noqa: C901 PLR0912
        self, columns: dict[str, Any]
    ) -> GenerationRequest:
        """
        Build a single GenerationRequest from one row of column values.

        Token counts, text, image, video and audio columns are folded into the
        request's input/output usage metrics; the row itself is passed through
        unchanged as ``columns``.

        :param columns: Mapping of column name to that column's values for the row.
        :return: The request carrying the row, its metrics and its settings.
        """
        input_metrics = UsageMetrics()
        output_metrics = UsageMetrics()

        # Token totals are only recorded when the (truthy) counts sum to non-zero.
        prompt_total = sum(
            filter(None, columns.get("prompt_tokens_count_column", []))
        )
        if prompt_total:
            input_metrics.text_tokens = prompt_total

        completion_total = sum(
            filter(None, columns.get("output_tokens_count_column", []))
        )
        if completion_total:
            output_metrics.text_tokens = completion_total

        # Prefixes first, then prompts; empty entries are ignored.
        for text_key in ("prefix_column", "text_column"):
            for value in columns.get(text_key, []):
                if value:
                    input_metrics.add_text_metrics(value)

        # Per media column: the metric fields to accumulate and the zero value
        # used when the running total is still unset.
        media_fields = (
            ("image_column", (("image_pixels", 0), ("image_bytes", 0))),
            (
                "video_column",
                (("video_frames", 0), ("video_seconds", 0.0), ("video_bytes", 0)),
            ),
            (
                "audio_column",
                (("audio_samples", 0), ("audio_seconds", 0.0), ("audio_bytes", 0)),
            ),
        )
        for column_key, fields in media_fields:
            for item in columns.get(column_key, []):
                if not item:
                    continue

                for field_name, zero in fields:
                    amount = item.get(field_name)
                    if amount is not None:
                        running = getattr(input_metrics, field_name) or zero
                        setattr(input_metrics, field_name, running + amount)

        # A turn expects a tool call if it has tool definitions.
        # Which turns carry tools_column is controlled by the data pipeline
        # (synthetic generator or dataset columns).
        has_tools = bool(columns.get("tools_column"))

        return GenerationRequest(
            columns=columns,
            expects_tool_call=has_tools,
            input_metrics=input_metrics,
            output_metrics=output_metrics,
            settings=self._request_settings_from_columns(columns),
        )

    def _request_settings_from_columns(
        self, columns: dict[str, Any]
    ) -> RequestSettings:
        """
        Derive request settings from the row's relative timestamp column.

        :param columns: Mapping of column name to that column's values for the row.
        :return: Settings with ``relative_timestamp`` set from the first value when
            one is present and not None; default settings otherwise.
        """
        timestamps = columns.get("relative_timestamp_column", [])
        if not timestamps or timestamps[0] is None:
            return RequestSettings()
        return RequestSettings(relative_timestamp=float(timestamps[0]))

GenerativeRequestFinalizerArgs

Bases: DataFinalizerArgs

Model for generative request finalizer arguments.

Source code in src/guidellm/data/finalizers/generative.py
@DataFinalizerArgs.register("generative")
class GenerativeRequestFinalizerArgs(DataFinalizerArgs):
    """Model for generative request finalizer arguments."""

    # Registered with DataFinalizerArgs under "generative"; the Literal pins `kind`
    # to that same value and the default supplies it when the field is omitted.
    kind: Literal["generative"] = Field(
        default="generative",
        description="Type identifier for the generative request finalizer.",
    )

PreprocessorRegistry

Bases: RegistryMixin[type[DatasetPreprocessor] | type[DataDependentPreprocessor]]

Source code in src/guidellm/data/preprocessors/preprocessor.py
class PreprocessorRegistry(
    RegistryMixin[type[DatasetPreprocessor] | type[DataDependentPreprocessor]]
):
    """Registry of preprocessor classes with a factory that builds one from config."""

    @classmethod
    def create(cls, config: DataPreprocessorArgs) -> DatasetPreprocessor:
        """
        Factory method to create a DatasetPreprocessor instance based on configuration.

        The preprocessor class is looked up in the registry by ``config.kind`` and
        constructed with ``config`` as its only argument.

        :param config: A DataPreprocessorArgs object containing the configuration.
        :return: The constructed preprocessor instance.
        :raises ValueError: If no preprocessor class is registered under
            ``config.kind``.
        """
        kind = config.kind
        preprocessor_cls = cls.get_registered_object(kind)

        if preprocessor_cls is None:
            available = list(cls.registry.keys()) if cls.registry else []
            # Trailing space keeps the two sentences of the message apart.
            raise ValueError(
                f"DatasetPreprocessor type '{kind}' is not registered. "
                f"Available types: {available}"
            )

        return preprocessor_cls(config)

create(config) classmethod

Factory method to create a DatasetPreprocessor instance based on configuration.

Parameters:

Name Type Description Default
config DataPreprocessorArgs

A DataPreprocessorArgs object containing the configuration.

required
Source code in src/guidellm/data/preprocessors/preprocessor.py
@classmethod
def create(cls, config: DataPreprocessorArgs) -> DatasetPreprocessor:
    """
    Factory method to create a DatasetPreprocessor instance based on configuration.

    The preprocessor class is looked up in the registry by ``config.kind`` and
    constructed with ``config`` as its only argument.

    :param config: A DataPreprocessorArgs object containing the configuration.
    :return: The constructed preprocessor instance.
    :raises ValueError: If no preprocessor class is registered under
        ``config.kind``.
    """
    kind = config.kind
    preprocessor_cls = cls.get_registered_object(kind)

    if preprocessor_cls is None:
        available = list(cls.registry.keys()) if cls.registry else []
        # Trailing space keeps the two sentences of the message apart.
        raise ValueError(
            f"DatasetPreprocessor type '{kind}' is not registered. "
            f"Available types: {available}"
        )

    return preprocessor_cls(config)

TorchDataLoaderArgs

Bases: DataLoaderArgs

Model for PyTorch data loader arguments.

Source code in src/guidellm/data/loaders/torch.py
@DataLoaderArgs.register("pytorch")
class TorchDataLoaderArgs(DataLoaderArgs):
    """Model for PyTorch data loader arguments."""

    # Discriminator value; matches the "pytorch" name passed to register() above.
    # NOTE(review): the description below says "generative data loader" while the
    # class and its kind refer to PyTorch — confirm the intended wording.
    kind: Literal["pytorch"] = Field(  # type: ignore[assignment]
        default="pytorch",
        description="Type identifier for the generative data loader.",
    )
    # Off by default.
    shuffle: bool = Field(
        default=False,
        description="Shuffle data rows at every epoch.",
    )
    # Defaults to a single worker process.
    num_workers: int = Field(
        default=1,
        description=(
            "Number of worker processes for data loading. If 0, data loading "
            "will be performed in the main process."
        ),
    )

create_data_loader(loader_config, data_config, tokenizer_config, column_mapper_config, preprocessors_config, finalizer_config, random_seed=42, console=None) async

Factory function to create a DataLoader instance based on provided configurations.

Parameters:

Name Type Description Default
loader_config DataLoaderArgs

Configuration for the data loader.

required
data_config list[DataArgs]

List of configurations for dataset deserialization.

required
tokenizer_config DataTokenizerArgs

Configuration for the tokenizer factory.

required
column_mapper_config DataPreprocessorArgs

Configuration for the column mapping preprocessor.

required
preprocessors_config list[DataPreprocessorArgs]

List of configurations for additional preprocessors.

required
finalizer_config DataFinalizerArgs

Configuration for the dataset finalizer.

required
random_seed int

Seed for random operations to ensure reproducibility.

42
console Console | None

Optional Console instance for logging and progress display.

None

Returns:

Type Description
DataLoader

An instance of DataLoader configured according to the provided arguments.

Source code in src/guidellm/data/entrypoints.py
async def create_data_loader(
    loader_config: DataLoaderArgs,
    data_config: list[DataArgs],
    tokenizer_config: DataTokenizerArgs,
    column_mapper_config: DataPreprocessorArgs,
    preprocessors_config: list[DataPreprocessorArgs],
    finalizer_config: DataFinalizerArgs,
    random_seed: int = 42,
    console: Console | None = None,
) -> DataLoader:
    """
    Build a DataLoader from the given configurations.

    Datasets, preprocessors, the finalizer and finally the loader are resolved in
    that order. When a console is supplied, each stage is reported as an update
    step that is finished with the resolved configuration as details.

    :param loader_config: Configuration for the data loader.
    :param data_config: List of configurations for dataset deserialization.
    :param tokenizer_config: Configuration for the tokenizer factory.
    :param column_mapper_config: Configuration for the column mapping preprocessor.
    :param preprocessors_config: List of configurations for additional preprocessors.
    :param finalizer_config: Configuration for the dataset finalizer.
    :param random_seed: Seed for random operations to ensure reproducibility.
    :param console: Optional Console instance for logging and progress display.
    :return: An instance of DataLoader configured according to the provided arguments.
    """

    def begin_step(title: str):
        # Progress reporting is optional: without a console there is no step.
        return console.print_update_step(title=title) if console else None

    # Single seeded generator; per-dataset and loader seeds are drawn from it in order.
    rng = Random(random_seed)
    tokenizer_factory = TokenizerRegistry.create(tokenizer_config)

    # Stage 1: datasets.
    step = begin_step("Deserializing datasets from configuration")
    datasets: list[DatasetType] = [
        DatasetDeserializerFactory.deserialize(
            config=dataset_args,
            processor_factory=tokenizer_factory,
            random_seed=rng.getrandbits(32),
        )
        for dataset_args in data_config
    ]
    if step:
        step.finish(
            title=f"{len(datasets)} datasets resolved",
            details=[
                dataset_args.model_dump(mode="json") for dataset_args in data_config
            ],
            status_level="success",
        )

    # Stage 2: preprocessors, with the column mapper always first.
    step = begin_step("Initializing preprocessors from configuration")
    preproc_configs = [column_mapper_config] + preprocessors_config
    preprocessors: list[DatasetPreprocessor | DataDependentPreprocessor] = [
        PreprocessorRegistry.create(preproc_args) for preproc_args in preproc_configs
    ]
    if step:
        step.finish(
            title=f"{len(preprocessors)} preprocessors resolved",
            details=[
                preproc_args.model_dump(mode="json") for preproc_args in preproc_configs
            ],
            status_level="success",
        )

    # Stage 3: finalizer.
    step = begin_step("Initializing finalizer from configuration")
    finalizer: DatasetFinalizer = FinalizerRegistry.create(finalizer_config)
    if step:
        step.finish(
            title="Finalizer resolved",
            details=finalizer_config.model_dump(mode="json"),
            status_level="success",
        )

    # Stage 4: the loader itself.
    step = begin_step("Initializing request loader from configuration")
    data_loader = DataLoaderRegistry.create(
        config=loader_config,
        datasets=datasets,
        preprocessors=preprocessors,
        finalizer=finalizer,
        random_seed=rng.getrandbits(32),
    )
    if step:
        # Non-positive sample counts are reported as "inf".
        requested = loader_config.samples
        samples = requested if requested > 0 else "inf"
        step.finish(
            title=(f"Request loader resolved with {samples} unique requests"),
            details=InfoMixin.extract_from_obj(data_loader),
            status_level="success",
        )

    return data_loader

process_dataset(data, output_path, processor, config, processor_args=None, data_args=None, data_column_mapper=None, short_prompt_strategy=ShortPromptStrategy.IGNORE, pad_char=None, concat_delimiter=None, include_prefix_in_token_count=False, push_to_hub=False, hub_dataset_id=None, random_seed=42)

Main method to process and save a dataset with sampled prompt/output token counts.

Parameters:

Name Type Description Default
data dict

Dataset input configuration mapping (validated into a DataArgs model).

required
output_path str | Path

File path to save the processed dataset.

required
processor str | Path | PreTrainedTokenizerBase

Tokenizer object or its config.

required
config str | Path

PreprocessDatasetConfig string or file path.

required
processor_args dict[str, Any] | None

Optional processor arguments.

None
data_args dict[str, Any] | None

Optional data loading arguments.

None
data_column_mapper dict[str, str] | None

Optional column mapping dictionary.

None
short_prompt_strategy ShortPromptStrategy

Strategy for handling short prompts.

IGNORE
pad_char str | None

Character used when padding short prompts.

None
concat_delimiter str | None

Delimiter for concatenation strategy.

None
include_prefix_in_token_count bool

Whether to include prefix in prompt token count, simplifying the token counts. When True, prefix trimming is disabled and the prefix is kept as-is. The prefix token count is subtracted from the prompt token budget instead.

False
push_to_hub bool

Whether to push to Hugging Face Hub.

False
hub_dataset_id str | None

Dataset ID on Hugging Face Hub.

None
random_seed int

Seed for random sampling.

42

Raises:

Type Description
ValueError

If the output path is invalid or pushing conditions unmet.

Source code in src/guidellm/data/entrypoints.py
def process_dataset(
    data: dict,
    output_path: str | Path,
    processor: str | Path | PreTrainedTokenizerBase,
    config: str | Path,
    processor_args: dict[str, Any] | None = None,
    data_args: dict[str, Any] | None = None,
    data_column_mapper: dict[str, str] | None = None,
    short_prompt_strategy: ShortPromptStrategy = ShortPromptStrategy.IGNORE,
    pad_char: str | None = None,
    concat_delimiter: str | None = None,
    include_prefix_in_token_count: bool = False,
    push_to_hub: bool = False,
    hub_dataset_id: str | None = None,
    random_seed: int = 42,
) -> None:
    """
    Main method to process and save a dataset with sampled prompt/output token counts.

    The ``data`` mapping is validated into a DataArgs model and everything is then
    delegated to ``builders.process_dataset``.

    :param data: Mapping describing the dataset input; validated with
        ``DataArgs.model_validate`` before processing.
    :param output_path: File path to save the processed dataset.
    :param processor: Tokenizer object or its config.
    :param config: PreprocessDatasetConfig string or file path.
    :param processor_args: Optional processor arguments.
    :param data_args: Optional data loading arguments.
    :param data_column_mapper: Optional column mapping dictionary.
    :param short_prompt_strategy: Strategy for handling short prompts.
    :param pad_char: Character used when padding short prompts.
    :param concat_delimiter: Delimiter for concatenation strategy.
    :param include_prefix_in_token_count:
        Whether to include prefix in prompt token count, simplifying the token counts.
        When True, prefix trimming is disabled and the prefix is kept as-is. The prefix
        token count is subtracted from the prompt token budget instead.
    :param push_to_hub: Whether to push to Hugging Face Hub.
    :param hub_dataset_id: Dataset ID on Hugging Face Hub.
    :param random_seed: Seed for random sampling.
    :raises ValueError: If the output path is invalid or pushing conditions unmet.
    """
    # Validate the raw mapping into a typed DataArgs model first.
    data_config = DataArgs.model_validate(data)
    # Arguments are forwarded positionally, so their order here must match the
    # signature of builders.process_dataset (not visible from this function).
    builders.process_dataset(
        data_config,
        output_path,
        processor,
        config,
        processor_args,
        data_args,
        data_column_mapper,
        short_prompt_strategy,
        pad_char,
        concat_delimiter,
        include_prefix_in_token_count,
        push_to_hub,
        hub_dataset_id,
        random_seed,
    )