Skip to content

guidellm.data.builders

ShortPromptStrategyHandler

Handler class for short prompt strategies.

Source code in src/guidellm/data/builders.py
class ShortPromptStrategyHandler:
    """
    Namespace of handlers for prompts shorter than a required token count.

    Every handler takes the prompt, the minimum token count and a tokenizer
    (plus strategy-specific keyword arguments) and swallows unknown keyword
    arguments through ``**_kwargs`` so all handlers can be called with the
    same argument set.
    """

    @staticmethod
    def handle_ignore(
        current_prompt: str,
        min_prompt_tokens: int,
        tokenizer: PreTrainedTokenizerBase,
        **_kwargs,
    ) -> str | None:
        """
        Ignores prompts that are shorter than the required minimum token length.

        :param current_prompt: The input prompt string.
        :param min_prompt_tokens: Minimum required token count.
        :param tokenizer: Tokenizer used to count tokens.
        :return: The prompt if it meets the length, otherwise None.
        """

        if len(tokenizer.encode(current_prompt)) < min_prompt_tokens:
            logger.warning("Prompt too short, ignoring")
            return None
        return current_prompt

    @staticmethod
    def handle_concatenate(
        current_prompt: str,
        min_prompt_tokens: int,
        dataset_iterator: Iterator[dict[str, Any]],
        prompt_column: str,
        tokenizer: PreTrainedTokenizerBase,
        concat_delimiter: str,
        **_kwargs,
    ) -> str | None:
        """
        Concatenates prompts until the minimum token requirement is met.

        Rows are consumed from ``dataset_iterator``; rows used for
        concatenation are therefore no longer available to the caller.

        :param current_prompt: The initial prompt.
        :param min_prompt_tokens: Target minimum token length.
        :param dataset_iterator: Iterator to fetch more prompts.
        :param prompt_column: Column key for prompt extraction.
        :param tokenizer: Tokenizer used to count tokens.
        :param concat_delimiter: Delimiter to use between prompts.
        :return: Concatenated prompt or None if not enough data.
        """

        tokens_len = len(tokenizer.encode(current_prompt))
        while tokens_len < min_prompt_tokens:
            try:
                next_row = next(dataset_iterator)
            except StopIteration:
                logger.warning(
                    "Could not concatenate enough prompts to reach minimum "
                    "length, ignoring"
                )
                return None
            current_prompt += concat_delimiter + next_row[prompt_column]
            # Re-encode the whole string: token counts are not assumed to be
            # additive across the delimiter boundary.
            tokens_len = len(tokenizer.encode(current_prompt))
        return current_prompt

    @staticmethod
    def handle_pad(
        current_prompt: str,
        min_prompt_tokens: int,
        tokenizer: PreTrainedTokenizerBase,
        pad_char: str,
        pad_multiplier: int = 2,
        **_kwargs,
    ) -> str:
        """
        Pads the prompt with a character until it reaches the minimum token length.

        The number of pad characters appended per round starts at 1 and is
        multiplied by ``pad_multiplier`` after every round.

        :param current_prompt: The input prompt.
        :param min_prompt_tokens: Desired minimum token count.
        :param tokenizer: Tokenizer used to count tokens.
        :param pad_char: Character used for padding.
        :param pad_multiplier: Multiplier for padding character length.
        :return: Padded prompt string.
        :raises ValueError: If padding is required but cannot make progress
            because ``pad_char`` is empty/None or the pad size dropped to 0
            (``pad_multiplier`` of 0).
        """
        tokens = tokenizer.encode(current_prompt)
        if len(tokens) >= min_prompt_tokens:
            # Nothing to pad; pad settings are irrelevant in this case.
            return current_prompt

        if not pad_char:
            # An empty pad string never lengthens the prompt, so the loop
            # below would spin forever.
            raise ValueError(
                "pad_char must be a non-empty string to pad a short prompt"
            )

        pad_count = 1
        prompt = current_prompt
        while len(tokens) < min_prompt_tokens:
            if pad_count == 0:
                # Only reachable with pad_multiplier == 0: every further
                # round would append nothing.
                raise ValueError(
                    "pad_multiplier must be non-zero to pad a short prompt"
                )
            prompt += pad_char * pad_count
            tokens = tokenizer.encode(prompt)
            pad_count *= pad_multiplier
        return prompt

    @staticmethod
    def handle_error(
        current_prompt: str,
        min_prompt_tokens: int,
        tokenizer: PreTrainedTokenizerBase,
        **_kwargs,
    ) -> str | None:
        """
        Raises an error if the prompt is too short.

        :param current_prompt: The input prompt.
        :param min_prompt_tokens: Required token count.
        :param tokenizer: Tokenizer used to count tokens.
        :return: The input prompt if valid.
        :raises PromptTooShortError: If the prompt is too short.
        """

        prompt_len = len(tokenizer.encode(current_prompt))
        if prompt_len < min_prompt_tokens:
            raise PromptTooShortError(
                f"Found too short prompt: {current_prompt}, with length: {prompt_len}. "
                f"Minimum length required: {min_prompt_tokens}.",
            )
        return current_prompt

    @classmethod
    def get_strategy_handler(cls, strategy: ShortPromptStrategy) -> Callable[..., Any]:
        """
        Get the handler for a specific strategy.

        The handler is looked up in the module-level ``STRATEGY_HANDLERS``
        mapping.

        :param strategy: The short prompt strategy to get the handler for.
        :return: The handler callable for the specified strategy.
        """
        return cast("Callable[..., Any]", STRATEGY_HANDLERS[strategy])

get_strategy_handler(strategy) classmethod

Get the handler for a specific strategy.

Parameters:

Name Type Description Default
strategy ShortPromptStrategy

The short prompt strategy to get the handler for.

required

Returns:

Type Description
Callable[..., Any]

The handler callable for the specified strategy.

Source code in src/guidellm/data/builders.py
@classmethod
def get_strategy_handler(cls, strategy: ShortPromptStrategy) -> Callable[..., Any]:
    """
    Look up the handler registered for a short prompt strategy.

    :param strategy: The short prompt strategy whose handler is wanted.
    :return: The entry of ``STRATEGY_HANDLERS`` for ``strategy``, typed as a
        generic callable.
    """
    handler = STRATEGY_HANDLERS[strategy]
    # cast() only informs the type checker; the object is returned unchanged.
    return cast("Callable[..., Any]", handler)

handle_concatenate(current_prompt, min_prompt_tokens, dataset_iterator, prompt_column, tokenizer, concat_delimiter, **_kwargs) staticmethod

Concatenates prompts until the minimum token requirement is met.

Parameters:

Name Type Description Default
current_prompt str

The initial prompt.

required
min_prompt_tokens int

Target minimum token length.

required
dataset_iterator Iterator[dict[str, Any]]

Iterator to fetch more prompts.

required
prompt_column str

Column key for prompt extraction.

required
tokenizer PreTrainedTokenizerBase

Tokenizer used to count tokens.

required
concat_delimiter str

Delimiter to use between prompts.

required

Returns:

Type Description
str | None

Concatenated prompt or None if not enough data.

Source code in src/guidellm/data/builders.py
@staticmethod
def handle_concatenate(
    current_prompt: str,
    min_prompt_tokens: int,
    dataset_iterator: Iterator[dict[str, Any]],
    prompt_column: str,
    tokenizer: PreTrainedTokenizerBase,
    concat_delimiter: str,
    **_kwargs,
) -> str | None:
    """
    Grow a short prompt by appending further prompts from the dataset.

    Rows are pulled from ``dataset_iterator`` one at a time and joined onto
    the prompt with ``concat_delimiter`` until the token count is reached.

    :param current_prompt: The prompt to start from.
    :param min_prompt_tokens: Token count the result must reach.
    :param dataset_iterator: Source of additional rows; it is advanced in place.
    :param prompt_column: Key of the prompt text inside each row.
    :param tokenizer: Tokenizer whose ``encode`` output length is the token count.
    :param concat_delimiter: Text inserted between joined prompts.
    :return: The combined prompt, or None when the iterator runs out first.
    """
    exhausted = object()  # unique marker so any real row value is distinguishable
    combined = current_prompt

    while len(tokenizer.encode(combined)) < min_prompt_tokens:
        row = next(dataset_iterator, exhausted)
        if row is exhausted:
            logger.warning(
                "Could not concatenate enough prompts to reach minimum "
                "length, ignoring"
            )
            return None
        combined = combined + (concat_delimiter + row[prompt_column])

    return combined

handle_error(current_prompt, min_prompt_tokens, tokenizer, **_kwargs) staticmethod

Raises an error if the prompt is too short.

Parameters:

Name Type Description Default
current_prompt str

The input prompt.

required
min_prompt_tokens int

Required token count.

required
tokenizer PreTrainedTokenizerBase

Tokenizer used to count tokens.

required

Returns:

Type Description
str | None

The input prompt if valid.

Raises:

Type Description
PromptTooShortError

If the prompt is too short.

Source code in src/guidellm/data/builders.py
@staticmethod
def handle_error(
    current_prompt: str,
    min_prompt_tokens: int,
    tokenizer: PreTrainedTokenizerBase,
    **_kwargs,
) -> str | None:
    """
    Fail loudly when a prompt does not reach the minimum token count.

    :param current_prompt: The prompt to validate.
    :param min_prompt_tokens: Token count the prompt must reach.
    :param tokenizer: Tokenizer whose ``encode`` output length is the token count.
    :return: The unchanged prompt when it is long enough.
    :raises PromptTooShortError: When the prompt has fewer tokens than required.
    """
    prompt_len = len(tokenizer.encode(current_prompt))
    if prompt_len >= min_prompt_tokens:
        return current_prompt

    raise PromptTooShortError(
        f"Found too short prompt: {current_prompt}, with length: {prompt_len}. "
        f"Minimum length required: {min_prompt_tokens}.",
    )

handle_ignore(current_prompt, min_prompt_tokens, tokenizer, **_kwargs) staticmethod

Ignores prompts that are shorter than the required minimum token length.

Parameters:

Name Type Description Default
current_prompt str

The input prompt string.

required
min_prompt_tokens int

Minimum required token count.

required
tokenizer PreTrainedTokenizerBase

Tokenizer used to count tokens.

required

Returns:

Type Description
str | None

The prompt if it meets the length, otherwise None.

Source code in src/guidellm/data/builders.py
@staticmethod
def handle_ignore(
    current_prompt: str,
    min_prompt_tokens: int,
    tokenizer: PreTrainedTokenizerBase,
    **_kwargs,
) -> str | None:
    """
    Drop a prompt that does not reach the minimum token count.

    :param current_prompt: The prompt to check.
    :param min_prompt_tokens: Token count the prompt must reach.
    :param tokenizer: Tokenizer whose ``encode`` output length is the token count.
    :return: The unchanged prompt when long enough; otherwise None, after
        logging a warning.
    """
    token_count = len(tokenizer.encode(current_prompt))
    if token_count >= min_prompt_tokens:
        return current_prompt

    logger.warning("Prompt too short, ignoring")
    return None

handle_pad(current_prompt, min_prompt_tokens, tokenizer, pad_char, pad_multiplier=2, **_kwargs) staticmethod

Pads the prompt with a character until it reaches the minimum token length.

Parameters:

Name Type Description Default
current_prompt str

The input prompt.

required
min_prompt_tokens int

Desired minimum token count.

required
tokenizer PreTrainedTokenizerBase

Tokenizer used to count tokens.

required
pad_char str

Character used for padding.

required
pad_multiplier int

Multiplier for padding character length.

2

Returns:

Type Description
str

Padded prompt string.

Source code in src/guidellm/data/builders.py
@staticmethod
def handle_pad(
    current_prompt: str,
    min_prompt_tokens: int,
    tokenizer: PreTrainedTokenizerBase,
    pad_char: str,
    pad_multiplier: int = 2,
    **_kwargs,
) -> str:
    """
    Pads the prompt with a character until it reaches the minimum token length.

    The number of pad characters appended per round starts at 1 and is
    multiplied by ``pad_multiplier`` after every round.

    :param current_prompt: The input prompt.
    :param min_prompt_tokens: Desired minimum token count.
    :param tokenizer: Tokenizer used to count tokens.
    :param pad_char: Character used for padding.
    :param pad_multiplier: Multiplier for padding character length.
    :return: Padded prompt string.
    :raises ValueError: If padding is required but cannot make progress
        because ``pad_char`` is empty/None or the pad size dropped to 0
        (``pad_multiplier`` of 0).
    """
    tokens = tokenizer.encode(current_prompt)
    if len(tokens) >= min_prompt_tokens:
        # Nothing to pad; pad settings are irrelevant in this case.
        return current_prompt

    if not pad_char:
        # An empty pad string never lengthens the prompt, so the loop below
        # would spin forever.
        raise ValueError("pad_char must be a non-empty string to pad a short prompt")

    pad_count = 1
    prompt = current_prompt
    while len(tokens) < min_prompt_tokens:
        if pad_count == 0:
            # Only reachable with pad_multiplier == 0: every further round
            # would append nothing.
            raise ValueError("pad_multiplier must be non-zero to pad a short prompt")
        prompt += pad_char * pad_count
        tokens = tokenizer.encode(prompt)
        pad_count *= pad_multiplier
    return prompt

parse_synthetic_config(config_input)

Parse PreprocessDatasetConfig from string or file path.

Reuses SyntheticTextDatasetDeserializer's parsing logic to support JSON strings, key=value pairs, and file paths (.json, .yaml, .yml, .config).

Parameters:

Name Type Description Default
config_input str | Path

String or path to config.

required

Returns:

Type Description
PreprocessDatasetConfig

Parsed PreprocessDatasetConfig instance.

Raises:

Type Description
ValueError

If the format is not recognized or parsing fails.

Source code in src/guidellm/data/builders.py
def parse_synthetic_config(
    config_input: str | Path,
) -> PreprocessDatasetConfig:
    """
    Build a PreprocessDatasetConfig from a string or a file path.

    Parsing is delegated to ``load_config``; a ``None`` result from it is
    treated as an unrecognized input.

    :param config_input: String or path holding the configuration.
    :return: The parsed PreprocessDatasetConfig instance.
    :raises ValueError: If ``load_config`` yields no configuration.
    """
    parsed = load_config(config_input, PreprocessDatasetConfig)
    if parsed is None:
        raise ValueError(
            f"Could not parse config from input: {config_input}. "
            "Expected JSON string, key=value pairs, or file path "
            "(.json, .yaml, .yml, .config)"
        )
    return parsed

process_dataset(data, output_path, processor, config, processor_args, data_args, data_column_mapper, short_prompt_strategy, pad_char, concat_delimiter, include_prefix_in_token_count, push_to_hub, hub_dataset_id, random_seed)

Main method to process and save a dataset with sampled prompt/output token counts.

Source code in src/guidellm/data/builders.py
def process_dataset(
    data: DataArgs,
    output_path: str | Path,
    processor: str | Path | PreTrainedTokenizerBase,
    config: str | Path,
    processor_args: dict[str, Any] | None,
    data_args: dict[str, Any] | None,
    data_column_mapper: dict[str, str] | None,
    short_prompt_strategy: ShortPromptStrategy,
    pad_char: str | None,
    concat_delimiter: str | None,
    include_prefix_in_token_count: bool,
    push_to_hub: bool,
    hub_dataset_id: str | None,
    random_seed: int,
) -> None:
    """
    Process a dataset into prompts with sampled token counts and save the result.

    :param data: Dataset arguments; ``data.load_kwargs`` is updated in place
        with ``data_args``.
    :param output_path: Destination of the processed dataset; its suffix is
        validated first.
    :param processor: Tokenizer, or name/path used to load one.
    :param config: String or path parsed with ``parse_synthetic_config``.
    :param processor_args: Extra arguments for loading the processor.
    :param data_args: Extra load kwargs merged into ``data.load_kwargs``.
    :param data_column_mapper: Settings validated into the column mapper config.
    :param short_prompt_strategy: Strategy selecting the short prompt handler.
    :param pad_char: Pad character forwarded to row processing.
    :param concat_delimiter: Delimiter forwarded to row processing.
    :param include_prefix_in_token_count: Flag forwarded to row processing.
    :param push_to_hub: Forwarded to the finalization step.
    :param hub_dataset_id: Forwarded to the finalization step.
    :param random_seed: Seed for dataset deserialization and token samplers.
    """
    _validate_output_suffix(output_path)
    logger.info(f"Starting dataset conversion | Input: {data} | Output: {output_path}")

    # Configuration and tokenizer
    parsed_config = parse_synthetic_config(config)
    tokenizer = check_load_processor(processor, processor_args, "dataset conversion.")

    # Dataset
    data.load_kwargs.update(data_args or {})
    dataset = DatasetDeserializerFactory.deserialize(
        config=data,
        processor_factory=lambda: tokenizer,
        random_seed=random_seed,
    )

    # Column mapper and the column names it resolves
    mapper: GenerativeColumnMapper = PreprocessorRegistry.create(  # type: ignore[assignment]
        config=DataPreprocessorArgs.model_validate(
            data_column_mapper  # type: ignore[arg-type]
        )
    )
    mapper.setup_data(datasets=[dataset])
    prompt_column, prefix_column, output_column = _extract_column_names(mapper)

    # Token samplers
    samplers = _create_token_samplers(parsed_config, random_seed)
    prompt_token_sampler, output_token_sampler, prefix_tokens_max = samplers

    # The same iterator is handed to the row processor, so rows it consumes
    # there are skipped by this loop.
    dataset_iterator = iter(dataset)
    prompt_handler = ShortPromptStrategyHandler.get_strategy_handler(
        short_prompt_strategy
    )
    processed_prompts = [
        processed_row
        for row in dataset_iterator
        if (
            processed_row := _process_single_row(
                row=row,
                prompt_column=prompt_column,
                prefix_column=prefix_column,
                prompt_token_sampler=prompt_token_sampler,
                output_token_sampler=output_token_sampler,
                tokenizer=tokenizer,
                prompt_handler=prompt_handler,
                dataset_iterator=dataset_iterator,
                include_prefix_in_token_count=include_prefix_in_token_count,
                pad_char=pad_char,
                concat_delimiter=concat_delimiter,
                output_column=output_column,
                prefix_tokens_max=prefix_tokens_max,
            )
        )
        is not None
    ]

    # Finalize
    _finalize_processed_dataset(
        processed_prompts,
        output_path,
        push_to_hub,
        hub_dataset_id,
    )

push_dataset_to_hub(hub_dataset_id, processed_dataset)

Pushes the processed dataset to Hugging Face Hub using HF_TOKEN.

Parameters:

Name Type Description Default
hub_dataset_id str | None

Identifier on the Hub to push to.

required
processed_dataset Dataset

HuggingFace Dataset object.

required

Raises:

Type Description
ValueError

If hub_dataset_id or HF_TOKEN is not available.

Source code in src/guidellm/data/builders.py
def push_dataset_to_hub(
    hub_dataset_id: str | None,
    processed_dataset: Dataset,
) -> None:
    """
    Upload the processed dataset to the Hugging Face Hub.

    The access token is read from the ``HF_TOKEN`` environment variable.

    :param hub_dataset_id: Identifier on the Hub to push to.
    :param processed_dataset: Dataset object whose ``push_to_hub`` is called.
    :raises ValueError: If ``hub_dataset_id`` or ``HF_TOKEN`` is missing or empty.
    """
    token = os.getenv("HF_TOKEN")
    if not (hub_dataset_id and token):
        raise ValueError(
            "hub_dataset_id and HF_TOKEN env var must be provided when push_to_hub"
            " is True"
        )
    processed_dataset.push_to_hub(hub_dataset_id, token=token)