Source code for nanoml.data
from datasets import load_from_disk, load_dataset, DatasetDict
from datasets.config import DATASET_STATE_JSON_FILENAME
from pathlib import Path
def load_dataset_flexible(dataset_path: str, *args, **kwargs):
    """Load a dataset from disk or from the Hugging Face Hub, based on the dataset path.

    If ``dataset_path`` points to a directory written with ``save_to_disk``
    (detected by the presence of its ``state.json`` file), the dataset is loaded
    with ``load_from_disk``; otherwise the path is forwarded to ``load_dataset``.

    Args:
        dataset_path (str): The path to the dataset.
        *args: Additional positional arguments forwarded to the underlying loader.
        **kwargs: Additional keyword arguments forwarded to the underlying loader.

    Returns:
        datasets.Dataset | datasets.DatasetDict: The loaded dataset.
    """
    # A dataset saved with save_to_disk contains DATASET_STATE_JSON_FILENAME
    # ("state.json"); a Hub dataset id or a dataset script path does not.
    if Path(dataset_path, DATASET_STATE_JSON_FILENAME).exists():
        return load_from_disk(dataset_path, *args, **kwargs)
    return load_dataset(dataset_path, *args, **kwargs)
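# A minimal usage sketch (illustrative, not part of nanoml): "imdb" is a public
# Hub dataset id, and "./my_saved_dataset" is a hypothetical local directory
# assumed to have been created with Dataset.save_to_disk. Both calls resolve
# through the state.json check above.
def _demo_load_dataset_flexible():
    # No state.json exists at "./imdb", so this falls through to load_dataset().
    hub_ds = load_dataset_flexible("imdb", split="train")
    # A directory written by save_to_disk contains state.json, so this call
    # dispatches to load_from_disk() instead.
    local_ds = load_dataset_flexible("./my_saved_dataset")
    return hub_ds, local_ds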
def split_hf_dataset(dataset, val_size=0.1, test_size=0.1, **kwargs):
    """Split a Hugging Face dataset into train, validation, and test sets.

    Args:
        dataset (datasets.Dataset): The dataset to split.
        val_size (float | int, optional): The size of the validation set. Defaults to 0.1.
        test_size (float | int, optional): The size of the test set, taken from the
            examples remaining after the validation split. Defaults to 0.1.
        **kwargs: Additional keyword arguments to pass to the `train_test_split` method.

    Returns:
        datasets.DatasetDict: A dictionary containing the train, validation, and test sets.
    """
    # Carve off the validation set first, then split what remains into train
    # and test. A fractional test_size is therefore relative to the remaining
    # train portion, not to the original dataset.
    train_val = dataset.train_test_split(test_size=val_size, **kwargs)
    val = train_val["test"]
    train_test = train_val["train"].train_test_split(test_size=test_size, **kwargs)
    train = train_test["train"]
    test = train_test["test"]
    return DatasetDict({"train": train, "val": val, "test": test})
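# A minimal usage sketch (the "imdb" dataset and seed=42 are illustrative
# assumptions): with the defaults, val is 10% of the original dataset and
# test is 10% of the remaining 90%, i.e. 9% of the original. A seed passed
# through **kwargs reaches train_test_split and makes the splits reproducible.
if __name__ == "__main__":
    raw = load_dataset("imdb", split="train")
    splits = split_hf_dataset(raw, val_size=0.1, test_size=0.1, seed=42)
    print({name: len(ds) for name, ds in splits.items()})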