Source code for nanoml.data
from datasets import load_from_disk, load_dataset, DatasetDict
from datasets.config import DATASET_STATE_JSON_FILENAME
from pathlib import Path
def load_dataset_flexible(dataset_path: str, *args, **kwargs):
    """Load a dataset from disk or from the Hugging Face Hub, based on the dataset path.

    If ``dataset_path`` points to a directory written with ``save_to_disk``
    (detected by the presence of its ``state.json`` file), the dataset is loaded
    with ``load_from_disk``; otherwise the path is forwarded to ``load_dataset``.

    Args:
        dataset_path (str): The path to the dataset.
        *args: Additional positional arguments forwarded to the underlying loader.
        **kwargs: Additional keyword arguments forwarded to the underlying loader.

    Returns:
        datasets.Dataset | datasets.DatasetDict: The loaded dataset.
    """
    # A dataset saved with save_to_disk contains DATASET_STATE_JSON_FILENAME
    # ("state.json"); a Hub dataset id or a dataset script path does not.
    if Path(dataset_path, DATASET_STATE_JSON_FILENAME).exists():
        return load_from_disk(dataset_path, *args, **kwargs)
    return load_dataset(dataset_path, *args, **kwargs)
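# A minimal usage sketch (illustrative, not part of nanoml): "imdb" is a public
# Hub dataset id, and "./my_saved_dataset" is a hypothetical local directory
# assumed to have been created with Dataset.save_to_disk. Both calls resolve
# through the state.json check above.
def _demo_load_dataset_flexible():
    # No state.json exists at "./imdb", so this falls through to load_dataset().
    hub_ds = load_dataset_flexible("imdb", split="train")
    # A directory written by save_to_disk contains state.json, so this call
    # dispatches to load_from_disk() instead.
    local_ds = load_dataset_flexible("./my_saved_dataset")
    return hub_ds, local_ds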
def split_hf_dataset(dataset, val_size=0.1, test_size=0.1, **kwargs):
    """Split a Hugging Face dataset into train, validation, and test sets.

    Args:
        dataset (datasets.Dataset): The dataset to split.
        val_size (float | int, optional): The size of the validation set. Defaults to 0.1.
        test_size (float | int, optional): The size of the test set, taken from the
            examples remaining after the validation split. Defaults to 0.1.
        **kwargs: Additional keyword arguments to pass to the `train_test_split` method.

    Returns:
        datasets.DatasetDict: A dictionary containing the train, validation, and test sets.
    """
    # Carve off the validation set first, then split what remains into train
    # and test. A fractional test_size is therefore relative to the remaining
    # train portion, not to the original dataset.
    train_val = dataset.train_test_split(test_size=val_size, **kwargs)
    val = train_val["test"]
    train_test = train_val["train"].train_test_split(test_size=test_size, **kwargs)
    train = train_test["train"]
    test = train_test["test"]
    return DatasetDict({"train": train, "val": val, "test": test})
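# A minimal usage sketch (the "imdb" dataset and seed=42 are illustrative
# assumptions): with the defaults, val is 10% of the original dataset and
# test is 10% of the remaining 90%, i.e. 9% of the original. A seed passed
# through **kwargs reaches train_test_split and makes the splits reproducible.
if __name__ == "__main__":
    raw = load_dataset("imdb", split="train")
    splits = split_hf_dataset(raw, val_size=0.1, test_size=0.1, seed=42)
    print({name: len(ds) for name, ds in splits.items()})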