# Source code for mmirage.core.loader.local_hf
"""Local Hugging Face dataset loader implementation."""
from dataclasses import dataclass
from typing import Optional
from datasets import (
load_from_disk,
IterableDatasetDict,
IterableDataset,
)
from mmirage.core.loader.base import (
BaseDataLoader,
BaseDataLoaderConfig,
DataLoaderRegistry,
DatasetLike,
)
[docs]
@dataclass
class LocalHFConfig(BaseDataLoaderConfig):
    """Settings describing a Hugging Face dataset stored on the local disk.

    Attributes:
        type: Loader type identifier; the matching loader is registered
            under ``"loadable"``. (Not declared in this class — presumably
            inherited from ``BaseDataLoaderConfig``; confirm there.)
        path: Directory holding the saved Hugging Face dataset. Defaults to
            an empty string.
        output_dir: Directory where processed output is written. (Not
            declared in this class — presumably inherited from
            ``BaseDataLoaderConfig``; confirm there.)
    """

    # Location of the dataset directory on disk.
    path: str = ""
@DataLoaderRegistry.register("loadable", LocalHFConfig)
class LocalHFDataLoader(BaseDataLoader[LocalHFConfig]):
    """Loader for Hugging Face datasets that live in a local directory.

    The directory is read back with ``datasets.load_from_disk``, i.e. it is
    expected to have been produced by the Hugging Face ``save_to_disk``
    method.

    Note:
        Iterable (streaming) datasets are rejected by this loader.
    """

    def from_config(self, ds_config: LocalHFConfig) -> Optional[DatasetLike]:
        """Read the dataset stored at ``ds_config.path`` and return it.

        Args:
            ds_config: Loader configuration; only its ``path`` attribute is
                read here.

        Returns:
            The object returned by ``load_from_disk`` for that directory
            (a ``Dataset`` or a ``DatasetDict``).

        Raises:
            RuntimeError: If the loaded object is an ``IterableDataset`` or
                an ``IterableDatasetDict``.
        """
        # Dataset flavours this loader refuses to hand back to callers.
        unsupported_kinds = (IterableDatasetDict, IterableDataset)

        loaded = load_from_disk(ds_config.path)
        if not isinstance(loaded, unsupported_kinds):
            return loaded

        raise RuntimeError(
            f"Iterable datasets are not supported for path: {ds_config.path}"
        )