# Source code for mmirage.core.loader.jsonl
"""JSONL data loader implementation."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Optional, Union, override
from datasets import (
Dataset,
DatasetDict,
IterableDataset,
IterableDatasetDict,
load_dataset,
)
from mmirage.core.loader.base import (
BaseDataLoader,
DataLoaderRegistry,
BaseDataLoaderConfig,
DatasetLike,
)
# [docs]
@dataclass
class JSONLDataConfig(BaseDataLoaderConfig):
    """Settings that describe where a JSONL dataset lives.

    Attributes:
        type: Loader type identifier; expected to be "JSONL" for this
            config (presumably declared on ``BaseDataLoaderConfig``).
        path: Either one JSONL file path, or a dict whose keys are split
            names and whose values are the file paths for those splits.
            Defaults to the empty string.
        output_dir: Directory where processed output is saved
            (presumably declared on ``BaseDataLoaderConfig``).
    """

    # A plain string means "one file"; a dict means "one file per split".
    path: Union[str, Dict[str, str]] = ""
@DataLoaderRegistry.register("JSONL", JSONLDataConfig)
class JSONLDataLoader(BaseDataLoader[JSONLDataConfig]):
    """Loader for datasets stored as JSON Lines (JSONL) files.

    Reading is delegated to the Hugging Face ``datasets`` library through
    its ``"json"`` builder. The configured path may be one file or a
    mapping of split names to files.

    Note:
        Streaming (iterable) datasets are rejected by this loader.
    """

    def __init__(self) -> None:
        """Create the loader; all setup is delegated to the base class."""
        super().__init__()

    @override
    def from_config(self, ds_config: JSONLDataConfig) -> Optional[DatasetLike]:
        """Read the JSONL data referenced by ``ds_config``.

        Args:
            ds_config: Configuration whose ``path`` is a single JSONL file
                path or a dict mapping split names to file paths.

        Returns:
            The ``"train"`` entry of the loaded result when ``path`` is a
            string; otherwise the object returned by ``load_dataset``
            unchanged (one entry per configured split).

        Raises:
            RuntimeError: If ``load_dataset`` returns an iterable dataset.
        """
        path = ds_config.path
        loaded = load_dataset("json", data_files=path, streaming=False)

        # Streaming is turned off above; this guard stops an iterable
        # result from ever reaching the caller.
        if isinstance(loaded, IterableDataset) or isinstance(
            loaded, IterableDatasetDict
        ):
            raise RuntimeError(f"Iterable datasets are not supported for path: {path}")

        # A single file path is exposed under the "train" key, so unwrap it
        # and hand back a plain dataset; split mappings are returned as-is.
        return loaded["train"] if isinstance(path, str) else loaded