Source code for mmirage.core.loader.utils
"""Utility functions for loading datasets and handling images."""
from __future__ import annotations
import os
from typing import Any, List, Optional, Union
from datasets import Dataset, DatasetDict
from PIL import Image
from mmirage.core.loader.base import AutoDataLoader, BaseDataLoaderConfig, DatasetLike
import logging
logger = logging.getLogger(__name__)
[docs]
def load_datasets_from_configs(configs: List[BaseDataLoaderConfig]) -> List[DatasetLike]:
    """Load every dataset described by ``configs`` that can be loaded.

    One loader instance is created per distinct ``config.type`` (obtained via
    ``AutoDataLoader.from_name``) and reused for all configs of that type.
    A config whose ``from_config`` call raises is reported with a warning and
    skipped; a config for which the loader returns ``None`` is skipped
    without a log message.

    Note that loader construction happens outside the guarded call, so an
    error raised while looking up or instantiating a loader propagates to
    the caller.

    Args:
        configs: Dataset configuration objects to load.

    Returns:
        The successfully loaded datasets, in the order of their configs.

    Raises:
        RuntimeError: If not a single dataset was loaded.
    """
    loaders = {}
    loaded: List[DatasetLike] = []

    for cfg in configs:
        kind = cfg.type

        # Reuse the loader already built for this type, if any.
        handler = loaders.get(kind)
        if handler is None:
            handler = loaders[kind] = AutoDataLoader.from_name(kind)()

        try:
            dataset = handler.from_config(cfg)
        except Exception as err:
            # Best effort: one broken dataset must not abort the others.
            logger.warning(f"Dataset loading failed with error: {err}. Skipping")
        else:
            if dataset is not None:
                loaded.append(dataset)

    if loaded:
        return loaded
    raise RuntimeError("No valid datasets loaded from the provided configs.")
[docs]
def resolve_image_input(value: Union[Image.Image, str], image_base_path: Optional[str] = None) -> Union[Image.Image, str]:
    """Resolve image input to a format SGLang can use.

    Handles multiple image input formats:

    - PIL Image objects and any other non-string values: passed through
    - URLs (http/https): passed through as-is
    - Absolute paths that exist: validated to be a file and passed through
    - Other paths: joined onto ``image_base_path`` (when given) and validated
    - Anything else: returned unchanged for the consumer to handle

    Args:
        value: The image value to resolve (PIL Image, path string, or URL).
        image_base_path: Optional base directory for resolving relative paths.

    Returns:
        The original object for non-string input, otherwise the URL or the
        (possibly resolved) file path.

    Raises:
        FileNotFoundError: If ``image_base_path`` is given and the joined
            path does not exist.
        RuntimeError: If the absolute or resolved path exists but is not a
            file (e.g. a directory, or a symlink pointing at one).
    """
    # PIL images (and any other non-string object) need no resolution.
    if not isinstance(value, str):
        return value

    # Remote images are fetched by the consumer.
    if value.startswith(("http://", "https://")):
        return value

    # Existing absolute path: must be a file. os.path.isfile follows
    # symlinks, so a link to a file is accepted and a link to a directory
    # is rejected like the directory itself.
    if os.path.isabs(value) and os.path.exists(value):
        if not os.path.isfile(value):
            raise RuntimeError(f"The provided path {value} exists but is not a file")
        return value

    # Relative (or missing absolute) path: resolve against the base path.
    if image_base_path:
        resolved_path = os.path.join(image_base_path, value)
        if not os.path.exists(resolved_path):
            raise FileNotFoundError(
                f"Resolved image path '{resolved_path}' does not exist "
                f"(from base '{image_base_path}' and relative path '{value}')."
            )
        # Same validation as the absolute case; also catches an empty
        # ``value`` resolving to the base directory itself.
        if not os.path.isfile(resolved_path):
            raise RuntimeError(f"The provided path {resolved_path} exists but is not a file")
        return resolved_path

    # No base path - return as-is and let SGLang handle it.
    return value