# Source code for mmirage.config.utils

"""Configuration loading utilities for MMIRAGE pipeline."""

from typing import Any, Dict, List, TypeAlias, Union, cast
from dacite import Config, from_dict
import yaml
import os

from mmirage.config.config import MMirageConfig
from mmirage.core.process.base import BaseProcessorConfig, ProcessorRegistry, OutputVar
from mmirage.core.loader.base import BaseDataLoaderConfig, DataLoaderRegistry

# Register built-in processors/loaders.
#
# We import configuration modules (lightweight) here so the registries know how
# to construct config/output-var objects from YAML without importing heavy
# processor implementations (e.g. torch/transformers).
import mmirage.core.process.processors.llm.config  # noqa: F401
import mmirage.core.loader.jsonl  # noqa: F401
import mmirage.core.loader.local_hf  # noqa: F401

# Recursive type describing the parsed YAML tree that environment-variable
# expansion walks: a string, a list of such values, or a string-keyed dict of
# such values. Non-string scalars (ints, floats, bools, null) are not modelled
# by this alias.
EnvValue: TypeAlias = Union[str, List["EnvValue"], Dict[str, "EnvValue"]]


def load_mmirage_config(config_path: str) -> MMirageConfig:
    """Load an MMIRAGE configuration from a YAML file.

    The file is parsed with ``yaml.safe_load``, every string value in the
    resulting tree is passed through ``os.path.expandvars`` (so ``$VAR`` and
    ``${VAR}`` references are substituted; references to undefined variables
    are left unchanged), and the result is converted to ``MMirageConfig`` with
    dacite. Processor, data-loader and output-variable entries are dispatched
    to a concrete config class looked up in the registries from the entry's
    ``type`` key.

    Illustrative example (the nesting shown is indicative only; the
    dataclasses reachable from ``MMirageConfig`` define the authoritative
    schema)::

        processors:
          - type: llm
            server_args:
              model_path: Qwen/Qwen2-VL-7B-Instruct
              tp_size: 4
              trust_remote_code: true
            chat_template: qwen2-vl
            default_sampling_params:
              temperature: 0.1
              top_p: 0.9
              max_new_tokens: 1024

        loading_params:
          datasets:
            - path: /path/to/dataset.jsonl
              type: JSONL
              output_dir: /path/to/output
          image_base_path: /path/to/images
          num_shards: 4
          shard_id: 0
          batch_size: 64

        processing_params:
          inputs:
            - name: text
              key: text
            - name: image
              key: image_path
              type: image
          outputs:
            - name: formatted_answer
              type: llm
              output_type: JSON
              output_schema:
                - question
                - answer
              prompt: |
                Generate a Q&A pair from: {{ text }}
          remove_columns: True
          output_schema:
            conversations:
              - role: "user"
                content: "{{ formatted_answer.question }}"
              - role: "assistant"
                content: "{{ formatted_answer.answer }}"

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        MMirageConfig: Parsed configuration object.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML document is not a mapping.
        KeyError: If a processor, loader or output entry has no ``type`` key.
    """
    # YAML files are expected to be UTF-8; being explicit avoids depending on
    # the platform's locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        # An empty (or otherwise falsy) document is treated as an empty config.
        cfg: EnvValue = yaml.safe_load(f) or {}

    # Fail early with a clear message rather than handing a list/scalar to
    # dacite, which expects a mapping of field names to values.
    if not isinstance(cfg, dict):
        raise ValueError(
            f"Top-level YAML document in {config_path!r} must be a mapping, "
            f"got {type(cfg).__name__}"
        )

    def expand_env_vars(obj: EnvValue) -> EnvValue:
        """Recursively expand environment variables in all string values."""
        if isinstance(obj, dict):
            return {key: expand_env_vars(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [expand_env_vars(item) for item in obj]
        if isinstance(obj, str):
            return os.path.expandvars(obj)
        # Non-string scalars (numbers, booleans, None) pass through untouched.
        return obj

    # The hooks below read ``config`` from the enclosing scope; it is bound
    # further down, before ``from_dict`` can invoke any of them.
    def processor_config_hook(data: Dict[str, Any]) -> BaseProcessorConfig:
        """Build the processor config class registered for ``data["type"]``."""
        clz = ProcessorRegistry.get_config_cls(data["type"])
        return from_dict(clz, data, config=config)

    def loader_config_hook(data: Dict[str, Any]) -> BaseDataLoaderConfig:
        """Build the loader config class registered for ``data["type"]``."""
        clz = DataLoaderRegistry.get_config_cls(data["type"])
        return from_dict(clz, data, config=config)

    def output_var_hook(data: Dict[str, Any]) -> OutputVar:
        """Build the output-variable class registered for ``data["type"]``."""
        clz = ProcessorRegistry.get_output_var_cls(data["type"])
        return from_dict(clz, data, config=config)

    cfg = expand_env_vars(cfg)

    # Fields annotated with one of these base types are routed through the
    # matching hook so the concrete subclass is chosen from the YAML entry.
    config = Config(
        type_hooks={
            BaseProcessorConfig: processor_config_hook,
            BaseDataLoaderConfig: loader_config_hook,
            OutputVar: output_var_hook,
        }
    )

    return from_dict(MMirageConfig, cast(dict, cfg), config=config)