Created
December 18, 2025 06:56
-
-
Save LxYuan0420/93db7cc99421aacacd397f203c9780c1 to your computer and use it in GitHub Desktop.
Single-file NVIDIA NeMo Data Designer example that generates a multilingual code-switching reasoning+debate dataset using the OpenAI provider. Each row is produced via one structured LLM call (then flattened into query, reasoning, debate, final_answer) with samplers for language-mix intensity (Singlish/Manglish-style English+中文+Bahasa Melayu), domain/topic, task kind, and difficulty.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = [ | |
| # "data-designer==0.2.0", | |
| # "python-dotenv>=1.0.0", | |
| # ] | |
| # /// | |
| r"""Generate a small multilingual reasoning+debate dataset (OpenAI provider). | |
| Suggested commands (copy/paste; includes all args): | |
| (Use a single trailing `\` for line-continuation; do NOT type `\\`.) | |
| uv: | |
| uv run nemo_data_designer_multilingual_codeswitch_reasoning_debate.py \ | |
| --model-alias openai-text \ | |
| --num-records 8 \ | |
| --artifact-path artifacts/multilingual_codeswitch_reasoning_debate \ | |
| --max-parallel-requests 8 \ | |
| --max-tokens 1500 \ | |
| --print-records 2 \ | |
| --no-drop-context-columns \ | |
| --no-keep-raw-structured \ | |
| --dotenv | |
| python (with an existing venv): | |
| source ../.venv/bin/activate | |
| python nemo_data_designer_multilingual_codeswitch_reasoning_debate.py \ | |
| --model-alias openai-text \ | |
| --num-records 8 \ | |
| --artifact-path artifacts/multilingual_codeswitch_reasoning_debate \ | |
| --max-parallel-requests 8 \ | |
| --max-tokens 1500 \ | |
| --print-records 2 \ | |
| --no-drop-context-columns \ | |
| --no-keep-raw-structured \ | |
| --dotenv | |
| This script uses NVIDIA NeMo Data Designer and the preconfigured OpenAI provider | |
| (`OPENAI_API_KEY`) to create a 4-column dataset: | |
| - query | |
| - reasoning | |
| - debate | |
| - final_answer | |
| Each column is generated in multilingual, code-switching style (with extra focus | |
| on Singlish/Manglish: English + Chinese + Malay). | |
| """ | |
| from __future__ import annotations | |
| import os | |
| from argparse import ArgumentParser, BooleanOptionalAction, Namespace | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| from pydantic import BaseModel, Field | |
| from data_designer.essentials import ( | |
| CategorySamplerParams, | |
| DataDesigner, | |
| DataDesignerConfigBuilder, | |
| DropColumnsProcessorConfig, | |
| ExpressionColumnConfig, | |
| LLMStructuredColumnConfig, | |
| ModelConfig, | |
| SamplerColumnConfig, | |
| SamplerType, | |
| SubcategorySamplerParams, | |
| ) | |
| from data_designer.interface.results import DatasetCreationResults | |
| from data_designer.config.models import ChatCompletionInferenceParams | |
# Default model alias looked up in the Data Designer model configs; the
# selected alias must use provider="openai" (checked by ensure_openai_ready).
DEFAULT_MODEL_ALIAS = "openai-text"
# Default directory for Data Designer run artifacts and the final dataset.
DEFAULT_ARTIFACT_PATH = Path("artifacts/multilingual_codeswitch_reasoning_debate")
class ReasoningDebateExample(BaseModel):
    """One dataset record with reasoning + debate + final answer.

    Used as the ``output_format`` of the structured LLM column in
    ``build_config``; the per-field ``description`` strings are part of the
    structured-output contract, so they must not be edited casually.
    """

    # Self-contained question; the generation prompt asks for enough
    # constraints to make it answerable without external facts.
    query: str = Field(
        ...,
        description="A realistic, multi-sentence question or user query with enough scenario/context and clear constraints.",
    )
    # Step-by-step working; the final answer is deliberately excluded here.
    reasoning: str = Field(
        ...,
        description="A long-form, detailed multi-step explanation leading toward the answer (no final answer).",
    )
    # Counterargument challenging the reasoning column (still no final answer).
    debate: str = Field(
        ...,
        description="A long-form critique/counterargument challenging the reasoning (no final answer; include alternative POV).",
    )
    # Concise conclusion resolving the debate; no step-by-step reasoning.
    final_answer: str = Field(
        ...,
        description="The final answer/conclusion (concise, no step-by-step reasoning).",
    )
def parse_args() -> Namespace:
    """Parse command-line arguments for the dataset-generation script.

    Boolean options use ``BooleanOptionalAction``, so each one also gets an
    automatic negated form (``--no-drop-context-columns``, ``--no-dotenv``, ...).
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--model-alias",
        type=str,
        default=DEFAULT_MODEL_ALIAS,
        help="Model alias from your Data Designer model configs (must use provider=openai).",
    )
    # Help strings added for consistency with the other options.
    parser.add_argument(
        "--num-records",
        type=int,
        default=8,
        help="Number of dataset records to generate.",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=str(DEFAULT_ARTIFACT_PATH),
        help="Directory where Data Designer artifacts and the final dataset are written.",
    )
    parser.add_argument(
        "--max-parallel-requests",
        type=int,
        default=None,
        help="Override chat-completion concurrency for the selected model alias (higher = faster, but watch rate limits).",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=None,
        help="Override max_tokens for the selected model alias (lower = faster/cheaper).",
    )
    parser.add_argument(
        "--print-records",
        type=int,
        default=2,
        help="Print up to N records as JSON to stdout (0 disables).",
    )
    parser.add_argument(
        "--drop-context-columns",
        action=BooleanOptionalAction,
        default=False,
        help="Drop sampler/context columns (language_mix/mix_level/domain/topic/task_kind/difficulty) from the final dataset.",
    )
    parser.add_argument(
        "--keep-raw-structured",
        action=BooleanOptionalAction,
        default=False,
        help="Keep the intermediate structured JSON column used to generate the 4 output fields.",
    )
    parser.add_argument(
        "--dotenv",
        action=BooleanOptionalAction,
        default=True,
        help="Load environment variables from a local .env file.",
    )
    return parser.parse_args()
def get_model_config(
    config_builder: DataDesignerConfigBuilder, model_alias: str
) -> ModelConfig:
    """Return the model config registered under *model_alias*.

    Raises:
        ValueError: if no config with that alias exists (the message lists
            every known alias, sorted).
    """
    found = next(
        (mc for mc in config_builder.model_configs if mc.alias == model_alias),
        None,
    )
    if found is not None:
        return found
    available_aliases = ", ".join(
        sorted(mc.alias for mc in config_builder.model_configs)
    )
    raise ValueError(
        f"Unknown model alias: {model_alias!r}. Available: {available_aliases}"
    )
def language_mix_sampler_params() -> CategorySamplerParams:
    """Build the category sampler for the `language_mix` column.

    Weights are unnormalized sampling probabilities (larger = sampled more
    often); Singlish/Manglish-centric mixes dominate by design.
    """
    weight_by_mix = {
        "Singlish/Manglish (English + 中文(简体) + Bahasa Melayu)": 10.0,
        "English + 中文(简体)": 4.0,
        "English + Bahasa Melayu (Manglish)": 4.0,
        "English + 中文(繁體)": 2.0,
        "English + Bahasa Indonesia": 1.0,
        "English + 日本語": 1.0,
        "English + 한국어": 1.0,
        "English + Tiếng Việt": 1.0,
        "English + Español": 1.0,
        "English + Français": 1.0,
        "English + Português": 1.0,
        "English + Deutsch": 1.0,
    }
    # dicts preserve insertion order, so values/weights stay aligned.
    return CategorySamplerParams(
        values=list(weight_by_mix),
        weights=list(weight_by_mix.values()),
    )
def ensure_openai_ready(model_config: ModelConfig) -> None:
    """Fail fast unless the model uses the OpenAI provider and a key is set.

    Raises:
        ValueError: if the config's provider is anything but "openai".
        EnvironmentError: if OPENAI_API_KEY is unset or empty.
    """
    if model_config.provider != "openai":
        message = (
            f"This script is intended to run with provider='openai', but model alias {model_config.alias!r} "
            f"uses provider={model_config.provider!r}."
        )
        raise ValueError(message)
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise EnvironmentError(
            "OPENAI_API_KEY is not set. Export it in your shell or add it to a local .env file."
        )
def resolve_chat_model_alias(
    config_builder: DataDesignerConfigBuilder,
    *,
    base_model_alias: str,
    max_parallel_requests: int | None,
    max_tokens: int | None,
) -> str:
    """Optionally add an overridden chat-completion model config and return its alias.

    With no overrides requested, the base alias is returned untouched.
    Otherwise a derived config (alias suffixed with mp<N>/mt<N>) is registered
    once and its alias returned on subsequent calls too.
    """
    base_config = get_model_config(config_builder, base_model_alias)
    ensure_openai_ready(base_config)

    # Collect the requested overrides and the alias suffix in one pass.
    overrides: dict[str, int] = {}
    alias_parts = [base_model_alias]
    if max_parallel_requests is not None:
        overrides["max_parallel_requests"] = max_parallel_requests
        alias_parts.append(f"mp{max_parallel_requests}")
    if max_tokens is not None:
        overrides["max_tokens"] = max_tokens
        alias_parts.append(f"mt{max_tokens}")

    if not overrides:
        return base_model_alias

    if not isinstance(
        base_config.inference_parameters, ChatCompletionInferenceParams
    ):
        raise ValueError(
            f"Model alias {base_model_alias!r} does not use chat-completion inference parameters; "
            "cannot override max_parallel_requests/max_tokens."
        )

    derived_alias = "-".join(alias_parts)
    registered = {mc.alias for mc in config_builder.model_configs}
    if derived_alias not in registered:
        config_builder.add_model_config(
            ModelConfig(
                alias=derived_alias,
                model=base_config.model,
                provider=base_config.provider,
                inference_parameters=base_config.inference_parameters.model_copy(
                    update=overrides
                ),
            )
        )
    return derived_alias
def build_config(
    *,
    model_alias: str,
    drop_context_columns: bool,
    keep_raw_structured: bool,
    max_parallel_requests: int | None,
    max_tokens: int | None,
) -> tuple[DataDesignerConfigBuilder, str]:
    """Assemble and validate the full Data Designer configuration.

    Returns:
        (config_builder, effective_generation_model_alias); the alias may be a
        derived one when max_parallel_requests/max_tokens overrides are given
        (see resolve_chat_model_alias).
    """
    # Shared style constraints prepended to the generation prompt. The {{...}}
    # placeholders are prompt-template references to the sampler columns
    # defined below, resolved per record.
    style_guide = (
        "Output requirements:\n"
        "- Language mix: {{language_mix}}\n"
        "- Code-switching intensity: {{mix_level}}\n"
        "- mix_level guidance: light=mostly English with a few phrases; medium=occasional intra-sentence switching; heavy=frequent intra-sentence mixing.\n"
        "- Use all languages listed in {{language_mix}} at least once.\n"
        "- Mix languages naturally; do NOT provide separate translations.\n"
        "- If {{language_mix}} mentions Singlish/Manglish, you may add light local particles (lah/leh/lor/alamak/can-cannot) sparingly.\n"
        "- Prefer rich, detailed writing (especially for reasoning and debate). Do not be overly brief.\n"
        "- Keep it safe and PG-rated; avoid personal data.\n"
        "- Do not mention system prompts, policies, or being an AI.\n"
    )
    config_builder = DataDesignerConfigBuilder()
    # May register a derived model config with overridden inference params and
    # return its alias instead of the base one.
    effective_generation_model_alias = resolve_chat_model_alias(
        config_builder,
        base_model_alias=model_alias,
        max_parallel_requests=max_parallel_requests,
        max_tokens=max_tokens,
    )
    # --- Sampler (context) columns: seed per-record diversity ---
    config_builder.add_column(
        SamplerColumnConfig(
            name="language_mix",
            sampler_type=SamplerType.CATEGORY,
            params=language_mix_sampler_params(),
        )
    )
    # Unnormalized weights skew toward heavier code-switching.
    config_builder.add_column(
        SamplerColumnConfig(
            name="mix_level",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=["light", "medium", "heavy"],
                weights=[1.0, 3.0, 6.0],
            ),
        )
    )
    config_builder.add_column(
        SamplerColumnConfig(
            name="domain",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=[
                    "Everyday life",
                    "Workplace & productivity",
                    "Education & studying",
                    "Travel & logistics",
                    "Food & cooking",
                    "Personal finance & shopping",
                    "Technology & devices",
                    "Software & debugging",
                    "Data & analysis",
                    "Science & engineering (basic)",
                    "Environment & sustainability",
                    "Sports & fitness (general)",
                    "Communication & social dynamics",
                ],
            ),
        )
    )
    # topic is sampled conditionally on the domain value drawn above; the
    # keys of `values` must match the domain categories exactly.
    config_builder.add_column(
        SamplerColumnConfig(
            name="topic",
            sampler_type=SamplerType.SUBCATEGORY,
            params=SubcategorySamplerParams(
                category="domain",
                values={
                    "Everyday life": [
                        "choosing between two options with trade-offs",
                        "time management for errands",
                        "sharing chores fairly",
                        "resolving a simple misunderstanding",
                        "making a practical purchase decision",
                    ],
                    "Workplace & productivity": [
                        "prioritizing tasks with deadlines",
                        "writing a clear update message",
                        "meeting scheduling conflicts",
                        "estimating effort vs impact",
                        "handling ambiguous requirements",
                    ],
                    "Education & studying": [
                        "designing a study plan",
                        "learning strategy selection",
                        "exam time allocation",
                        "group project coordination",
                        "improving memory and retention",
                    ],
                    "Travel & logistics": [
                        "route planning with constraints",
                        "time zones and arrival times",
                        "packing priorities under limits",
                        "public transport vs rideshare choice",
                        "budgeting an itinerary",
                    ],
                    "Food & cooking": [
                        "adjusting a recipe proportionally",
                        "substitutions under dietary constraints",
                        "meal prep planning",
                        "timing multiple dishes",
                        "choosing ingredients under budget",
                    ],
                    "Personal finance & shopping": [
                        "comparing discounts and bundles",
                        "subscription vs one-time purchase",
                        "simple budgeting trade-offs",
                        "saving vs convenience decision",
                        "avoiding impulse buying heuristics",
                    ],
                    "Technology & devices": [
                        "battery life vs portability",
                        "storage management strategy",
                        "Wi‑Fi troubleshooting logic",
                        "privacy settings trade-offs",
                        "device upgrade decision",
                    ],
                    "Software & debugging": [
                        "triaging a bug report",
                        "identifying a likely root cause from symptoms",
                        "choosing between two implementation approaches",
                        "writing a minimal repro scenario",
                        "reasoning about edge cases",
                    ],
                    "Data & analysis": [
                        "interpreting a small table of numbers",
                        "choosing an evaluation metric",
                        "spotting data leakage in a setup",
                        "thinking about sampling bias",
                        "basic A/B test reasoning (toy)",
                    ],
                    "Science & engineering (basic)": [
                        "units and proportional reasoning",
                        "cause-and-effect in a simple system",
                        "energy/efficiency trade-offs (qualitative)",
                        "reading a simple graph description",
                        "estimating with assumptions",
                    ],
                    "Environment & sustainability": [
                        "trade-offs of convenience vs waste",
                        "comparing two eco choices with constraints",
                        "reasoning about incentives and behavior",
                        "simple carbon footprint comparison (toy)",
                        "resource allocation for recycling",
                    ],
                    "Sports & fitness (general)": [
                        "planning a weekly routine",
                        "progression with constraints",
                        "rest vs training trade-offs",
                        "time-boxed workout choices",
                        "tracking habits and consistency",
                    ],
                    "Communication & social dynamics": [
                        "negotiating a compromise",
                        "responding to criticism constructively",
                        "setting boundaries politely",
                        "clarifying expectations",
                        "persuasion vs empathy trade-offs",
                    ],
                },
            ),
        )
    )
    config_builder.add_column(
        SamplerColumnConfig(
            name="task_kind",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=[
                    "deduction / logic puzzle",
                    "math word problem (self-contained)",
                    "time arithmetic / scheduling",
                    "planning with constraints",
                    "optimization (best choice under constraints)",
                    "probability (toy scenario)",
                    "estimation (Fermi-style, toy)",
                    "causal reasoning (simple, self-contained)",
                    "counterfactual reasoning (what-if change)",
                    "argument analysis (spot a fallacy)",
                    "compare-and-contrast (pros/cons)",
                    "decision making (trade-offs)",
                    "debugging reasoning (hypothesis -> test)",
                    "policy/design trade-off (non-political)",
                    "rubric-based evaluation (choose best)",
                ]
            ),
        )
    )
    # Weighted toward harder questions.
    config_builder.add_column(
        SamplerColumnConfig(
            name="difficulty",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=["easy", "medium", "hard"], weights=[1.0, 3.0, 4.0]
            ),
        )
    )
    system_prompt = (
        "You generate high-quality dataset examples for multilingual reasoning.\n"
        "Follow the user instructions precisely.\n"
        "Prefer rich, detailed content over overly short answers.\n"
        "Do not be concise for query/reasoning/debate; write long-form where it improves clarity.\n"
    )
    # --- Structured generation column: one LLM call producing all 4 fields.
    # Dropped from the final dataset unless keep_raw_structured is set.
    config_builder.add_column(
        LLMStructuredColumnConfig(
            name="example",
            drop=not keep_raw_structured,
            system_prompt=system_prompt,
            prompt=(
                f"{style_guide}\n"
                "Length guidance:\n"
                "- query: write a realistic, multi-sentence scenario with concrete constraints; not just a one-liner.\n"
                "- reasoning: write long-form, multi-paragraph explanation; include assumptions, intermediate conclusions, calculations, checks, and edge cases.\n"
                "- debate: write long-form, multi-paragraph critique; challenge key assumptions, surface possible mistakes, and propose at least one plausible alternative approach.\n"
                "- final_answer: concise but not abrupt; answer directly and resolve the debate.\n"
                "- If any of query/reasoning/debate feels too short, expand it with more detail.\n\n"
                "JSON formatting guidance:\n"
                "- Return valid JSON in a ```json code fence.\n"
                "- Use \\n\\n inside JSON string values for paragraph breaks (avoid unescaped literal newlines inside strings).\n\n"
                "Create ONE dataset record as a JSON object with these fields:\n"
                "- query: one self-contained question in the domain {{domain}} about {{topic}}.\n"
                "- reasoning: multi-step explanation leading toward the answer (do NOT give the final answer).\n"
                "- debate: a critique/counterargument that challenges the reasoning and offers an alternative POV (no final answer).\n"
                "- final_answer: the final answer/conclusion (concise; do NOT include step-by-step reasoning).\n\n"
                "The question should suit {{task_kind}} at {{difficulty}} difficulty.\n"
                "Include any needed numbers/constraints so it is answerable without external facts.\n"
            ),
            output_format=ReasoningDebateExample,
            model_alias=effective_generation_model_alias,
        )
    )
    # --- Flatten the structured record into the 4 final output columns ---
    config_builder.add_column(
        ExpressionColumnConfig(name="query", expr="{{ example.query }}")
    )
    config_builder.add_column(
        ExpressionColumnConfig(name="reasoning", expr="{{ example.reasoning }}")
    )
    config_builder.add_column(
        ExpressionColumnConfig(name="debate", expr="{{ example.debate }}")
    )
    config_builder.add_column(
        ExpressionColumnConfig(name="final_answer", expr="{{ example.final_answer }}")
    )
    # Optionally strip the sampler/context columns from the final dataset.
    if drop_context_columns:
        config_builder.add_processor(
            DropColumnsProcessorConfig(
                name="drop_context_columns",
                column_names=[
                    "language_mix",
                    "mix_level",
                    "domain",
                    "topic",
                    "task_kind",
                    "difficulty",
                ],
            )
        )
    config_builder.validate(raise_exceptions=True)
    return config_builder, effective_generation_model_alias
def create_dataset(
    config_builder: DataDesignerConfigBuilder,
    *,
    num_records: int,
    artifact_path: Path | str | None,
) -> DatasetCreationResults:
    """Run dataset generation and return the Data Designer results object."""
    designer = DataDesigner(artifact_path=artifact_path)
    results = designer.create(config_builder, num_records=num_records)
    return results
def main() -> int:
    """CLI entry point; returns the process exit code (0 on success)."""
    args = parse_args()
    # Pick up OPENAI_API_KEY (etc.) from a local .env unless disabled.
    if args.dotenv:
        load_dotenv(dotenv_path=Path(".env"), override=False)
    artifact_path = Path(args.artifact_path)
    artifact_path.mkdir(parents=True, exist_ok=True)
    config_builder, generation_model_alias = build_config(
        model_alias=args.model_alias,
        drop_context_columns=args.drop_context_columns,
        keep_raw_structured=args.keep_raw_structured,
        max_parallel_requests=args.max_parallel_requests,
        max_tokens=args.max_tokens,
    )
    gen_model_config = get_model_config(config_builder, generation_model_alias)
    print(
        f"Using OpenAI generation model alias: {gen_model_config.alias} (model={gen_model_config.model})"
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        artifact_path=artifact_path,
    )
    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
    df = results.load_dataset()
    # Also emit a JSONL copy for easy downstream consumption.
    jsonl_path = results.artifact_storage.base_dataset_path / "dataset.jsonl"
    df.to_json(jsonl_path, orient="records", lines=True, force_ascii=False)
    print(f"JSONL saved to: {jsonl_path}")
    # Optional stdout preview of the first N records.
    if args.print_records > 0:
        preview = df.head(args.print_records).to_json(
            orient="records", force_ascii=False, indent=2
        )
        print(preview)
    return 0
| if __name__ == "__main__": | |
| raise SystemExit(main()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment