Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save LxYuan0420/93db7cc99421aacacd397f203c9780c1 to your computer and use it in GitHub Desktop.

Select an option

Save LxYuan0420/93db7cc99421aacacd397f203c9780c1 to your computer and use it in GitHub Desktop.
Single-file NVIDIA NeMo Data Designer example that generates a multilingual code-switching reasoning+debate dataset using the OpenAI provider. Each row is produced via one structured LLM call (then flattened into query, reasoning, debate, final_answer) with samplers for language-mix intensity (Singlish/Manglish-style English+中文+Bahasa Melayu), domain/topic, task kind, and difficulty.
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "data-designer==0.2.0",
# "python-dotenv>=1.0.0",
# ]
# ///
r"""Generate a small multilingual reasoning+debate dataset (OpenAI provider).
Suggested commands (copy/paste; includes all args):
(Use a single trailing `\` for line-continuation; do NOT type `\\`.)
uv:
uv run nemo_data_designer_multilingual_codeswitch_reasoning_debate.py \
--model-alias openai-text \
--num-records 8 \
--artifact-path artifacts/multilingual_codeswitch_reasoning_debate \
--max-parallel-requests 8 \
--max-tokens 1500 \
--print-records 2 \
--no-drop-context-columns \
--no-keep-raw-structured \
--dotenv
python (with an existing venv):
source ../.venv/bin/activate
python nemo_data_designer_multilingual_codeswitch_reasoning_debate.py \
--model-alias openai-text \
--num-records 8 \
--artifact-path artifacts/multilingual_codeswitch_reasoning_debate \
--max-parallel-requests 8 \
--max-tokens 1500 \
--print-records 2 \
--no-drop-context-columns \
--no-keep-raw-structured \
--dotenv
This script uses NVIDIA NeMo Data Designer and the preconfigured OpenAI provider
(`OPENAI_API_KEY`) to create a 4-column dataset:
- query
- reasoning
- debate
- final_answer
Each column is generated in multilingual, code-switching style (with extra focus
on Singlish/Manglish: English + Chinese + Malay).
"""
from __future__ import annotations
import os
from argparse import ArgumentParser, BooleanOptionalAction, Namespace
from pathlib import Path
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from data_designer.essentials import (
CategorySamplerParams,
DataDesigner,
DataDesignerConfigBuilder,
DropColumnsProcessorConfig,
ExpressionColumnConfig,
LLMStructuredColumnConfig,
ModelConfig,
SamplerColumnConfig,
SamplerType,
SubcategorySamplerParams,
)
from data_designer.interface.results import DatasetCreationResults
from data_designer.config.models import ChatCompletionInferenceParams
# Alias of the preconfigured OpenAI chat model in the Data Designer model configs.
DEFAULT_MODEL_ALIAS = "openai-text"
# Default output directory for generated dataset artifacts.
DEFAULT_ARTIFACT_PATH = Path("artifacts/multilingual_codeswitch_reasoning_debate")
class ReasoningDebateExample(BaseModel):
    """One dataset record with reasoning + debate + final answer.

    Used as the structured-output schema for the LLM column in
    ``build_config``; each ``Field`` description is surfaced to the model to
    steer generation, and the four fields are later flattened into the
    dataset's output columns.
    """

    # NOTE: the description strings below are part of the generation prompt
    # contract — edit them deliberately, not cosmetically.
    query: str = Field(
        ...,
        description="A realistic, multi-sentence question or user query with enough scenario/context and clear constraints.",
    )
    reasoning: str = Field(
        ...,
        description="A long-form, detailed multi-step explanation leading toward the answer (no final answer).",
    )
    debate: str = Field(
        ...,
        description="A long-form critique/counterargument challenging the reasoning (no final answer; include alternative POV).",
    )
    final_answer: str = Field(
        ...,
        description="The final answer/conclusion (concise, no step-by-step reasoning).",
    )
def parse_args() -> Namespace:
    """Define and parse this script's command-line interface."""
    p = ArgumentParser()
    p.add_argument(
        "--model-alias",
        type=str,
        default=DEFAULT_MODEL_ALIAS,
        help="Model alias from your Data Designer model configs (must use provider=openai).",
    )
    p.add_argument("--num-records", type=int, default=8)
    p.add_argument("--artifact-path", type=str, default=str(DEFAULT_ARTIFACT_PATH))
    # Optional per-run overrides of the selected model's inference parameters.
    p.add_argument(
        "--max-parallel-requests",
        type=int,
        default=None,
        help="Override chat-completion concurrency for the selected model alias (higher = faster, but watch rate limits).",
    )
    p.add_argument(
        "--max-tokens",
        type=int,
        default=None,
        help="Override max_tokens for the selected model alias (lower = faster/cheaper).",
    )
    p.add_argument(
        "--print-records",
        type=int,
        default=2,
        help="Print up to N records as JSON to stdout (0 disables).",
    )
    # BooleanOptionalAction exposes paired --flag / --no-flag switches.
    p.add_argument(
        "--drop-context-columns",
        action=BooleanOptionalAction,
        default=False,
        help="Drop sampler/context columns (language_mix/mix_level/domain/topic/task_kind/difficulty) from the final dataset.",
    )
    p.add_argument(
        "--keep-raw-structured",
        action=BooleanOptionalAction,
        default=False,
        help="Keep the intermediate structured JSON column used to generate the 4 output fields.",
    )
    p.add_argument(
        "--dotenv",
        action=BooleanOptionalAction,
        default=True,
        help="Load environment variables from a local .env file.",
    )
    return p.parse_args()
def get_model_config(
    config_builder: DataDesignerConfigBuilder, model_alias: str
) -> ModelConfig:
    """Return the registered model config whose alias matches ``model_alias``.

    Raises:
        ValueError: if no registered config has that alias; the message lists
            every known alias to aid debugging.
    """
    found = next(
        (mc for mc in config_builder.model_configs if mc.alias == model_alias),
        None,
    )
    if found is not None:
        return found
    available_aliases = ", ".join(
        sorted(mc.alias for mc in config_builder.model_configs)
    )
    raise ValueError(
        f"Unknown model alias: {model_alias!r}. Available: {available_aliases}"
    )
def language_mix_sampler_params() -> CategorySamplerParams:
    """Build the category-sampler params for the per-row language mix.

    ``weights`` are unnormalized sampling probabilities (larger = sampled more
    often); the Singlish/Manglish blend deliberately dominates.
    """
    # Insertion order of this dict fixes the values/weights pairing.
    weight_by_mix: dict[str, float] = {
        "Singlish/Manglish (English + 中文(简体) + Bahasa Melayu)": 10.0,
        "English + 中文(简体)": 4.0,
        "English + Bahasa Melayu (Manglish)": 4.0,
        "English + 中文(繁體)": 2.0,
        "English + Bahasa Indonesia": 1.0,
        "English + 日本語": 1.0,
        "English + 한국어": 1.0,
        "English + Tiếng Việt": 1.0,
        "English + Español": 1.0,
        "English + Français": 1.0,
        "English + Português": 1.0,
        "English + Deutsch": 1.0,
    }
    return CategorySamplerParams(
        values=list(weight_by_mix),
        weights=list(weight_by_mix.values()),
    )
def ensure_openai_ready(model_config: ModelConfig) -> None:
    """Fail fast unless the config targets OpenAI and an API key is set.

    Raises:
        ValueError: when the model config's provider is not ``"openai"``.
        EnvironmentError: when ``OPENAI_API_KEY`` is unset or empty.
    """
    provider = model_config.provider
    if provider != "openai":
        raise ValueError(
            f"This script is intended to run with provider='openai', but model alias {model_config.alias!r} "
            f"uses provider={provider!r}."
        )
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise EnvironmentError(
            "OPENAI_API_KEY is not set. Export it in your shell or add it to a local .env file."
        )
def resolve_chat_model_alias(
    config_builder: DataDesignerConfigBuilder,
    *,
    base_model_alias: str,
    max_parallel_requests: int | None,
    max_tokens: int | None,
) -> str:
    """Optionally add an overridden chat-completion model config and return its alias.

    When neither override is given, the base alias is returned untouched.
    Otherwise a derived config (alias suffixed with ``mp<N>``/``mt<N>``) is
    registered — unless one with that alias already exists — and its alias is
    returned.
    """
    base = get_model_config(config_builder, base_model_alias)
    ensure_openai_ready(base)

    # Collect the requested inference-parameter overrides.
    overrides: dict[str, int] = {}
    suffix_parts: list[str] = []
    if max_parallel_requests is not None:
        overrides["max_parallel_requests"] = max_parallel_requests
        suffix_parts.append(f"mp{max_parallel_requests}")
    if max_tokens is not None:
        overrides["max_tokens"] = max_tokens
        suffix_parts.append(f"mt{max_tokens}")
    if not overrides:
        return base_model_alias

    # Overrides only make sense for chat-completion inference parameters.
    if not isinstance(base.inference_parameters, ChatCompletionInferenceParams):
        raise ValueError(
            f"Model alias {base_model_alias!r} does not use chat-completion inference parameters; "
            "cannot override max_parallel_requests/max_tokens."
        )

    derived_alias = "-".join([base_model_alias, *suffix_parts])
    if derived_alias in {mc.alias for mc in config_builder.model_configs}:
        # Already registered (e.g. a previous call with the same overrides).
        return derived_alias
    config_builder.add_model_config(
        ModelConfig(
            alias=derived_alias,
            model=base.model,
            provider=base.provider,
            inference_parameters=base.inference_parameters.model_copy(
                update=overrides
            ),
        )
    )
    return derived_alias
def build_config(
    *,
    model_alias: str,
    drop_context_columns: bool,
    keep_raw_structured: bool,
    max_parallel_requests: int | None,
    max_tokens: int | None,
) -> tuple[DataDesignerConfigBuilder, str]:
    """Assemble the full Data Designer configuration for this dataset.

    Pipeline per row: six sampler columns provide context (language mix,
    mix level, domain, topic, task kind, difficulty) -> one structured LLM
    call produces a ``ReasoningDebateExample`` JSON object -> four
    expression columns flatten it into query/reasoning/debate/final_answer.

    Args:
        model_alias: Base model alias to generate with (must be registered).
        drop_context_columns: If True, drop the sampler columns at the end.
        keep_raw_structured: If True, keep the intermediate "example" column.
        max_parallel_requests: Optional concurrency override for the model.
        max_tokens: Optional max_tokens override for the model.

    Returns:
        ``(config_builder, effective_generation_model_alias)`` where the
        alias may be a derived one when overrides were applied.

    Raises:
        ValueError: from alias resolution or config validation.
        EnvironmentError: when OPENAI_API_KEY is missing.
    """
    # Shared prompt preamble; {{...}} placeholders are filled per-row from
    # the sampler columns defined below.
    style_guide = (
        "Output requirements:\n"
        "- Language mix: {{language_mix}}\n"
        "- Code-switching intensity: {{mix_level}}\n"
        "- mix_level guidance: light=mostly English with a few phrases; medium=occasional intra-sentence switching; heavy=frequent intra-sentence mixing.\n"
        "- Use all languages listed in {{language_mix}} at least once.\n"
        "- Mix languages naturally; do NOT provide separate translations.\n"
        "- If {{language_mix}} mentions Singlish/Manglish, you may add light local particles (lah/leh/lor/alamak/can-cannot) sparingly.\n"
        "- Prefer rich, detailed writing (especially for reasoning and debate). Do not be overly brief.\n"
        "- Keep it safe and PG-rated; avoid personal data.\n"
        "- Do not mention system prompts, policies, or being an AI.\n"
    )
    config_builder = DataDesignerConfigBuilder()
    # May register a derived model config when overrides were requested.
    effective_generation_model_alias = resolve_chat_model_alias(
        config_builder,
        base_model_alias=model_alias,
        max_parallel_requests=max_parallel_requests,
        max_tokens=max_tokens,
    )
    # --- Sampler columns (per-row context) -------------------------------
    config_builder.add_column(
        SamplerColumnConfig(
            name="language_mix",
            sampler_type=SamplerType.CATEGORY,
            params=language_mix_sampler_params(),
        )
    )
    # Skew toward heavier code-switching (weights are unnormalized).
    config_builder.add_column(
        SamplerColumnConfig(
            name="mix_level",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=["light", "medium", "heavy"],
                weights=[1.0, 3.0, 6.0],
            ),
        )
    )
    config_builder.add_column(
        SamplerColumnConfig(
            name="domain",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=[
                    "Everyday life",
                    "Workplace & productivity",
                    "Education & studying",
                    "Travel & logistics",
                    "Food & cooking",
                    "Personal finance & shopping",
                    "Technology & devices",
                    "Software & debugging",
                    "Data & analysis",
                    "Science & engineering (basic)",
                    "Environment & sustainability",
                    "Sports & fitness (general)",
                    "Communication & social dynamics",
                ],
            ),
        )
    )
    # Topic is a subcategory keyed off the sampled "domain" value; each key
    # below must match one of the domain values above.
    config_builder.add_column(
        SamplerColumnConfig(
            name="topic",
            sampler_type=SamplerType.SUBCATEGORY,
            params=SubcategorySamplerParams(
                category="domain",
                values={
                    "Everyday life": [
                        "choosing between two options with trade-offs",
                        "time management for errands",
                        "sharing chores fairly",
                        "resolving a simple misunderstanding",
                        "making a practical purchase decision",
                    ],
                    "Workplace & productivity": [
                        "prioritizing tasks with deadlines",
                        "writing a clear update message",
                        "meeting scheduling conflicts",
                        "estimating effort vs impact",
                        "handling ambiguous requirements",
                    ],
                    "Education & studying": [
                        "designing a study plan",
                        "learning strategy selection",
                        "exam time allocation",
                        "group project coordination",
                        "improving memory and retention",
                    ],
                    "Travel & logistics": [
                        "route planning with constraints",
                        "time zones and arrival times",
                        "packing priorities under limits",
                        "public transport vs rideshare choice",
                        "budgeting an itinerary",
                    ],
                    "Food & cooking": [
                        "adjusting a recipe proportionally",
                        "substitutions under dietary constraints",
                        "meal prep planning",
                        "timing multiple dishes",
                        "choosing ingredients under budget",
                    ],
                    "Personal finance & shopping": [
                        "comparing discounts and bundles",
                        "subscription vs one-time purchase",
                        "simple budgeting trade-offs",
                        "saving vs convenience decision",
                        "avoiding impulse buying heuristics",
                    ],
                    "Technology & devices": [
                        "battery life vs portability",
                        "storage management strategy",
                        "Wi‑Fi troubleshooting logic",
                        "privacy settings trade-offs",
                        "device upgrade decision",
                    ],
                    "Software & debugging": [
                        "triaging a bug report",
                        "identifying a likely root cause from symptoms",
                        "choosing between two implementation approaches",
                        "writing a minimal repro scenario",
                        "reasoning about edge cases",
                    ],
                    "Data & analysis": [
                        "interpreting a small table of numbers",
                        "choosing an evaluation metric",
                        "spotting data leakage in a setup",
                        "thinking about sampling bias",
                        "basic A/B test reasoning (toy)",
                    ],
                    "Science & engineering (basic)": [
                        "units and proportional reasoning",
                        "cause-and-effect in a simple system",
                        "energy/efficiency trade-offs (qualitative)",
                        "reading a simple graph description",
                        "estimating with assumptions",
                    ],
                    "Environment & sustainability": [
                        "trade-offs of convenience vs waste",
                        "comparing two eco choices with constraints",
                        "reasoning about incentives and behavior",
                        "simple carbon footprint comparison (toy)",
                        "resource allocation for recycling",
                    ],
                    "Sports & fitness (general)": [
                        "planning a weekly routine",
                        "progression with constraints",
                        "rest vs training trade-offs",
                        "time-boxed workout choices",
                        "tracking habits and consistency",
                    ],
                    "Communication & social dynamics": [
                        "negotiating a compromise",
                        "responding to criticism constructively",
                        "setting boundaries politely",
                        "clarifying expectations",
                        "persuasion vs empathy trade-offs",
                    ],
                },
            ),
        )
    )
    config_builder.add_column(
        SamplerColumnConfig(
            name="task_kind",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=[
                    "deduction / logic puzzle",
                    "math word problem (self-contained)",
                    "time arithmetic / scheduling",
                    "planning with constraints",
                    "optimization (best choice under constraints)",
                    "probability (toy scenario)",
                    "estimation (Fermi-style, toy)",
                    "causal reasoning (simple, self-contained)",
                    "counterfactual reasoning (what-if change)",
                    "argument analysis (spot a fallacy)",
                    "compare-and-contrast (pros/cons)",
                    "decision making (trade-offs)",
                    "debugging reasoning (hypothesis -> test)",
                    "policy/design trade-off (non-political)",
                    "rubric-based evaluation (choose best)",
                ]
            ),
        )
    )
    # Skew toward harder questions.
    config_builder.add_column(
        SamplerColumnConfig(
            name="difficulty",
            sampler_type=SamplerType.CATEGORY,
            params=CategorySamplerParams(
                values=["easy", "medium", "hard"], weights=[1.0, 3.0, 4.0]
            ),
        )
    )
    # --- Structured LLM column (one call per row) ------------------------
    system_prompt = (
        "You generate high-quality dataset examples for multilingual reasoning.\n"
        "Follow the user instructions precisely.\n"
        "Prefer rich, detailed content over overly short answers.\n"
        "Do not be concise for query/reasoning/debate; write long-form where it improves clarity.\n"
    )
    config_builder.add_column(
        LLMStructuredColumnConfig(
            name="example",
            # Drop the raw JSON column unless the caller wants to keep it.
            drop=not keep_raw_structured,
            system_prompt=system_prompt,
            prompt=(
                f"{style_guide}\n"
                "Length guidance:\n"
                "- query: write a realistic, multi-sentence scenario with concrete constraints; not just a one-liner.\n"
                "- reasoning: write long-form, multi-paragraph explanation; include assumptions, intermediate conclusions, calculations, checks, and edge cases.\n"
                "- debate: write long-form, multi-paragraph critique; challenge key assumptions, surface possible mistakes, and propose at least one plausible alternative approach.\n"
                "- final_answer: concise but not abrupt; answer directly and resolve the debate.\n"
                "- If any of query/reasoning/debate feels too short, expand it with more detail.\n\n"
                "JSON formatting guidance:\n"
                "- Return valid JSON in a ```json code fence.\n"
                "- Use \\n\\n inside JSON string values for paragraph breaks (avoid unescaped literal newlines inside strings).\n\n"
                "Create ONE dataset record as a JSON object with these fields:\n"
                "- query: one self-contained question in the domain {{domain}} about {{topic}}.\n"
                "- reasoning: multi-step explanation leading toward the answer (do NOT give the final answer).\n"
                "- debate: a critique/counterargument that challenges the reasoning and offers an alternative POV (no final answer).\n"
                "- final_answer: the final answer/conclusion (concise; do NOT include step-by-step reasoning).\n\n"
                "The question should suit {{task_kind}} at {{difficulty}} difficulty.\n"
                "Include any needed numbers/constraints so it is answerable without external facts.\n"
            ),
            output_format=ReasoningDebateExample,
            model_alias=effective_generation_model_alias,
        )
    )
    # --- Flatten the structured record into the 4 output columns ---------
    config_builder.add_column(
        ExpressionColumnConfig(name="query", expr="{{ example.query }}")
    )
    config_builder.add_column(
        ExpressionColumnConfig(name="reasoning", expr="{{ example.reasoning }}")
    )
    config_builder.add_column(
        ExpressionColumnConfig(name="debate", expr="{{ example.debate }}")
    )
    config_builder.add_column(
        ExpressionColumnConfig(name="final_answer", expr="{{ example.final_answer }}")
    )
    # Optionally strip the sampler/context columns from the final dataset.
    if drop_context_columns:
        config_builder.add_processor(
            DropColumnsProcessorConfig(
                name="drop_context_columns",
                column_names=[
                    "language_mix",
                    "mix_level",
                    "domain",
                    "topic",
                    "task_kind",
                    "difficulty",
                ],
            )
        )
    config_builder.validate(raise_exceptions=True)
    return config_builder, effective_generation_model_alias
def create_dataset(
    config_builder: DataDesignerConfigBuilder,
    *,
    num_records: int,
    artifact_path: Path | str | None,
) -> DatasetCreationResults:
    """Run generation for ``num_records`` rows and return the creation results.

    Artifacts (dataset files, logs) are written under ``artifact_path``.
    """
    designer = DataDesigner(artifact_path=artifact_path)
    results = designer.create(config_builder, num_records=num_records)
    return results
def main() -> int:
    """CLI entry point: build the config, generate the dataset, export JSONL.

    Returns 0 on success (propagated to the shell via SystemExit).
    """
    args = parse_args()
    if args.dotenv:
        # override=False: variables already exported in the shell win.
        load_dotenv(dotenv_path=Path(".env"), override=False)

    out_dir = Path(args.artifact_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    config_builder, generation_model_alias = build_config(
        model_alias=args.model_alias,
        drop_context_columns=args.drop_context_columns,
        keep_raw_structured=args.keep_raw_structured,
        max_parallel_requests=args.max_parallel_requests,
        max_tokens=args.max_tokens,
    )
    gen_model_config = get_model_config(config_builder, generation_model_alias)
    print(
        f"Using OpenAI generation model alias: {gen_model_config.alias} (model={gen_model_config.model})"
    )

    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        artifact_path=out_dir,
    )
    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    # Also export a JSONL copy next to the base dataset artifacts.
    df = results.load_dataset()
    jsonl_path = results.artifact_storage.base_dataset_path / "dataset.jsonl"
    df.to_json(jsonl_path, orient="records", lines=True, force_ascii=False)
    print(f"JSONL saved to: {jsonl_path}")

    if args.print_records > 0:
        preview = df.head(args.print_records).to_json(
            orient="records", force_ascii=False, indent=2
        )
        print(preview)
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer return value to the shell as the exit code.
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment