@bbrowning
bbrowning / proxy_guidelines.md
Last active May 7, 2026 13:00
Guidelines for proxies between clients and vLLM

vLLM Proxy Implementation Guidelines

1. Introduction

Problem Statement

Many projects and products implement network proxies that sit between clients and vLLM, including:

  • Content filtering and guardrail systems - Safety and compliance enforcement
  • Traffic management and routing layers - Load balancing and service mesh integration
@bbrowning
bbrowning / chat_template_gemma_large_fixed.jinja
Created April 17, 2026 00:24
gemma 4 chat template that works with opencode - download the .jinja file and tell vllm to use it via `--chat-template chat_template_gemma_large_fixed.jinja`
{%- macro format_parameters(properties, required) -%}
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in properties | dictsort -%}
{%- set add_comma = false -%}
{%- if key not in standard_keys -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{{ key }}:{
{%- if value['description'] -%}
@bbrowning
bbrowning / 0_instructions.md
Last active April 14, 2026 01:47
A simple proxy to convert a non-streaming Chat Completions request into a streaming one

Simple non-streaming to streaming Chat Completions Proxy

This is a simple proxy I use to run non-streaming evals (like BFCL multi_turn) against the vLLM server's streaming request/response path. Run the vLLM server as usual, run the proxy (via `python proxy.py`), and point BFCL at http://localhost:8001/v1 instead of http://localhost:8000/v1 to exercise the streaming path.

This means you can start vLLM once and run BFCL twice, once streaming and once non-streaming, just by changing OPENAI_BASE_URL, to verify basic correctness of the streaming reasoning and tool call parsers.

The entire script was written by Gemini in one shot, but it seems to work so far in basic testing.
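
The preview above does not include proxy.py itself, but the core idea is small enough to sketch. The snippet below is a rough illustration, not the gist's actual code: it assumes FastAPI, httpx, and uvicorn, forwards each non-streaming request to vLLM with stream=True, and stitches the SSE chunks back into a single Chat Completions response. It only reassembles text content; the real proxy also has to merge tool call and reasoning deltas.

```python
# Hypothetical sketch only -- not the proxy.py from this gist.
import json

import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

VLLM_URL = "http://localhost:8000/v1/chat/completions"  # assumed upstream vLLM
app = FastAPI()


@app.post("/v1/chat/completions")
async def chat_completions(request: Request) -> JSONResponse:
    body = await request.json()
    body["stream"] = True  # force vLLM's streaming path

    content_parts: list[str] = []
    finish_reason = None
    model = body.get("model", "")

    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("POST", VLLM_URL, json=body) as resp:
            async for line in resp.aiter_lines():
                if not line.startswith("data: "):
                    continue
                data = line[len("data: "):]
                if data.strip() == "[DONE]":
                    break
                chunk = json.loads(data)
                choices = chunk.get("choices") or []
                if not choices:
                    continue  # e.g. a usage-only chunk
                choice = choices[0]
                delta = choice.get("delta") or {}
                if delta.get("content"):
                    content_parts.append(delta["content"])
                if choice.get("finish_reason"):
                    finish_reason = choice["finish_reason"]
                model = chunk.get("model", model)

    # Hand the caller a single, non-streaming-shaped response.
    return JSONResponse({
        "id": "proxy-reassembled",
        "object": "chat.completion",
        "model": model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": "".join(content_parts)},
            "finish_reason": finish_reason or "stop",
        }],
    })


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8001)
```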

@bbrowning
bbrowning / instructions.md
Last active April 19, 2026 14:33
Compile recent vLLM builds from source on DGX Spark

Compiling vLLM main from source on DGX Spark

I do all this SSH'd into the DGX Spark from another machine, so everything here is terminal commands.

Install Python dev dependencies and uv

sudo apt install python3-dev
curl -LsSf https://astral.sh/uv/install.sh | sh
@bbrowning
bbrowning / sm120_nvfp4_moe.diff
Created November 21, 2025 20:56
Changes required to get latest main of vLLM running Qwen3 MoE NVFP4 on DGX Spark
diff --git a/csrc/ops.h b/csrc/ops.h
index f8bdc61aa..933c64db0 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -218,6 +218,7 @@ bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability);
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);
bool cutlass_group_gemm_supported(int64_t cuda_device_capability);
+bool cutlass_moe_mm_supports_fp4(int64_t cuda_device_capability);
@bbrowning
bbrowning / Dockerfile.dgx_spark
Created November 21, 2025 00:14
Dockerfile to create vLLM v0.11.2 containers for DGX Spark
# A crude copy of vLLM's normal Dockerfile that installs
# a released version on DGX Spark
ARG CUDA_VERSION=13.0.2
ARG PYTHON_VERSION=3.12
ARG VLLM_VERSION=0.11.2
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
@bbrowning
bbrowning / test_grammar.py
Created September 11, 2025 17:27
Llguidance, vllm guided_grammar, and Hermes models
import json
from openai import OpenAI
def hermes_grammar_from_tools(tools: list[dict]) -> str:
    tool_funcs = ""
    for tool in tools:
        tool_funcs += " | " if tool_funcs else ""
        tool_funcs += f"fun_{tool['function']['name']}"
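
The preview cuts off before the grammar gets used, so here is a hedged sketch of the pattern this gist is exercising: vLLM's structured output support accepts a grammar via the OpenAI client's extra_body as guided_grammar, so the output of hermes_grammar_from_tools (defined above) can constrain a Hermes-style model to emit only well-formed tool calls. The base_url, model name, and tool definition below are assumptions for illustration, not values from the gist.

```python
# Hedged usage sketch; the endpoint, model, and tool here are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

# Build the grammar from the tool definitions using the function above.
grammar = hermes_grammar_from_tools(tools)

resp = client.chat.completions.create(
    model="NousResearch/Hermes-3-Llama-3.1-8B",  # assumed Hermes-style model
    messages=[{"role": "user", "content": "What's the weather in Boston?"}],
    extra_body={"guided_grammar": grammar},  # vLLM structured output extension
)
print(resp.choices[0].message.content)
```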
@bbrowning
bbrowning / 0_instructions.md
Last active September 3, 2025 17:36
Running BFCL against models deployed to vLLM

Running BFCL tests against vLLM

Clone the gorilla repo and install BFCL dependencies:

git clone https://github.com/ShishirPatil/gorilla.git
cd gorilla/berkeley-function-call-leaderboard
python -m venv venv
source venv/bin/activate
pip install -e .
@bbrowning
bbrowning / pydantic_agent_test.py
Created July 25, 2025 00:01
An example of how to use Pydantic AI with Llama Stack and the Responses API
# Dependencies:
# pip install openai pydantic-ai
# This example uses the web_search builtin tool, so it assumes you
# have a valid TAVILY_API_KEY environment variable set before starting
# your Llama Stack server.
# Usage:
#
# ollama run llama3.2:3b
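
The preview stops inside the comment header, so here is a hedged sketch of the overall shape of such a script, not the gist's actual code. It assumes Llama Stack's OpenAI-compatible endpoint lives at http://localhost:8321/v1/openai/v1 and that a recent pydantic-ai exposes OpenAIResponsesModel and OpenAIProvider; it also leaves out the web_search builtin-tool wiring the real example includes. Treat every value below as an assumption that depends on your Llama Stack and pydantic-ai versions.

```python
# Hypothetical sketch -- endpoint, model name, and class names are assumptions.
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIResponsesModel
from pydantic_ai.providers.openai import OpenAIProvider

provider = OpenAIProvider(
    base_url="http://localhost:8321/v1/openai/v1",  # assumed Llama Stack endpoint
    api_key="none",
)
model = OpenAIResponsesModel("llama3.2:3b", provider=provider)
agent = Agent(model, system_prompt="Answer concisely.")

result = agent.run_sync("What is Llama Stack?")
print(result.output)  # older pydantic-ai versions expose result.data instead
```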
@bbrowning
bbrowning / llama4_pythonic.ebnf
Created May 22, 2025 12:38
EBNF grammar (for use with Tatsu) for Llama 4 Pythonic tool calling parsing
@@grammar::Llama4
start
=
expression $
;
expression
=