@bantmen
Created December 29, 2025 09:33
Make torch.compile rmsnorm faster
import torch
from triton.testing import do_bench


def bench(M, N=128, dtype=torch.bfloat16):
    x = torch.randn(M, N, device="cuda", dtype=dtype)
    w = torch.randn(N, device="cuda", dtype=dtype)
    eps = 1e-6
    with torch.inference_mode():
        # Compile the torch kernel
        compiled_torch = torch.compile(
            torch.nn.functional.rms_norm, mode="default", fullgraph=True
        )
        torch.cuda.synchronize()
        compiled_torch_time = do_bench(
            lambda: compiled_torch(x, (N,), weight=w, eps=eps)
        )
        # Effective bandwidth: read x + write output = 2*m*n*itemsize bytes,
        # divided by time in ms, then by 1e6 to convert bytes/ms to GB/s.
        bwd = lambda m, n, time: m * n * dtype.itemsize * 2 / time / 1e6
        print(
            f"M={M:7d} | "
            f"compiled_torch: {bwd(M, N, compiled_torch_time):4.1f} GB/s"
        )


if __name__ == "__main__":
    print("N=128, dtype=bfloat16")
    print("-" * 85)
    # Sweep M over powers of two from 1024 to 8388608
    M = 1024
    for _ in range(14):
        bench(M)
        M *= 2
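For reference, `torch.nn.functional.rms_norm` normalizes each row by its root mean square: y = x / sqrt(mean(x²) + eps) * weight over the last dimension. A minimal pure-Python sketch of that computation (CPU-only, no torch; the function name `rms_norm_ref` is illustrative, not from the script above):

```python
import math

def rms_norm_ref(x, weight, eps=1e-6):
    """Reference RMSNorm over one row: x / sqrt(mean(x^2) + eps) * weight."""
    mean_sq = sum(v * v for v in x) / len(x)
    inv_rms = 1.0 / math.sqrt(mean_sq + eps)
    return [v * inv_rms * w for v, w in zip(x, weight)]

# A row whose mean square is 1 is left (almost) unchanged by unit weights.
row = [1.0, -1.0, 1.0, -1.0]
out = rms_norm_ref(row, [1.0] * 4)
```

Unlike LayerNorm, there is no mean subtraction or bias, which is part of why the kernel is purely bandwidth-bound at large M.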
bantmen commented Dec 29, 2025
N=128, dtype=bfloat16
-------------------------------------------------------------------------------------
M=   1024 | compiled_torch: 16.6 GB/s
M=   2048 | compiled_torch: 22.7 GB/s
M=   4096 | compiled_torch: 47.6 GB/s
M=   8192 | compiled_torch: 93.8 GB/s
M=  16384 | compiled_torch: 186.6 GB/s
M=  32768 | compiled_torch: 372.5 GB/s
M=  65536 | compiled_torch: 749.3 GB/s
M= 131072 | compiled_torch: 1482.4 GB/s
M= 262144 | compiled_torch: 2880.2 GB/s
M= 524288 | compiled_torch: 5507.9 GB/s
M=1048576 | compiled_torch: 6298.7 GB/s
M=2097152 | compiled_torch: 6629.1 GB/s
M=4194304 | compiled_torch: 6789.2 GB/s
M=8388608 | compiled_torch: 6899.4 GB/s
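The GB/s column treats the kernel as memory-bound: 2·M·N·itemsize bytes moved (read x, write the output; the weight is negligible), divided by the `do_bench` time in ms, then by 1e6 to convert bytes/ms to GB/s. A quick sanity check of the largest row, where the ~0.62 ms timing below is back-derived from the reported figure, not measured:

```python
# Recompute the bandwidth figure for the M=8388608 row.
M, N, itemsize = 8388608, 128, 2      # bfloat16 is 2 bytes/element
time_ms = 0.6225                       # assumed: roughly what do_bench returned
bytes_moved = 2 * M * N * itemsize     # read x + write output
gbps = bytes_moved / time_ms / 1e6     # bytes/ms == GB/s
```

At ~6.9 TB/s the curve has flattened, so the large-M runs are saturating HBM bandwidth, while the small-M runs are dominated by launch and scheduling overhead.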
