malfet / fetch_commit_statuses.py
Created May 7, 2026 15:29
Fetch commit statuses from HUD
"""Fetch the latest N commits' job statuses from hud.pytorch.org."""
import argparse
import gzip
import json
import sys
from curl_cffi import requests
def fetch_hud(owner: str, repo: str, branch: str, count: int) -> dict:
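    # Continuation sketch: the gist body is truncated in this preview.
    # The endpoint path, query parameter, and response shape below are
    # assumptions about the HUD API, not the gist's verified code.
    url = f"https://hud.pytorch.org/api/hud/{owner}/{repo}/{branch}/0?per_page={count}"
    resp = requests.get(url, impersonate="chrome")  # curl_cffi can impersonate a browser TLS fingerprint
    resp.raise_for_status()
    return resp.json()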
malfet / tune_mpp.py
Last active April 30, 2026 15:24
Tune MPP matmul2d tile sizes for MPS F.linear
"""Tune MPP matmul2d tile sizes across dtypes and shapes."""
import time
import torch
import torch.nn.functional as F
torch.set_grad_enabled(False)
WARMUP = 20
REPEAT = 200
BATCH = 10
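
The constants above imply a warmup-then-time loop. A minimal sketch of such a loop for F.linear on MPS, with illustrative shapes and dtype; the gist's actual sweep over tile sizes, dtypes, and shapes is truncated here:

def bench_linear(m: int, n: int, k: int, dtype=torch.float16) -> float:
    x = torch.rand(BATCH, m, k, dtype=dtype, device="mps")
    w = torch.rand(n, k, dtype=dtype, device="mps")
    for _ in range(WARMUP):
        F.linear(x, w)
    torch.mps.synchronize()
    start = time.perf_counter()
    for _ in range(REPEAT):
        F.linear(x, w)
    torch.mps.synchronize()  # MPS kernels run async; sync before reading the clock
    return (time.perf_counter() - start) / REPEAT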
#!/usr/bin/env python3
"""
Repack torchaudio wheels as cp310-abi3 (stable ABI) wheels.

Downloads the cp310 wheel for each platform from PyPI, verifies all native
extensions use the stable ABI, patches the WHEEL tag (adds PEP 427 Build
number) and METADATA (adds torch>=<version> dependency), and repacks.
RECORD regeneration and repacking are handled by auditwheel's InWheelCtx.

Usage:
"""
"""
Demonstrate the NEON overread in F.interpolate by placing uint8 tensor data
right before an unmapped guard page.

vld3_u8 in the block-of-4 loop reads 24 bytes (8 pixels × 3 channels)
but only needs 12 bytes (4 pixels × 3 channels). If the extra 12 bytes
cross into an unmapped page, we get SIGBUS.
"""
import ctypes
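
The preview stops at the first import. A sketch of the guard-page setup the docstring describes: map two pages, revoke access to the second, and place the 12 real pixel bytes flush against the boundary so the 24-byte vld3_u8 read faults. The tensor shape and interpolate mode are assumptions:

import ctypes
import mmap

import torch
import torch.nn.functional as F

PAGE = mmap.PAGESIZE
buf = mmap.mmap(-1, 2 * PAGE)  # two anonymous, writable pages
base = ctypes.addressof(ctypes.c_char.from_buffer(buf))

libc = ctypes.CDLL(None, use_errno=True)
libc.mprotect.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
assert libc.mprotect(base + PAGE, PAGE, 0) == 0  # 0 == PROT_NONE: reads now fault

# 4 RGB pixels = 12 bytes, ending exactly at the guard page.
t = torch.frombuffer(buf, dtype=torch.uint8, count=12, offset=PAGE - 12)
img = t.reshape(1, 4, 1, 3).permute(0, 3, 1, 2)  # channels-last NCHW view over the HWC bytes
F.interpolate(img, scale_factor=2, mode="bilinear")  # SIGBUS here if vld3_u8 overreads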
#!/usr/bin/env python3
"""
Fetch PyTorch outside collaborators and infer company affiliation from commit emails.
Requires: `gh` CLI authenticated with appropriate permissions.
Usage: python fetch_collaborator_affiliations.py [--repo pytorch/pytorch] [--max-commits 100]
Caches results in pytorch_collab_emails.json to avoid re-fetching known collaborators.
"""
#!/usr/bin/env python3
"""Print all ops with MPS skips for non-contiguous input."""
import unittest

from torch.testing._internal.common_methods_invocations import op_db
from torch.testing._internal.opinfo.core import DecorateInfo


def main():
    ops_with_mps_noncontig_skip = []
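    # Continuation sketch: the real loop is truncated in this preview.
    # Matching on device_type and test_name is an assumption about which
    # DecorateInfo fields the gist inspects; the exact test name is unknown.
    for op in op_db:
        for di in op.decorators:
            if not isinstance(di, DecorateInfo):
                continue  # op_db also carries plain decorators
            if di.device_type == "mps" and "noncontiguous" in (di.test_name or ""):
                ops_with_mps_noncontig_skip.append(op.name)
                break
    print("\n".join(sorted(set(ops_with_mps_noncontig_skip))))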
#include <stdio.h>
#include <chrono>

__global__ void noop() { }

int main(int argc, const char *argv[]) {
  cudaDeviceProp prop;
  auto rc = cudaGetDeviceProperties(&prop, 0);
  printf("Running on %s sm%d.%d multiProcessorCount = %d maxBlocksPerMultiProcessor = %d maxThreadsPerBlock = %d\n",
         prop.name, prop.major, prop.minor, prop.multiProcessorCount,
         prop.maxBlocksPerMultiProcessor, prop.maxThreadsPerBlock);
a_cpp = """#include <iostream>
namespace foo::bar {
inline namespace baz {
int inc(int x) {
std::cout << "do inc from lib_a" << std::endl;
return x + 1;
}
} // inline namespace baz
void do_a(int x) {
import torch
import sys
import timeit


def add_repeat(x, y, repeat=10):
    rc = x.clone()
    for i in range(repeat):
        rc += y
    return rc
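
The unused timeit import above suggests a timing harness; a plausible usage sketch (the sizes and the fused-expression baseline are guesses, not the gist's code):

x = torch.rand(1024, 1024)
y = torch.rand(1024, 1024)
# Ten in-place adds vs. one scaled add over the same data.
print("loop :", timeit.timeit(lambda: add_repeat(x, y), number=100))
print("fused:", timeit.timeit(lambda: x + 10 * y, number=100))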
#include <stdio.h>

__global__ void print() {
  printf("Hello World of CUDA threadIdx.x=%d\n", threadIdx.x);
}

__global__ void noop() { }

int main(int argc, const char *argv[]) {
  cudaDeviceProp prop;