Skip to content

Instantly share code, notes, and snippets.

View okaris's full-sized avatar

Ömer Karışman okaris

View GitHub Profile
# Make sure you are using the latest `bitsandbytes` (at least 0.46.0) and PyTorch nightlies (at least 2.8).
# Put together by sayakpaul and anijain2305
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers import FluxPipeline
import argparse
import json
import torch
import time
from functools import partial
@willccbb
willccbb / grpo_demo.py
Last active March 16, 2026 11:22
GRPO Llama-1B
# train_grpo.py
#
# See https://github.com/willccbb/verifiers for ongoing developments
#
"""
citation:
@misc{brown2025grpodemo,
title={Granular Format Rewards for Eliciting Mathematical Reasoning Capabilities in Small Language Models},
author={Brown, William},
@3outeille
3outeille / pipeline_parallel.py
Last active November 6, 2025 15:55
Self-contained example of how pipeline parallel works (AFAB and 1F1B) in 200 LOC
#VERBOSE=0 torchrun --nproc_per_node 3 self_contained_pp_LOC.py
import os, random, numpy as np, torch, torch.nn as nn, torch.distributed as dist, torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, DistributedSampler
from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
# Distributed-training globals, read from the environment that torchrun sets up:
# LOCAL_RANK and WORLD_SIZE are required (KeyError if launched without torchrun);
# VERBOSE is an opt-in flag, off unless explicitly set to "1".  STEP is a mutable
# global step counter starting at 0.
STEP, local_rank, world_size, verbose = 0, int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"]), os.environ.get("VERBOSE", "0") == "1"
def set_all_seed(seed):
@sayakpaul
sayakpaul / inference_with_torchao_serialized.py
Last active August 29, 2025 11:42
Shows how to run Flux schnell under 17GBs without bells and whistles. It additionally shows how to serialize the quantized checkpoint and load it back.
import torch
from huggingface_hub import hf_hub_download
from diffusers import FluxTransformer2DModel, DiffusionPipeline
dtype, device = torch.bfloat16, "cuda"
ckpt_id = "black-forest-labs/FLUX.1-schnell"
with torch.device("meta"):
config = FluxTransformer2DModel.load_config(ckpt_id, subfolder="transformer")
model = FluxTransformer2DModel.from_config(config).to(dtype)
@sayakpaul
sayakpaul / run_flux_under_24gbs.py
Last active June 28, 2025 22:53
This gist shows how to run Flux on a 24GB 4090 card with Diffusers.
from diffusers import FluxPipeline, AutoencoderKL
from diffusers.image_processor import VaeImageProcessor
from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
import torch
import gc
def flush() -> None:
    """Release as much GPU memory as possible.

    Runs the Python garbage collector first so that unreferenced tensors are
    actually freed, then asks PyTorch to return its cached (but unused) CUDA
    blocks to the driver.  The order matters: empty_cache() can only release
    memory whose tensors have already been collected.
    """
    gc.collect()
    torch.cuda.empty_cache()
@Birch-san
Birch-san / flash_attn_processor.py
Last active December 19, 2023 22:07
FlashAttnProcessor
import torch
from typing import Optional
from flash_attn import flash_attn_func
from diffusers.models.attention import Attention
class FlashAttnProcessor:
r"""
Processor for implementing memory efficient attention using flash_attn.
"""
@Birch-san
Birch-san / topk_softmax_denominator.py
Created April 3, 2023 22:45
Reducing the softmax denominator to sum only as many attention scores as the in-distribution checkpoint would've, so that its outputs have in-distribution magnitudes
from torch import FloatTensor
# Scale factor by which the VAE downsamples each spatial dimension (8x), so a
# 512px image becomes a 64x64 latent.
vae_scale_factor = 8
# Number of self-attention keys at the checkpoint's native 512x512 resolution:
# (512/8)^2 = 4096 latent positions.
typical_self_attn_key_length = (512/vae_scale_factor) * (512/vae_scale_factor)
# Number of self-attention keys at the desired 768x768 inference resolution:
# (768/8)^2 = 9216 latent positions.
desired_self_attn_key_length = (768/vae_scale_factor) * (768/vae_scale_factor)
# Ratio by which the key length grew versus training; 1.0 for cross-attention.
# NOTE(review): `is_self_attn` is not defined in this fragment — confirm it is
# in scope at this point in the full gist.
key_length_factor=desired_self_attn_key_length/typical_self_attn_key_length if is_self_attn else 1.
def softmax(x: FloatTensor, dim=-1) -> FloatTensor:
maxes = x.max(dim, keepdim=True).values
@Birch-san
Birch-san / softmax.py
Created April 3, 2023 00:12
Typical softmax
from torch import FloatTensor
def softmax(x: FloatTensor, dim=-1) -> FloatTensor:
    """Numerically-stable softmax over dimension *dim* of *x*.

    Shifts the input by its per-slice maximum before exponentiating, so the
    largest exponent is 0 and overflow cannot occur; the shift cancels out in
    the quotient, leaving the standard softmax result.
    """
    shifted = x - x.max(dim, keepdim=True).values
    numer = shifted.exp()
    denom = numer.sum(dim, keepdim=True)
    return numer / denom
@Christopher-Hayes
Christopher-Hayes / convertToCheckpoint.md
Last active June 2, 2024 14:56
Convert DreamBooth .bin files to a .ckpt file

Converting DreamBooth .bin files to a .ckpt model file.

These instructions are based on DreamBooth usage with the https://github.com/ShivamShrirao/diffusers repo.

1. Add the script files

Below are 2 files. "convertToCkpt.py" and "toCkpt.sh". Create those files inside the examples/dreambooth folder with the code provided.

1a. Python convert script (required)

@trygvebw
trygvebw / find_noise.py
Last active December 13, 2025 14:53
A "reverse" version of the k_euler sampler for Stable Diffusion, which finds the noise that will reconstruct the supplied image
import torch
import numpy as np
import k_diffusion as K
from PIL import Image
from torch import autocast
from einops import rearrange, repeat
def pil_img_to_torch(pil_img, half=False):
image = np.array(pil_img).astype(np.float32) / 255.0