import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

REVISION = "2025-04-14"          # lock to a known-good tag
MODEL_ID = "vikhyatk/moondream2"
device = "cuda"                  # or "cpu"; quantised backends (bitsandbytes / GGUF) also work

# 1️⃣ load model + text tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    revision=REVISION,
    trust_remote_code=True,      # Moondream ships custom classes
    torch_dtype=torch.float16,
    device_map="auto",
)
tok = AutoTokenizer.from_pretrained(MODEL_ID, revision=REVISION, trust_remote_code=True)

from types import MethodType
import torch.nn as nn

def _get_input_embeddings(self):
    # wrap the raw weight so HF utilities that expect an nn.Embedding still work
    return nn.Embedding.from_pretrained(self.model.text.wte, freeze=True)

model.get_input_embeddings = MethodType(_get_input_embeddings, model)
print(model.get_input_embeddings().weight)

# unit-normalise the language embedding matrix once, up front
W_lang = F.normalize(model.get_input_embeddings().weight, dim=-1).to(device)

# 4️⃣ pick an image
img = Image.open("1280px-Labrador_Retriever_portrait.jpg").convert("RGB")

with torch.no_grad():
    # Use Moondream's encode_image method
    encoded_image = model.encode_image(img)
    # If you need the raw vision embeddings for analysis:
    img_emb = model.model._run_vision_encoder(img)   # per-patch vision embeddings
    # cosine similarity of every patch against every vocabulary token
    patch_lang = F.normalize(img_emb, dim=-1)
    sim = patch_lang @ W_lang.T
    vals, ids = sim.topk(8, dim=-1)

# 8️⃣ helper ---------------------------------------------------------------
def labels_from_ids(vals, ids, thresh=0.15):
    """
    Convert the top-k token IDs for every patch into *readable* words.
    Uses the tokenizer's .decode, which automatically strips the Ġ / ▁
    "new-word" markers for you.
    """
    out = []
    keep = vals > thresh                      # mask of tokens we want to keep
    for v, i, m in zip(vals, ids, keep):
        if not m.any():                       # no token passed the threshold
            out.append([])
            continue
        # decode the surviving ids in one go → a clean utf-8 string
        text = tok.decode(i[m].tolist(), skip_special_tokens=True).strip()
        # break the string back into individual words
        out.append(text.split())
    return out

words_per_patch = labels_from_ids(vals, ids, thresh=0.0001)
print(words_per_patch)   # e.g. ['dog', 'black', 'fur'], ...

# ---------------------------------------------------------------
# 9️⃣ visualise – draw the highest-probability word per patch
# ---------------------------------------------------------------
from moondream.torch.vision import prepare_crops   # part of the moondream repo
from PIL import ImageDraw, ImageFont

# ⓵ your code up to `words_per_patch` ...
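# --- optional sanity check (my addition, not part of the original walkthrough).
# A minimal sketch that assumes only the tensors built above: it confirms that
# `sim` has one row per vision patch (plus the leading global token, which the
# visualisation below skips) and peeks at the single best-matching word for the
# first few rows before anything is drawn.
with torch.no_grad():
    n_rows, vocab_size = sim.shape
    print(f"{n_rows} rows (1 global + {n_rows - 1} patches) x {vocab_size} vocab tokens")
    for p in range(min(3, n_rows)):
        best_id = ids[p, 0].item()                 # topk returns scores sorted descending
        best_val = vals[p, 0].item()
        print(f"row {p:3d}: {tok.decode([best_id])!r}  (cos = {best_val:.3f})")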
# Skip the global CLS token – everything afterwards is one patch
patch_words = words_per_patch[1:]

# ⓶ Get the tiling that Moondream decided to use
_, tiling = prepare_crops(img, model.config.vision, device=device)
ps           = model.config.vision.enc_patch_size                       # 14
crop_patches = model.config.vision.crop_size // ps                      # 27
win_patches  = crop_patches - 2 * model.config.vision.overlap_margin    # 19
grid_w = tiling[1] * win_patches
grid_h = tiling[0] * win_patches
assert len(patch_words) == grid_w * grid_h

# ⓷ Draw on a canvas that matches the (resized) image Moondream worked with
canvas = img.resize((grid_w * ps, grid_h * ps), Image.LANCZOS)
draw = ImageDraw.Draw(canvas)
try:
    font = ImageFont.truetype("DejaVuSans.ttf", size=10)
except OSError:                     # fallback if the font isn't available
    font = ImageFont.load_default()

for idx, toks in enumerate(patch_words):
    if not toks:                    # nothing above the threshold for this patch
        continue
    row, col = divmod(idx, grid_w)
    x = col * ps + ps // 2          # patch centre
    y = row * ps + ps // 2
    draw.text((x, y), toks[0], fill="white", font=font, anchor="mm",
              stroke_width=1, stroke_fill="black")   # small outline for contrast

canvas.save("moondream_patch_labels.png")   # .show() displays but never writes a file
canvas.show()
print("✅ overlay saved as moondream_patch_labels.png")
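# --- optional extra (my addition, a hedged sketch): shade each patch by its
# best cosine score so you can see *where* the vision features align with
# language tokens at all. It reuses `vals`, `grid_h`, `grid_w`, and `canvas`
# from the steps above; the output file name is just an illustrative choice.
import numpy as np

best = vals[1:, 0].float().cpu().numpy().reshape(grid_h, grid_w)   # drop the global row
best = (best - best.min()) / (np.ptp(best) + 1e-8)                 # rescale to [0, 1]
heat = Image.fromarray((best * 255).astype("uint8"), mode="L")
heat = heat.resize(canvas.size, Image.NEAREST)                     # one flat block per patch
overlay = Image.blend(canvas.convert("RGB"), heat.convert("RGB"), alpha=0.5)
overlay.save("moondream_patch_heatmap.png")
print("✅ heat-map saved as moondream_patch_heatmap.png")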