Skip to content

Instantly share code, notes, and snippets.

@zhuker
Last active October 13, 2025 17:39
Show Gist options
  • Select an option

  • Save zhuker/e9addb68894b6addcde18a552f6597f6 to your computer and use it in GitHub Desktop.

Select an option

Save zhuker/e9addb68894b6addcde18a552f6597f6 to your computer and use it in GitHub Desktop.
BLIP2 conditional generation
# BLIP-2 conditional generation: ask a yes/no question about an image
# and report how long generation took.
import time

from transformers import AutoProcessor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Pick the best available accelerator: Apple MPS, then CUDA, else CPU.
the_device = "cpu"
if torch.backends.mps.is_available():
    the_device = "mps"
elif torch.cuda.is_available():
    the_device = "cuda"
print(the_device)

# Load the BLIP-2 (OPT-2.7B) processor and model in half precision.
# NOTE(review): float16 weights may be slow or unsupported on pure CPU —
# confirm before running with the_device == "cpu".
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
).to(the_device)

image_path = "login.png"
image = Image.open(image_path).convert("RGB")
prompt = "Question: Does it look like a login screen? Answer:"

start = time.time()
inputs = processor(images=image, text=prompt, return_tensors="pt").to(the_device)
with torch.no_grad():
    # max_new_tokens=1 — we only want a single-token answer to the question.
    caption_ids = model.generate(**inputs, max_new_tokens=1)
caption = processor.decode(caption_ids[0], skip_special_tokens=True)
elapsed = time.time() - start
print(f"{int(elapsed * 1000)}msec Generated Caption:", caption)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment