Model card: https://huggingface.co/openbmb/MiniCPM-V-2_6
```python
import torch
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "openbmb/MiniCPM-V-2_6"

# Load the model (with its custom remote code) in fp16 and move it to the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, torch_dtype=torch.float16
)
model = model.to(device='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Fetch a demo image and convert it to RGB.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# question = 'What is in the image?'
# question = 'can you give a few keywords in the picture? and delimited by comma'
question = 'can you give a few keywords in the picture? and put them into a python list'
msgs = [{'role': 'user', 'content': question}]

res = model.chat(
    image=image,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,  # if sampling=False, beam search is used by default
    temperature=0.7,
    # system_prompt=''  # pass a system prompt if needed
)
print(res)
print("end.")
```
Output:
Certainly! Here are a few keywords based on the image, presented as a Python list:
```python
keywords = [
"beach",
"dog",
"human",
"sunrise/sunset",
"playful",
"connection",
"nature"
]
```
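
Since the reply embeds the keywords as a Python list literal inside a code fence, it can be parsed back into an actual list. A minimal sketch, assuming the reply shape matches the sample output above (`extract_keywords` is a hypothetical helper, not part of the model API):

```python
import ast
import re

def extract_keywords(reply: str):
    """Pull the first Python list literal out of the model's reply.

    Assumes the reply contains something like `keywords = [...]`,
    possibly inside a ```python fence, as in the sample output above.
    """
    match = re.search(r"\[[^\]]*\]", reply)
    if match is None:
        return []
    try:
        return ast.literal_eval(match.group(0))  # safely parse the list literal
    except (ValueError, SyntaxError):
        return []

keywords = extract_keywords(res)
print(keywords)  # e.g. ['beach', 'dog', 'human', ...]
```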
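
The `chat` call above is single-turn. For a follow-up question, the message history can be extended and passed back in. A minimal multi-turn sketch, reusing `model`, `tokenizer`, `image`, `msgs`, and `res` from above (the string-content history format mirrors the single-turn call and is an assumption; the follow-up question text is illustrative):

```python
# Multi-turn sketch: append the assistant's reply, then ask a follow-up.
msgs.append({'role': 'assistant', 'content': res})
msgs.append({'role': 'user', 'content': 'Which of those keywords best describes the mood?'})

res2 = model.chat(
    image=image,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.7,
)
print(res2)
```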