Model card: https://huggingface.co/openbmb/MiniCPM-V-2_6
```python
import torch
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "openbmb/MiniCPM-V-2_6"

# Load the model (with its custom remote code) in fp16 and move it to the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, torch_dtype=torch.float16
)
model = model.to(device='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Fetch a demo image and convert it to RGB.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# question = 'What is in the image?'
# question = 'can you give a few keywords in the picture? and delimited by comma'
question = 'can you give a few keywords in the picture? and put them into a python list'
msgs = [{'role': 'user', 'content': question}]

res = model.chat(
    image=image,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,  # if sampling=False, beam search is used by default
    temperature=0.7,
    # system_prompt=''  # pass a system prompt if needed
)
print(res)
print("end.")
```
Output:
Certainly! Here are a few keywords based on the image, presented as a Python list:
```python
keywords = [
"beach",
"dog",
"human",
"sunrise/sunset",
"playful",
"connection",
"nature"
]
```
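
Since the reply embeds the keywords as a Python list literal inside a code fence, it can be parsed back into an actual list. A minimal sketch, assuming the reply shape matches the sample output above (`extract_keywords` is a hypothetical helper, not part of the model API):

```python
import ast
import re

def extract_keywords(reply: str):
    """Pull the first Python list literal out of the model's reply.

    Assumes the reply contains something like `keywords = [...]`,
    possibly inside a ```python fence, as in the sample output above.
    """
    match = re.search(r"\[[^\]]*\]", reply)
    if match is None:
        return []
    try:
        return ast.literal_eval(match.group(0))  # safely parse the list literal
    except (ValueError, SyntaxError):
        return []

keywords = extract_keywords(res)
print(keywords)  # e.g. ['beach', 'dog', 'human', ...]
```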
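
The `chat` call above is single-turn. For a follow-up question, the message history can be extended and passed back in. A minimal multi-turn sketch, reusing `model`, `tokenizer`, `image`, `msgs`, and `res` from above (the string-content history format mirrors the single-turn call and is an assumption; the follow-up question text is illustrative):

```python
# Multi-turn sketch: append the assistant's reply, then ask a follow-up.
msgs.append({'role': 'assistant', 'content': res})
msgs.append({'role': 'user', 'content': 'Which of those keywords best describes the mood?'})

res2 = model.chat(
    image=image,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.7,
)
print(res2)
```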