Skip to content

Instantly share code, notes, and snippets.

@melastmohican
Created May 12, 2025 00:00
Show Gist options
  • Save melastmohican/383c3f92029eec269b72b611011fb83c to your computer and use it in GitHub Desktop.
Save melastmohican/383c3f92029eec269b72b611011fb83c to your computer and use it in GitHub Desktop.
Generate a title, caption, and keywords from an attached image using Gemini
#!/usr/bin/env lua
local http = require("socket.http")
local ltn12 = require("ltn12")
local mime = require("mime") -- For base64
local json = require("dkjson") -- dkjson is a JSON library for Lua
--- Read the file at `path` in binary mode and return it base64-encoded.
-- Raises an error carrying io.open's message when the file cannot be opened.
-- @param path filesystem path of the image to load
-- @return the result of mime.b64 applied to the file's bytes
local function read_image_base64(path)
  local handle, open_err = io.open(path, "rb")
  assert(handle, open_err)
  local bytes = handle:read("*all")
  handle:close()
  return mime.b64(bytes)
end
-- Instruction text sent to the model as the text part of the request
-- (see build_payload). It asks for a title, a caption and a keyword list,
-- and tells the model to answer as a JSON object with the keys
-- "title", "caption" and "keywords".
local prompt_text = [[
You are a professional photography analyst with expertise in object recognition and computer-generated image description.
You also try to identify famous buildings and landmarks as well as the location where the photo was taken.
Furthermore, you aim to specify animal and plant species as accurately as possible. Always give common name followed by the scientific name in brackets e.g (Beta vulgaris).
You also describe objects—such as vehicle types and manufacturers—as specifically as you can.
Analyze the uploaded photo and generate the following data:
* Keywords (comma-separated list of 50 single-word keywords)
* Image title
* Image caption (Maximum 200 characters)
Make sure the result is in JSON format:
{
"title": "",
"caption": "",
"keywords": "key1,key2"
}
]]
--- Build the JSON request body for the Gemini content-generation call.
-- The body holds a single user turn with two parts: the analysis prompt
-- (`prompt_text`) and the image passed as inline base64 data.
-- @param image_base64 base64-encoded image bytes
-- @param mime_type    optional MIME type of the image; defaults to
--                     "image/jpeg", so one-argument calls behave as before
-- @return the request body encoded by json.encode
local function build_payload(image_base64, mime_type)
  local image_part = {
    inlineData = {
      -- Previously fixed to JPEG; callers can now label PNG/WebP/etc. correctly.
      mimeType = mime_type or "image/jpeg",
      data = image_base64
    }
  }

  return json.encode({
    contents = {
      {
        role = "user",
        parts = {
          { text = prompt_text },
          image_part
        }
      }
    },
    generationConfig = {
      temperature = 1,
      topP = 0.95,
      maxOutputTokens = 8192,
      responseModalities = { "TEXT" }
    },
    -- Request the BLOCK_NONE threshold for each listed harm category.
    safetySettings = {
      { category = "HARM_CATEGORY_HATE_SPEECH", threshold = "BLOCK_NONE" },
      { category = "HARM_CATEGORY_DANGEROUS_CONTENT", threshold = "BLOCK_NONE" },
      { category = "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold = "BLOCK_NONE" },
      { category = "HARM_CATEGORY_HARASSMENT", threshold = "BLOCK_NONE" }
    }
  })
end
--- Send an image to the Gemini API and print the generated metadata.
-- Loads the image, POSTs it together with the prompt to the
-- streamGenerateContent endpoint, joins every `parts[].text` of the reply,
-- removes Markdown code fences and prints the result.
-- @param image_path optional path of the image to analyse; defaults to
--                   "20250506-153543.jpg"
-- @param api_key    optional API key; defaults to the GEMINI_API_KEY
--                   environment variable, then to the "{your_api_key}"
--                   placeholder
-- @return the combined text on success; nil plus an error message on failure
local function generate(image_path, api_key)
  image_path = image_path or "20250506-153543.jpg"
  api_key = api_key or os.getenv("GEMINI_API_KEY") or "{your_api_key}"

  -- Assign to a local first so only the first return value is forwarded.
  local image_base64 = read_image_base64(image_path)
  local payload = build_payload(image_base64)

  local chunks = {}
  local _, status = http.request{
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:streamGenerateContent?key=" .. api_key,
    method = "POST",
    headers = {
      ["Content-Type"] = "application/json",
      ["Content-Length"] = tostring(#payload)
    },
    source = ltn12.source.string(payload),
    sink = ltn12.sink.table(chunks)
  }
  local body = table.concat(chunks)

  if status ~= 200 then
    -- When the request itself fails, `status` carries the error message
    -- rather than an HTTP code, so it is printed as-is.
    print("Request failed with status:", status)
    -- The API explains rejections in the response body; show it when present.
    if body ~= "" then
      print("Response body:\n", body)
    end
    return nil, "request failed: " .. tostring(status)
  end

  local decoded, _, err = json.decode(body)
  if not decoded then
    print("JSON decode error:", err)
    return nil, err
  end
  if type(decoded) ~= "table" then
    -- ipairs below would raise on a scalar; report it instead.
    print("JSON decode error:", "unexpected response type " .. type(decoded))
    return nil, "unexpected response type " .. type(decoded)
  end

  -- Collect all `parts[].text` from the first candidate of every array item.
  -- Pieces go into a table and are joined once, avoiding repeated
  -- string concatenation inside the loop.
  local pieces = {}
  for _, item in ipairs(decoded) do
    local candidate = item.candidates and item.candidates[1]
    local parts = candidate and candidate.content and candidate.content.parts
    if parts then
      for _, part in ipairs(parts) do
        if part.text then
          pieces[#pieces + 1] = part.text
        end
      end
    end
  end

  -- Strip the Markdown fences the model may wrap around its JSON answer.
  local combined_text = table.concat(pieces)
  combined_text = string.gsub(combined_text, '```json', '')
  combined_text = string.gsub(combined_text, '```', '')

  print("Combined output:\n", combined_text)
  return combined_text
end
-- Script entry point: run the request as soon as the file is executed.
generate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment