// GPT-4 Vision API + Puppeteer = Easy Web Scraping
// Source: GitHub gist SrJSDev/1e4e4cdc3c6209d468cf9ce148d7d2a7 (last active March 5, 2024).
import fs from 'fs';
import readline from 'readline';

import OpenAI from 'openai';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
// Configure Puppeteer with StealthPlugin
puppeteer.use(StealthPlugin());

// Initialize the OpenAI client. It is constructed without arguments, so
// credentials come from the SDK's own defaults (presumably the environment).
const openai = new OpenAI();

// Upper bound, in milliseconds, on how long to wait for a page to load.
const timeout = 8000;

// Start the main function. The rejection handler keeps a failure from
// becoming an unhandled promise rejection: it reports the error and exits
// with a non-zero status.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
/**
 * Runs the interactive browsing session.
 *
 * Each loop iteration sends either a typed user prompt or the latest page
 * screenshot to the model, prints the reply, and lets
 * handleAssistantResponseSS act on it. When the reply triggers no browser
 * action (or the action fails), the next iteration asks the user for input.
 *
 * @returns {Promise<void>} Never resolves normally; the loop runs until the
 *   process is terminated or an error is thrown.
 */
async function main() {
  console.log("###########################################");
  console.log("# GPT4V-Browsing by Unconventional Coding #");
  console.log("###########################################\n");

  const browser = await puppeteer.launch({ headless: "new" });
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1200, height: 1200, deviceScaleFactor: 1.75 });

    const messages = [{ role: "system", content: systemMessage }];
    console.log("GPT: How can I assist you today?");

    // Tracked separately from the prompt text so that an empty line typed by
    // the user cannot be mistaken for "no prompt has been sent yet".
    let needsUserPrompt = true;

    while (true) {
      // Decide which user message to provide: typed text or a screenshot.
      if (needsUserPrompt) {
        const userPrompt = await input("You: ");
        console.log();
        messages.push({ role: "user", content: userPrompt });
        needsUserPrompt = false;
      } else {
        const base64Image = await imageToBase64("screenshot.jpg");
        messages.push({
          role: "user",
          content: [
            // The API expects image_url to be an object carrying the URL.
            { type: "image_url", image_url: { url: base64Image } },
            {
              type: "text",
              text: [
                "Here's the screenshot of the website you are on right now.",
                'You can click on links with {"click": "Link text"}.',
                'Or you can crawl to another URL if this one is incorrect with {"url": "url goes here"}.',
                "If you find the answer to the user's question, you can respond normally.",
              ].join("\n"),
            },
          ],
        });
      }

      const response = await openai.chat.completions.create({
        model: "gpt-4-vision-preview",
        max_tokens: 1024,
        messages,
      });

      // Fall back to an empty string so a null content cannot break the
      // string checks performed on the reply.
      const responseText = response.choices[0].message.content ?? "";
      messages.push({ role: "assistant", content: responseText });
      console.log(`GPT: ${responseText}`);

      const screenShotOf = await handleAssistantResponseSS(page, messages, responseText);
      if (!screenShotOf) {
        // The model gave a plain answer (logged above) or the action failed,
        // so the next loop needs a text prompt from the user.
        // To start every question from a clean history instead, reset the
        // conversation to only the system message at this point.
        needsUserPrompt = true;
      }
    }
  } finally {
    // Release the browser process if the session ends with an error.
    await browser.close();
  }
}
/**
 * Reads an image file and encodes it as a base64 JPEG data URL.
 *
 * @param {string} imageFile - Path of the file to read.
 * @returns {Promise<string>} The contents as `data:image/jpeg;base64,...`.
 * @throws Re-throws the underlying read error after logging it.
 */
async function imageToBase64(imageFile) {
  try {
    const fileBytes = await fs.promises.readFile(imageFile);
    return `data:image/jpeg;base64,${fileBytes.toString('base64')}`;
  } catch (error) {
    console.error('Error reading the file:', error);
    throw error;
  }
}
/**
 * Prompts on the terminal and waits for one line of user input.
 *
 * @param {string} text - Prompt written to stdout before reading.
 * @returns {Promise<string>} The line the user entered.
 */
async function input(text) {
  const terminal = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise((resolve) => {
    terminal.question(text, (answer) => {
      // Close the interface before resolving so stdin is released between prompts.
      terminal.close();
      resolve(answer);
    });
  });
}
/**
 * Returns a promise that resolves after the given delay.
 *
 * @param {number} milliseconds - Delay before the promise resolves.
 * @returns {Promise<void>}
 */
const sleep = (milliseconds) =>
  new Promise((resolve) => {
    setTimeout(resolve, milliseconds);
  });
/**
 * Removes an attribute from a DOM element.
 *
 * @param {Element} element - Element to modify.
 * @param {string} attributeName - Name of the attribute to remove.
 * @returns {void}
 */
const removeAttribute = (element, attributeName) => {
  element.removeAttribute(attributeName);
};
/**
 * Checks whether an element is visible on the page.
 *
 * An element counts as visible when neither it nor any ancestor is hidden by
 * its computed style, and its bounding box lies fully inside the viewport.
 * The function relies only on browser globals (`window`, `document`), so it
 * has to run in the page context.
 *
 * @param {Element|null|undefined} el - Element to test.
 * @returns {boolean} True when the element is considered visible.
 */
const isElementVisible = (el) => {
  if (!el) return false;

  // NOTE(review): computed lengths are normally serialized with a unit
  // (e.g. "0px"), so the width/height comparisons against '0' may never
  // match — confirm the intended behavior before tightening them.
  const isStyleVisible = (node) => {
    const style = window.getComputedStyle(node);
    return (
      style.width !== '0' &&
      style.height !== '0' &&
      style.opacity !== '0' &&
      style.display !== 'none' &&
      style.visibility !== 'hidden'
    );
  };

  // Walk from the element itself up through its ancestors; one hidden node
  // hides everything beneath it.
  for (let node = el; node; node = node.parentElement) {
    if (!isStyleVisible(node)) return false;
  }

  // Finally, require the element to lie entirely within the viewport.
  const rect = el.getBoundingClientRect();
  return (
    rect.top >= 0 &&
    rect.left >= 0 &&
    rect.bottom <= (window.innerHeight || document.documentElement.clientHeight) &&
    rect.right <= (window.innerWidth || document.documentElement.clientWidth)
  );
};
/**
 * Outlines one element in red and, when it is large enough and visible, tags
 * it with a `gpt-link-text` attribute holding its sanitized text.
 *
 * Callbacks passed to `page.evaluate` are serialized and run inside the page,
 * where Node-side helpers do not exist. The visibility check is therefore
 * evaluated as its own page function instead of being referenced from inside
 * another callback.
 *
 * @param {object} page - Puppeteer page that owns the element.
 * @param {object} element - Puppeteer element handle to highlight.
 * @returns {Promise<void>}
 */
async function highlightElement(page, element) {
  // Draw the outline first so it shows up in the screenshot even for
  // elements that end up not being tagged.
  const isLargeEnough = await page.evaluate((node) => {
    node.style.border = "1px solid red";
    const { width, height } = node.getBoundingClientRect();
    return width > 5 && height > 5;
  }, element);
  if (!isLargeEnough) return;

  // The function itself is shipped to the page, so it runs against the DOM node.
  const isVisible = await page.evaluate(isElementVisible, element);
  if (!isVisible) return;

  await page.evaluate((node) => {
    // Keep only letters, digits and spaces so the text can be matched later
    // against what the model reads in the screenshot.
    const linkText = node.textContent.replace(/[^a-zA-Z0-9 ]/g, '');
    node.setAttribute("gpt-link-text", linkText);
  }, element);
}

/**
 * Re-tags every interactive element on the page.
 *
 * Tags from a previous pass are cleared first so stale link text cannot be
 * matched, then each candidate element goes through highlightElement.
 *
 * @param {object} page - Puppeteer page to process.
 * @returns {Promise<void>}
 */
async function highlightLinks(page) {
  await page.$$eval('[gpt-link-text]', (tagged) => {
    for (const node of tagged) {
      node.removeAttribute('gpt-link-text');
    }
  });

  const elements = await page.$$('a, button, input, textarea, [role=button], [role=treeitem]');
  for (const element of elements) {
    await highlightElement(page, element);
  }
}
/**
 * Clicks the tagged element whose `gpt-link-text` matches the given text.
 *
 * An exact match is clicked as soon as it is found. Otherwise the last
 * element whose tag contains the text is clicked.
 *
 * @param {object} page - Puppeteer page to search.
 * @param {string} linkText - Sanitized link text to look for.
 * @returns {Promise<void>}
 * @throws {Error} "Can't find link" when no tagged element matches.
 */
async function clickElement(page, linkText) {
  const elements = await page.$$('[gpt-link-text]');

  let partial;
  for (const element of elements) {
    // Element handles expose no getAttribute(); the attribute has to be read
    // inside the page from the underlying DOM node.
    const attributeValue = await element.evaluate((node) => node.getAttribute('gpt-link-text'));

    if (attributeValue === linkText) {
      await element.click();
      return;
    }
    if (attributeValue !== null && attributeValue.includes(linkText)) {
      partial = element;
    }
  }

  if (!partial) {
    throw new Error("Can't find link");
  }
  await partial.click();
}
/**
 * Performs the click requested in an assistant message and takes a new
 * screenshot of the resulting page.
 *
 * @param {object} page - Puppeteer page to act on.
 * @param {Array<object>} messages - Conversation history; receives an error
 *   message when the click fails.
 * @param {string} messageText - Assistant reply containing `{"click": "..."}`.
 * @returns {Promise<string|false>} The sanitized link text on success, or
 *   false when the click (or the follow-up processing) failed.
 */
async function handleLinkClickSS(page, messages, messageText) {
  // Sanitize the same way the link tags were sanitized so the texts can match.
  const linkText = messageText
    .split('{"click": "')[1]
    .split('"}')[0]
    .replace(/[^a-zA-Z0-9 ]/g, '');
  console.log(`Clicking on ${linkText}`);

  try {
    // Arm the navigation wait before clicking so a fast navigation is not
    // missed. A click that never navigates just runs into the timeout, which
    // is deliberately ignored.
    const navigation = page
      .waitForNavigation({ waitUntil: 'load', timeout })
      .catch(() => null);
    await clickElement(page, linkText);
    await navigation;

    await highlightLinks(page);
    await page.screenshot({ path: "screenshot.jpg", quality: 100 });
    return linkText;
  } catch (error) {
    console.log("ERROR: Clicking failed");
    messages.push({ role: "user", content: "ERROR: I was unable to click that element" });
    return false;
  }
}
/**
 * Navigates the page to a URL, tags its links and takes a screenshot.
 *
 * @param {object} page - Puppeteer page to navigate.
 * @param {Array<object>} messages - Conversation history; receives an error
 *   message when navigation fails.
 * @param {string} url - Address to open.
 * @returns {Promise<string|false>} The URL on success, or false when the page
 *   could not be loaded or captured.
 */
async function navigateToUrl(page, messages, url) {
  console.log(`Crawling ${url}`);

  try {
    await page.goto(url, { waitUntil: "domcontentloaded" });

    // Give the page up to `timeout` ms to finish loading. Slow pages are
    // captured as they are, so a failed or timed-out wait is ignored.
    await page
      .waitForFunction(() => document.readyState === 'complete', { timeout })
      .catch(() => null);

    // One pass after loading is enough: each pass clears and rebuilds the tags.
    await highlightLinks(page);
    await page.screenshot({ path: "screenshot.jpg", quality: 100 });
    return url;
  } catch (error) {
    // Report the failure the same way click failures are reported, instead of
    // letting one bad URL end the whole session.
    console.log(`ERROR: Navigation failed: ${error.message}`);
    messages.push({ role: "user", content: "ERROR: I was unable to load that URL" });
    return false;
  }
}
/**
 * Extracts the URL from an assistant message and navigates to it.
 *
 * @param {object} page - Puppeteer page to navigate.
 * @param {Array<object>} messages - Conversation history, passed through to
 *   navigateToUrl.
 * @param {string} messageText - Assistant reply containing `{"url": "..."}`.
 * @returns {Promise<*>} Whatever navigateToUrl returns.
 */
async function handleUrlNavigationSS(page, messages, messageText) {
  // Take the text between the opening marker and the first closing `"}`.
  const [, afterMarker] = messageText.split('{"url": "');
  const [url] = afterMarker.split('"}');
  return await navigateToUrl(page, messages, url);
}
/**
 * Performs the browser action requested by an assistant reply, if any.
 *
 * A click request takes precedence over a URL request when both markers
 * appear in the same reply.
 *
 * @param {object} page - Puppeteer page to act on.
 * @param {Array<object>} messages - Conversation history, passed through to
 *   the action handlers.
 * @param {string} responseText - Text of the assistant reply.
 * @returns {Promise<*>} The handler's result when an action was requested, or
 *   false when the reply contains no action.
 */
async function handleAssistantResponseSS(page, messages, responseText) {
  if (responseText.includes('{"click": "')) {
    return await handleLinkClickSS(page, messages, responseText);
  }

  if (responseText.includes('{"url": "')) {
    return await handleUrlNavigationSS(page, messages, responseText);
  }

  return false;
}
// System prompt sent as the first message of every conversation. It defines
// the two JSON actions the model may answer with ({"url": ...} and
// {"click": ...}); those markers are what the response handlers search for,
// so the wording of the JSON examples must stay in sync with them.
const systemMessage = `
You are a website crawler. You will be given instructions on what to do by browsing.
You are connected to a web browser and you will be given the screenshot of the website you are on.
The links on the website will be highlighted in red in the screenshot. Always read exactly what is in the screenshot.
Don't guess link names.
You can go to a specific URL by answering with the following JSON format:
{"url": "url goes here"}
You can click links on the website by referencing the text inside of the link/button, by answering in the following JSON format:
{"click": "Text in link"}
Once you are on a URL and you have found the answer to the user's question, you can answer with a regular message.
In the beginning, go directly to URL that you think might contain the answer to the user's question.
Prefer to go directly to sub-urls like 'https://google.com/search?q=search' if possible.
Prefer to use Google for simple queries.
If the user message provides a direct URL, always answer by going to that one instead.`;
// Credit: https://www.youtube.com/watch?v=VeQR17k7fiU