Browserless.io Next.js Edge Route for scraping website content
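The handler below expects a JSON POST body with a url and an optional keyword, and it reads a BROWSERLESS_API_TOKEN environment variable. A minimal sketch of calling it from client code, assuming the file is saved as app/api/scrape/route.ts so the route is served at /api/scrape (the gist does not specify the path):

// Hypothetical client call; the /api/scrape path assumes the handler below
// is saved as app/api/scrape/route.ts, which the gist does not specify.
const res = await fetch('/api/scrape', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://example.com', // page to scrape
    keyword: 'example',         // optional
  }),
});
const websiteInfo = await res.json(); // shaped like SWebsiteInfoOutput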
import { SScrapingResult, SWebsiteInfoInput, SWebsiteInfoOutput } from '@lib/zod-models';
import { NextResponse } from 'next/server';
import { z } from 'zod';

export const runtime = 'edge';

export async function POST(request: Request) {
  const data = await request.json();
  const startTime = Date.now();

  // Validate the incoming request body ({ url, keyword? }).
  const parsedData = SWebsiteInfoInput.parse(data);

  const apiToken = process.env.BROWSERLESS_API_TOKEN;
  if (!apiToken) throw new Error('No BROWSERLESS_API_TOKEN environment variable set');

  const url = `https://chrome.browserless.io/scrape?token=${apiToken}`;
  const scrapingUrl = parsedData.url;
  const keyword = parsedData.keyword;

  // Ask browserless.io to extract the page body, title, Open Graph tags and
  // favicon link in a single /scrape request.
  const body = {
    url: scrapingUrl,
    elements: [
      { selector: 'body', timeout: 0 },
      { selector: 'title', timeout: 0 },
      { selector: "meta[property='og:title']", timeout: 0 },
      { selector: "meta[property='og:description']", timeout: 0 },
      { selector: "meta[property='og:image']", timeout: 0 },
      { selector: "link[rel='icon']", timeout: 0 },
    ],
  };

  const response = await fetch(url, {
    method: 'POST',
    body: JSON.stringify(body),
    headers: { 'Content-Type': 'application/json' },
  });

  console.log(`Fetch completed in ${(Date.now() - startTime) / 1000} seconds`);

  if (!response.ok) throw new Error('Error in fetch request');

  const result = await response.json();

  // Flatten the per-selector scraping results into the SWebsiteInfoOutput shape.
  function transformToWebsiteInfoOutput(parsedResult: z.infer<typeof SScrapingResult>) {
    // Initialize empty result
    const output: Partial<z.infer<typeof SWebsiteInfoOutput>> = {};

    // Loop over each data item in parsedResult
    for (const item of parsedResult.data) {
      if (item.selector === 'body') {
        output.bodyText = item.results[0]?.text;
      } else if (item.selector === 'title') {
        output.pageTitle = item.results[0]?.text;
      } else if (item.selector === "link[rel='icon']") {
        // The favicon URL is carried by the link element's href attribute.
        const href = item.results[0]?.attributes?.find((a) => a.name === 'href');
        if (href) output.faviconImageUrl = href.value;
      } else {
        // The Open Graph meta tags carry their value in the content attribute.
        const attr = item.results[0]?.attributes?.find((a) => a.name === 'content');
        if (attr) {
          if (item.selector === "meta[property='og:title']") {
            output.metaTitle = attr.value;
          } else if (item.selector === "meta[property='og:description']") {
            output.metaDescription = attr.value;
          } else if (item.selector === "meta[property='og:image']") {
            output.metaImageUrl = attr.value;
          }
        }
      }
    }

    output.url = scrapingUrl;
    if (keyword) output.keyword = keyword;
    return output;
  }

  // Parse the result into our SScrapingResult schema
  const parsedResult = SScrapingResult.parse(result);

  // Transform the parsed result into our target SWebsiteInfoOutput schema
  const transformedResult = transformToWebsiteInfoOutput(parsedResult);

  // Use SWebsiteInfoOutput to parse and validate the transformed result
  const websiteInfoOutput = SWebsiteInfoOutput.parse(transformedResult);
  // console.log(JSON.stringify(websiteInfoOutput, null, 2));

  return NextResponse.json(websiteInfoOutput);
}
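The three schemas imported from '@lib/zod-models' are not included in this gist. A rough sketch of what they might look like, inferred only from how the handler reads them; field names follow the code above, while optionality and validation choices are assumptions:

// zod-models.ts -- a minimal sketch of the imported schemas, inferred from
// their usage in the route above. The real definitions are not in the gist,
// so optionality and validation rules here are guesses.
import { z } from 'zod';

// Input accepted by the route: the page to scrape plus an optional keyword.
export const SWebsiteInfoInput = z.object({
  url: z.string().url(),
  keyword: z.string().optional(),
});

// Shape of the browserless.io /scrape response that the route relies on:
// one entry per requested selector, each with its matched elements.
export const SScrapingResult = z.object({
  data: z.array(
    z.object({
      selector: z.string(),
      results: z.array(
        z.object({
          text: z.string().optional(),
          attributes: z
            .array(z.object({ name: z.string(), value: z.string() }))
            .optional(),
        }),
      ),
    }),
  ),
});

// Flattened output returned to the caller.
export const SWebsiteInfoOutput = z.object({
  url: z.string(),
  keyword: z.string().optional(),
  bodyText: z.string().optional(),
  pageTitle: z.string().optional(),
  metaTitle: z.string().optional(),
  metaDescription: z.string().optional(),
  metaImageUrl: z.string().optional(),
  faviconImageUrl: z.string().optional(),
});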