Created
June 15, 2024 22:17
-
-
Save Darep/46c0aa7f65753291f7140e4c09f8405c to your computer and use it in GitHub Desktop.
Scrape Flockler site articles and/or images to local files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Scrape Flockler site articles and/or images to local files.
 *
 * Usage:
 *   $ bun fl-scrape.ts --get-articles --get-images
 */
// Numeric ID of the Flockler site to scrape; change this to your own site's ID.
// It is interpolated into the articles API URL and used to recognise cover
// images that belong to this site.
const SITE_ID = 1;

// Articles currently held in memory. Filled by getArticles(), or reloaded from
// flockler-export.json by getImages() when asked to read from the file.
let articles: { cover_url: string; body: string }[] = [];
/**
 * Fetch every article of the configured Flockler site and save them to
 * `flockler-export.json`.
 *
 * Starts at the site's article listing and keeps following the
 * `pagination.older` URL of each response until a page no longer provides
 * one. On success the collected articles replace the module-level `articles`
 * array and are written to disk as pretty-printed JSON.
 *
 * @returns A promise that resolves once the export file has been written.
 * @throws Error if any page request answers with a non-2xx HTTP status.
 */
async function getArticles(): Promise<void> {
  // Shape of the parts of an API page this function reads. The API response
  // is external input, so both fields are treated as possibly missing.
  type ArticlesPage = {
    articles?: typeof articles;
    pagination?: { older?: string | null };
  };

  let collected: typeof articles = [];
  let pageUrl: string | null =
    `https://api.flockler.com/v1/sites/${SITE_ID}/articles`;

  while (pageUrl) {
    console.log(pageUrl);

    const response = await fetch(pageUrl);
    if (!response.ok) {
      // Fail loudly instead of parsing an error page as article data.
      throw new Error(
        `request failed (${response.status} ${response.statusText}): ${pageUrl}`
      );
    }

    const page = (await response.json()) as ArticlesPage;
    collected = collected.concat(page.articles ?? []);

    // A missing or empty `older` link ends the pagination loop.
    pageUrl = page.pagination?.older ?? null;
  }

  // Only publish the result once every page was fetched successfully.
  articles = collected;
  await Bun.write('flockler-export.json', JSON.stringify(articles, null, 2));
}
/**
 * Remove thumbnail modifiers from a Flockler thumb URL or file name.
 *
 * Strips the first occurrence of each of: `_<s|c|l|m><W>x<H>`, a following
 * `_m<W>x<H>`, `_q<N>`, `_noupscale` and `_contain`.
 * NOTE(review): these presumably encode size / quality / fit options of the
 * thumbnail service; confirm against Flockler's documentation.
 */
function stripThumbModifiers(value: string): string {
  return value
    .replace(/_(s|c|l|m)\d+x\d+/, '')
    .replace(/_m\d+x\d+/, '')
    .replace(/_q\d+/, '')
    .replace('_noupscale', '')
    .replace('_contain', '');
}

/**
 * Fetch `url` and write the response body to `path`.
 *
 * @returns `true` when the file was written, `false` when the server answered
 *   with a non-2xx status (an error is logged and nothing is written).
 */
async function fetchToFile(url: string, path: string): Promise<boolean> {
  const response = await fetch(url);
  if (!response.ok) {
    // Never store an HTTP error page under an image file name.
    console.error(
      `error: ${response.status} ${response.statusText} for url`,
      url
    );
    return false;
  }
  await Bun.write(path, response);
  return true;
}

/**
 * Download a Flockler thumbnail and the image it was derived from.
 *
 * The thumbnail is saved under `images/thumbs/` with its file name as found in
 * the URL. The same URL and file name with the thumbnail modifiers removed are
 * used to fetch and save a second copy under `images/files/`.
 *
 * URLs that do not match `https://flockler.com/thumbs/sites/<id>/<file>` are
 * reported on stderr and skipped; so are downloads that return a non-2xx
 * status.
 *
 * @param url Thumbnail URL as it appears in an article.
 * @returns A promise that resolves when both download attempts have finished.
 */
async function downloadImage(url: string): Promise<void> {
  const filename = url.match(
    /https:\/\/flockler\.com\/thumbs\/sites\/\d+\/([^\/"]+)/
  )?.[1];
  if (!filename) {
    console.error('error: no filename for url', url);
    return;
  }

  const origUrl = stripThumbModifiers(url);
  const origFilename = stripThumbModifiers(filename);

  console.log(`Downloading ${filename}`);

  const thumbPath = `images/thumbs/${filename}`;
  if (await fetchToFile(url, thumbPath)) {
    console.log(`Saved thumb to ${thumbPath}`);
  }

  // Attempted even if the thumbnail failed: the two downloads are independent.
  const origPath = `images/files/${origFilename}`;
  if (await fetchToFile(origUrl, origPath)) {
    console.log(`Saved original to ${origPath}`);
  }
}
/**
 * Download the cover image and all embedded thumbnail images of every article.
 *
 * Cover images are only downloaded when their URL contains
 * `flockler.com/thumbs/sites/<SITE_ID>`; article bodies are scanned for any
 * `https://flockler.com/thumbs/sites/<id>/<file>` URL. Each distinct URL is
 * downloaded once, one after another, and the returned promise resolves only
 * after all downloads have been attempted.
 *
 * @param useFile When true, articles are first (re)loaded from
 *   `flockler-export.json`; otherwise the articles already held in memory are
 *   used.
 * @returns A promise that resolves when every download has been attempted.
 */
async function getImages(useFile: boolean): Promise<void> {
  if (useFile) {
    const file = Bun.file('flockler-export.json');
    articles = JSON.parse(await file.text());
  }

  // An empty array is truthy, so check the length (and the shape, since the
  // value may come straight from a JSON file).
  if (!Array.isArray(articles) || articles.length === 0) {
    console.error('error: no articles!');
    return;
  }

  const thumbUrlPattern =
    /https:\/\/flockler\.com\/thumbs\/sites\/\d+\/([^\/"]+)/g;

  // A Set keeps the same image from being fetched and written several times
  // when it is referenced by more than one article.
  const imageUrls = new Set<string>();

  for (const article of articles) {
    if (
      typeof article.cover_url === 'string' &&
      article.cover_url.includes(`flockler.com/thumbs/sites/${SITE_ID}`)
    ) {
      imageUrls.add(article.cover_url);
    }

    // Exported articles may lack a body; treat that as "no embedded images".
    const body = typeof article.body === 'string' ? article.body : '';
    for (const match of body.matchAll(thumbUrlPattern)) {
      imageUrls.add(match[0]);
    }
  }

  for (const imageUrl of imageUrls) {
    try {
      await downloadImage(imageUrl);
    } catch (err: unknown) {
      // One failed image must not abort the rest of the export.
      console.error('error: download failed for url', imageUrl, err);
    }
  }
}
// Command-line flags: --get-articles fetches and saves the article export,
// --get-images downloads the images referenced by the articles.
const args = Bun.argv;
const shouldGetArticles = args.includes('--get-articles');
const shouldGetImages = args.includes('--get-images');

/**
 * Run the actions selected on the command line, in order.
 *
 * Each step is awaited so that "Done!" is printed only after the step has
 * actually finished, and so that the image step sees the articles fetched by
 * the article step when both flags are given. When only --get-images is
 * given, the articles are read from the previously saved export file.
 */
async function run(): Promise<void> {
  if (shouldGetArticles) {
    console.log('Getting articles');
    await getArticles();
    console.log('Done!');
  }

  if (shouldGetImages) {
    console.log('Getting images');
    await getImages(!shouldGetArticles);
    console.log('Done!');
  }

  if (!shouldGetArticles && !shouldGetImages) {
    console.log('No action specified. Use --get-articles and --get-images');
  }
}

// Failures are deliberately not caught here: a rejection is left for the
// runtime to report.
void run();

export {};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment