Scripts to fetch a list of images from the Flickr API and download them
// 1. Add your API key and secret below
// 2. Run `npm i flickrapi` to install the SDK dependency
// 3. Run `node auth.js` and follow the instructions
//    (this will open a browser window; paste the number back into the terminal, then copy the entire output)
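// Note: the other scripts in this gist pull in further packages; if they are run from the
// same folder, something like this should cover all of the require() calls used below:
//   npm i flickrapi csv-parser image-downloader mkdirp nedb dotenv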
const Flickr = require('flickrapi')
const FLICKR_API_KEY = 'YOUR-API-KEY'
const FLICKR_SECRET = 'YOUR-API-SECRET'
Flickr.authenticate({
  api_key: FLICKR_API_KEY,
  secret: FLICKR_SECRET,
}, async (err, flickr) => {
  // Intentionally empty: calling authenticate() runs the interactive auth flow
  // described in the steps above; copy the output it prints once it completes
})
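// A sketch of a .env file for the listing script further down, which reads its credentials
// via dotenv (variable names are taken from that script; the values are placeholders):
//   FLICKR_API_KEY=your-api-key
//   FLICKR_SECRET=your-api-secret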
const csv = require('csv-parser')
const fs = require('fs')
const download = require('image-downloader')
const mkdirp = require('mkdirp')

const results = []

fs.createReadStream('photos.csv')
  .pipe(csv())
  .on('data', (data) => results.push(data))
  .on('end', async () => {
    for (const result of results) {
      await new Promise(resolve => {
        const { photoId, title, url } = result
        // Shard images into subdirectories named after the first two characters of the
        // photo ID, to avoid too many files ending up in a single directory
        const baseDir = `./images/${photoId[0]}${photoId[1]}`
        mkdirp.sync(baseDir)
        const dest = `${baseDir}/${photoId}.jpg`
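        // e.g. with this scheme a photoId of "49012345678" (an illustrative value)
        // is saved to ./images/49/49012345678.jpg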
        if (!fs.existsSync(dest)) {
          console.log(`Downloading image ${photoId} (${url.trim()})…`)
          download.image({
            url: url.trim(),
            dest
          })
            .then(({ filename, image }) => {
              console.log('Saved image to', dest)
              resolve(true)
            })
            .catch((err) => {
              // Resolve even on failure so one bad URL doesn't stall the whole loop
              console.error(err)
              resolve(false)
            })
        } else {
          console.log(`Skipping image ${photoId} (${url.trim()}) as already downloaded`)
          resolve(true)
        }
      })
    }
  })
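// Usage sketch for this download script (the filename is a placeholder, not part of the gist):
//   node download-images.js
// It expects a photos.csv in the working directory with at least photoId and url columns;
// the CSV block printed at the end of the listing script below has that shape.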
/**
 * Script to fetch a list of images from the Flickr API
 *
 * For Flickr API docs see:
 * https://github.com/Pomax/node-flickrapi
 * https://www.flickr.com/services/api/flickr.photos.search.html
 **/
require('dotenv').config()
const Flickr = require('flickrapi')
const Datastore = require('nedb')
// Credentials are expected to be set as environment variables, but you can hard-code them here instead
const FLICKR_API_KEY = process.env.FLICKR_API_KEY
const FLICKR_SECRET = process.env.FLICKR_SECRET
//const FLICKR_API_QUERY = { text: "paulfavs" }
//const FLICKR_API_QUERY = { tags: "paulfavs" }
const FLICKR_API_QUERY = { user_id: 'paul_clarke' }
// Option to manually skip pages below a certain number, for results we know we already have.
//
// This is useful for retrying when building a list of photos as the Flickr API can stall
// after handling a large number of requests. This provides a quick and dirty way to resume
// from a manually specified point.
//
// NB: This can be any point before the last successful request; it doesn't have to be exact, as
// duplicates are never recorded because entries are keyed on photo ID (which is unique).
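// e.g. with SKIP_PAGES_BELOW_THRESHOLD = 40 (an illustrative value) pages 1 to 40 are skipped
// and fetching resumes from page 41.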
const SKIP_PAGES_BELOW_THRESHOLD = 0
const db = new Datastore({ filename: 'photos.db', autoload: true })
const flickrOptions = {
  api_key: FLICKR_API_KEY,
  secret: FLICKR_SECRET,
  progress: false
}
Flickr.tokenOnly(flickrOptions, async (err, flickr) => {
  const query = FLICKR_API_QUERY
  const numberOfPages = await getNumberOfPhotos(flickr, query)

  if (SKIP_PAGES_BELOW_THRESHOLD > 0)
    console.log(`Skipping pages below page ${SKIP_PAGES_BELOW_THRESHOLD}…`)

  const promises = []
  for (let page = 1; page <= numberOfPages; page++) {
    if (page > SKIP_PAGES_BELOW_THRESHOLD) {
      const photosInPage = await getPhotosInPage(flickr, query, page)
      photosInPage.forEach((photo) => {
        const promise = new Promise(resolve => {
          db.findOne({ photoId: photo.id }, async (err, doc) => {
            const pageDescription = `Page ${page} of ${numberOfPages}`
            if (doc) {
              // Use existing object
              // TODO Placeholder in case we want to add additional data
              console.log(`Updating Photo #${doc.photoId} (${pageDescription}) - ${doc.title}`)
              const newDoc = doc
              db.update({ _id: doc._id }, newDoc)
            } else {
              // Add new object
              const newDoc = {
                photoId: photo.id,
                page: pageDescription,
                public: photo.ispublic === 1,
                url: await getLargestPhotoUrl(flickr, photo.id),
                title: photo.title,
                lastUpdated: new Date().toISOString()
              }
              console.log(`Added new Photo #${newDoc.photoId} (${pageDescription}) - ${newDoc.title}`)
              db.insert(newDoc)
            }
            // Resolve only after the database callback has run, so that Promise.all
            // below genuinely waits for every record to be written
            resolve(true)
          })
        })
        promises.push(promise)
      })
    }
  }
  await Promise.all(promises)
  console.log("Completed!")

  // Simple hacky console log of the entire DB once the script is complete, so the output can easily be captured manually to a CSV
  console.log(`photoId,public,url,title,lastUpdated`)
  db.find({ }, function (err, docs) {
    docs.forEach(doc => console.log(`${doc.photoId},${doc.public},${doc.url},"${doc.title}",${doc.lastUpdated}`))
  });
})
const getNumberOfPhotos = async (flickr, query) => {
  return new Promise((resolve, reject) => {
    flickr.photos.search(query, (err, result) => {
      if (err) { return reject(new Error(err)) }
      resolve(result.photos.pages)
    })
  })
}

const getPhotosInPage = async (flickr, query, page) => {
  return new Promise((resolve, reject) => {
    flickr.photos.search({
      ...query,
      page,
    }, (err, result) => {
      if (err) { return reject(new Error(err)) }
      resolve(result.photos.photo)
    })
  })
}

const getLargestPhotoUrl = async (flickr, photo_id) => {
  return new Promise((resolve, reject) => {
    flickr.photos.getSizes({ photo_id }, (err, result) => {
      if (err) { return reject(new Error(err)) }
      // The largest photo available is always the last one in the list
      // NB: Usually, but not always, it has the label 'Original'.
      const largestPhotoUrl = result.sizes.size[result.sizes.size.length - 1].source
      resolve(largestPhotoUrl)
    })
  })
}