Skip to content

Instantly share code, notes, and snippets.

@otarza
Created December 26, 2016 13:27
Show Gist options
  • Save otarza/7f2332cc412adccd5ee639cf91841e82 to your computer and use it in GitHub Desktop.
Save otarza/7f2332cc412adccd5ee639cf91841e82 to your computer and use it in GitHub Desktop.
/**
 * Scrapes every URL sequentially (the scrapes are *started* eagerly in the
 * `map`, but their results are consumed one at a time via the `reduce` chain),
 * then recursively re-runs itself on the URLs whose scrape rejected.
 *
 * Relies on a free `scrapeIt(url, parserTemplate)` function being in scope
 * that returns a promise of the scraped URI list.
 *
 * NOTE(review): a URL that fails on every attempt makes this recurse forever —
 * there is no attempt cap. Confirm whether that is intended.
 *
 * @param {string[]} urls - URLs to scrape.
 * @param {*} parserTemplate - Template forwarded to `scrapeIt`.
 * @returns {Promise<void>} Resolves when this pass (and any retry passes) settle.
 */
function fetchByURLs_$(urls, parserTemplate) {
  // Fix: `errored` was never declared in the original, which threw a
  // ReferenceError on the first rejected scrape.
  const errored = [];

  // Returning the chain lets callers await completion (original returned undefined).
  return urls
    .map((url) => ({
      url,
      urisListPromise: scrapeIt(url, parserTemplate), // all scrapes start now
    }))
    .reduce(
      (sequence, item) =>
        sequence
          .then(() => item.urisListPromise)
          .then(
            (urisList) => {
              // TODO: DB insert
            },
            (reason) => {
              // Remember the failing URL so it can be retried in the next pass.
              errored.push(item.url);
            }
          ),
      Promise.resolve()
    )
    .then(() => {
      console.log("All done");
    })
    .catch((err) => {
      console.log("Oops: " + err.message);
    })
    .then(() => {
      // TODO: DB close!
      if (errored.length) {
        return fetchByURLs_$(errored, parserTemplate);
      }
    });
}
@safareli
Copy link

safareli commented Dec 27, 2016

/**
 * Scrapes `urls` with at most `maxThreads` concurrent requests, retrying each
 * URL up to `maxAttempts` times (pass `+Infinity` for unlimited retries), then
 * calls `done({ results, fails })` exactly once when everything has settled.
 *
 * @param {number} maxThreads - Maximum number of concurrent scrapes.
 * @param {number} maxAttempts - Attempts per URL before it is recorded in `fails`.
 * @param {string[]} urls - URLs to scrape.
 * @param {(url: string) => Promise<*>} scrape - Performs one scrape attempt.
 * @param {(res: {results: Object, fails: Object}) => void} done - Completion
 *   callback; `results` maps url -> scraped value, `fails` maps url -> last error.
 */
function fetchByURLs(maxThreads, maxAttempts, urls, scrape, done) {
  const results = {}; // url -> scraped result
  const fails = {};   // url -> last error after attempts were exhausted
  // Fix: original called undefined helpers `take`/`drop` and misspelled the
  // parameter as `threadsMax`; plain slices do the same job.
  const rest = urls.slice(maxThreads); // URLs waiting for a free slot
  let finished = false; // guards against calling `done` more than once
  let threads = [];

  const removeThread = (thread) => {
    // Fix: `thread.idx` goes stale after earlier splices; search by identity.
    const i = threads.indexOf(thread);
    if (i !== -1) threads.splice(i, 1);
  };

  // Starts one scrape "thread" and returns its descriptor. Fix: the original
  // handlers referenced `t` which was never defined — the descriptor is now
  // created BEFORE the promise so the handlers can close over it.
  const scrapeURL = (attemptsLeft, url, idx) => {
    const t = { url, idx, attempt: attemptsLeft }; // fix: was hard-coded `attempt: 0`
    t.promise = scrape(url)
      .then(
        (result) => {
          results[t.url] = result;
          removeThread(t);
          if (rest.length > 0) {
            // Hand the freed slot to the next waiting URL.
            // Fix: was `rest.unShift()` — not a method, and `unshift` adds.
            threads.push(scrapeURL(maxAttempts, rest.shift(), threads.length));
          }
          return result;
        },
        (error) => {
          if (t.attempt - 1 <= 0) {
            // Attempts exhausted: record the failure permanently.
            fails[t.url] = error;
            removeThread(t);
          } else {
            // Retry in place with one fewer attempt.
            const pos = threads.indexOf(t);
            const retry = scrapeURL(t.attempt - 1, t.url, t.idx);
            if (pos !== -1) {
              threads[pos] = retry;
            } else {
              threads.push(retry);
            }
          }
        }
      )
      .then(() => {
        // Runs after either branch above (portable `.finally` substitute;
        // both handlers recover, so this chain never rejects).
        // Fix: `done` could fire twice when two threads settled in the same
        // tick, and `resuls` was a typo.
        if (!finished && threads.length === 0) {
          finished = true;
          done({ results, fails });
        }
      });
    return t;
  };

  threads = urls.slice(0, maxThreads).map((url, idx) => scrapeURL(maxAttempts, url, idx));

  // Fix: with no URLs at all the original never called `done`.
  if (threads.length === 0) {
    finished = true;
    done({ results, fails });
  }
}

// Example usage: scrape with up to 10 concurrent requests, retrying failed
// URLs indefinitely (+Infinity attempts). `urls`, `scrapeIt`, and
// `parserTemplate` are assumed to be defined elsewhere — TODO confirm.
fetchByURLs(10, +Infinity, urls, url => scrapeIt(url, parserTemplate), (res) => {
  // res.results — presumably url -> scraped data; verify against fetchByURLs
  // res.fails   — presumably url -> last error; verify against fetchByURLs
})

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment