Last active
February 9, 2017 09:23
-
-
Save qdequele/f0f8f53d3f28b3cd6aef to your computer and use it in GitHub Desktop.
Universal media title/image scraper built on scraperjs, with a Promise-based API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Bundle: Helpers - Scraper
 * Project: Readlist - Server
 * Author: Quentin de Quelen <[email protected]>
 * Copyright: 2015, Readlist
 */
/*
 *
 * Requires the 'scraperjs' and 'url' modules.
 */
"use strict"; | |
const scraperjs = require('scraperjs'); | |
const url = require('url'); | |
exports.scrapUrl = function(url) { | |
var res = { | |
"title": '', | |
"image": '' | |
} | |
var scrapFacebook = function($) { | |
return new Promise(function(resolve, reject) { | |
try { | |
res.title = $('meta[property="og:title"]').attr('content'); | |
if ($('meta[property="og:image"]').attr('content').length > 0) | |
res.image = $('meta[property="og:image"]').attr('content'); | |
reject(); | |
} catch (err) { | |
resolve($); | |
} | |
}); | |
}; | |
var scrapTwitter = function($) { | |
return new Promise(function(resolve, reject) { | |
try { | |
res.title = $('meta[property="twitter:title"]').attr('content'); | |
if ($('meta[property="twitter:image"]').attr('content').length > 0) | |
res.image = $('meta[property="twitter:image"]').attr('content'); | |
reject(); | |
} catch (err) { | |
resolve($); | |
} | |
}); | |
}; | |
var scrapArticleH1 = function($) { | |
return new Promise(function(resolve, reject) { | |
try { | |
res.title = $('.article h1').text(); | |
res.image = $('.article img').attr("src"); | |
if (res.title.length == 0) resolve($); | |
else reject(); | |
} catch (err) { | |
resolve($); | |
} | |
}); | |
}; | |
var scrapArticleH2 = function($) { | |
return new Promise(function(resolve, reject) { | |
try { | |
res.title = $('article h2').text(); | |
res.image = $('article img').attr("src"); | |
if (res.title.length == 0) resolve($); | |
else reject(); | |
} catch (err) { | |
resolve($); | |
} | |
}); | |
}; | |
var scrapMainH2 = function($) { | |
return new Promise(function(resolve, reject) { | |
try { | |
res.title = $('#main h2').text(); | |
res.image = $('#main img').attr("src"); | |
if (res.title.length == 0) resolve($); | |
else reject(); | |
} catch (err) { | |
resolve($); | |
} | |
}); | |
}; | |
return new Promise(function(resolve, reject) { | |
scraperjs.StaticScraper.create(url) | |
.scrape(function($) { | |
scrapFacebook($) | |
.then(scrapFacebook) | |
.then(scrapTwitter) | |
.then(scrapArticleH1) | |
.then(scrapArticleH2) | |
.then(scrapMainH2) | |
.then(function($) { | |
reject("Article not found"); | |
}) | |
.catch(function() { | |
resolve(res); | |
}); | |
}); | |
}); | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment