Created
July 27, 2014 16:08
-
-
Save joepie91/b4ae6b4bab694da44e56 to your computer and use it in GitHub Desktop.
AnonNews stuff
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
S = require "string"
Promise = require "bluebird"
request = require "request"
libxml = require "libxmljs"
longest = require "longest"
# Shared helpers: text shortening/cleanup, page metadata scraping and
# HTTP-style error factories.
module.exports =
  # Shorten `input` to at most `length` characters, cutting at the last
  # non-word character boundary that fits, and append "...".
  # Inputs that are not longer than `length` are returned unchanged.
  createTeaser: (input, length) ->
    return input unless input.length > length
    # Normalise to a non-negative integer so the quantifier below is always valid.
    limit = Math.max(0, Math.floor(length))
    # [\s\S] matches every character, including line terminators.
    boundary = new RegExp("^([\\s\\S]{0,#{limit}})(?:\\W|$)")
    match = boundary.exec(input)
    # No boundary within range (e.g. a single very long word): hard-cut
    # instead of dereferencing a null match.
    head = if match? then match[1] else input.slice(0, limit)
    return head + "..."

  # Strip HTML tags from `input`, turning every <br> and </p> into a space
  # first so that adjacent lines/paragraphs do not run together.
  justText: (input) ->
    # Both patterns are global: a string pattern (or a regex without `g`)
    # would only replace the first occurrence.
    spaced = input.replace(/<br\s*\/?>/gi, " ").replace(/<\/p>/gi, " ")
    return S(spaced).stripTags().s

  # Fetch `url` and extract title/image/description/sitename from the HTML.
  # Returns a promise resolving to an object holding only the fields for which
  # a usable value was found; rejects on request failure or when parsing or
  # querying the document throws.
  getMetadata: (url) ->
    # XPath candidates per field, tried in order; the first usable hit wins.
    selectors =
      title: [
        "//meta[@property='twitter:title']/@content"
        "//meta[@property='og:title']/@content"
        "//*[@itemprop='headline']/text()"
        "//h1[contains(@class, 'title')]/text()"
        "//h2[contains(@class, 'title')]/text()"
        "//title/text()"
      ]
      image: [
        "//meta[@property='twitter:image']/@content"
        "//meta[@property='og:image']/@content"
        "//img[@itemprop='image']/@src"
        "//img[contains(@class, 'size-full')]/@src"
        "(//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or @id='body' or @id='entry']//img)[1]/@src"
      ]
      description: [
        "//meta[@property='twitter:description']/@content"
        "//meta[@property='og:description']/@content"
        "//*[@itemprop='description']/text()"
        "//*[@itemprop='articleBody']/text()"
        "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']/text()"
        "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']//p[not(contains(@class, 'wptl'))]/text()"
        "//div[@class='entry']//p/text()"
      ]
      sitename: [
        "//meta[@property='og:site_name']/@content"
        "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@alt"
        "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@title"
        "//h1//img/@alt"
      ]

    # Non-capturing group: with a capturing group, split() would also return
    # the separators themselves as candidates for `longest`.
    title_separators = /(?:\s[|:\/-]\s|»|\s>\s)/

    # Trimmed string content of a result node, using whichever accessor
    # (text / value / toString) the node exposes.
    nodeText = (node) ->
      if node.text?
        raw = node.text()
      else if node.value?
        raw = node.value()
      else
        raw = node.toString()
      # Defensive: treat a null/undefined accessor result as empty.
      return String(raw ? "").trim()

    # First value longer than 5 characters produced by the field's selectors,
    # or undefined when none of them yields one.
    firstUsableValue = (doc, field) ->
      for selector in selectors[field]
        for node in (doc.find(selector) ? [])
          value = nodeText(node)
          continue unless value.length > 5
          value = value.replace(/\r?\n/g, " ")
          # Explicit parentheses: `value.search title_separators > -1` would
          # parse as `value.search(title_separators > -1)`.
          if field == "title" and value.search(title_separators) > -1
            # Keep the longest segment of a separator-delimited title.
            value = longest value.split(title_separators)
          return value
      return undefined

    return new Promise (resolve, reject) ->
      request url, (err, resp, body) ->
        return reject(err) if err?
        try
          doc = libxml.parseHtmlString(body)
          metadata = {}
          for field of selectors
            value = firstUsableValue(doc, field)
            metadata[field] = value if value?
          resolve metadata
        catch parseErr
          # Exceptions thrown inside the request callback would otherwise leave
          # the promise unsettled.
          reject parseErr

  # Build (not throw) an Error with `status` 404 and the given message.
  NotFound: (message = "The requested page could not be found.") ->
    err = new Error(message)
    err.status = 404
    return err

  # Build (not throw) an Error with `status` 400 and the given message.
  InputError: (message = "One or more required input fields were missing.") ->
    err = new Error(message)
    err.status = 400
    return err
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment