Skip to content

Instantly share code, notes, and snippets.

@joepie91
Created July 27, 2014 16:08
Show Gist options
  • Save joepie91/b4ae6b4bab694da44e56 to your computer and use it in GitHub Desktop.
Save joepie91/b4ae6b4bab694da44e56 to your computer and use it in GitHub Desktop.
AnonNews stuff
S = require "string"
Promise = require "bluebird"
request = require "request"
libxml = require "libxmljs"
longest = require "longest"
module.exports =
# Shortens `input` to a teaser of at most `length` characters plus "...".
#
# input  - the text to shorten.
# length - maximum number of characters to keep before the ellipsis.
#
# Returns `input` unchanged when it is null/undefined or not longer than
# `length`. Otherwise returns the longest prefix of at most `length`
# characters that is immediately followed by a non-word character (so a word
# is not cut in half), with "..." appended. When no such boundary exists in
# the first `length + 1` characters (e.g. one very long word), the text is
# hard-cut at `length` characters instead of throwing.
createTeaser: (input, length) ->
  return input unless input? and input.length > length
  # Guard against negative or fractional limits; the scan below needs a
  # non-negative integer index.
  limit = Math.max(0, Math.floor(length))
  # Walk backwards from the limit until the character right after the cut is
  # a non-word character. `limit < input.length`, so charAt is always in range.
  cut = limit
  cut-- while cut >= 0 and not /\W/.test(input.charAt(cut))
  # No boundary found: fall back to a hard cut.
  cut = limit if cut < 0
  input.slice(0, cut) + "..."
# Converts an HTML fragment to plain text.
#
# input - HTML string.
#
# Every <br> (with or without a self-closing slash) and every closing </p>
# is replaced by a space first, so that text on either side of a line or
# paragraph break does not run together once the tags are removed. The
# remaining tags are then stripped with string.js `stripTags()`.
#
# Returns the resulting plain string.
justText: (input) ->
  # The `g` flag is essential: without it only the first match is replaced.
  # `i` covers upper-case tags such as <BR>.
  spaced = input.replace(/<br\s*\/?>/gi, " ").replace(/<\/p>/gi, " ")
  S(spaced).stripTags().s
# Fetches `url` and extracts page metadata from its HTML.
#
# url - address of the page, passed straight to `request`.
#
# Returns a Promise that resolves to an object with any of the keys `title`,
# `image`, `description` and `sitename` for which a usable value was found
# (a key is simply absent when nothing matched). The Promise is rejected
# with the request error, or with any error thrown while parsing or
# querying the document.
getMetadata: (url) ->
  # XPath candidates per field, tried in order; the first selector that
  # yields a value longer than 5 characters wins.
  selectors =
    title: [
      "//meta[@property='twitter:title']/@content",
      "//meta[@property='og:title']/@content",
      "//*[@itemprop='headline']/text()",
      "//h1[contains(@class, 'title')]/text()",
      "//h2[contains(@class, 'title')]/text()",
      "//title/text()"
    ],
    image: [
      "//meta[@property='twitter:image']/@content",
      "//meta[@property='og:image']/@content",
      "//img[@itemprop='image']/@src",
      "//img[contains(@class, 'size-full')]/@src",
      "(//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or @id='body' or @id='entry']//img)[1]/@src"
    ],
    description: [
      "//meta[@property='twitter:description']/@content",
      "//meta[@property='og:description']/@content",
      "//*[@itemprop='description']/text()",
      "//*[@itemprop='articleBody']/text()",
      "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']/text()",
      "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']//p[not(contains(@class, 'wptl'))]/text()",
      "//div[@class='entry']//p/text()"
    ],
    sitename: [
      "//meta[@property='og:site_name']/@content",
      "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@alt",
      "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@title",
      "//h1//img/@alt"
    ]

  # Separators between the parts of a page title. The group is non-capturing
  # so that `split` returns only the parts and never the separators, which
  # could otherwise be picked as the "longest" piece.
  title_separators = /(?:\s[|:\/-]\s|&raquo;|\s&gt;\s)/

  return new Promise (resolve, reject) ->
    request url, (err, resp, body) ->
      return reject(err) if err?

      # Parsing and XPath evaluation run inside the request callback; without
      # this try/catch a throw here would escape the Promise and leave it
      # pending forever.
      try
        doc = libxml.parseHtmlString(body)
        metadata = {}

        for field, candidates of selectors
          for selector in candidates
            result = doc.find(selector)
            continue unless result? and result.length > 0

            found = false
            for node in result
              # Which accessor exists depends on the kind of node the XPath
              # returned, so probe for each in turn.
              if node.text?
                value = node.text().trim()
              else if node.value?
                value = node.value().trim()
              else
                value = node.toString().trim()

              # Very short values are treated as noise and skipped.
              if value.length > 5
                found = true
                # Flatten every newline, not just the first one.
                value = value.replace(/\n/g, " ")
                # Explicit parentheses: `value.search x > -1` would parse as
                # `value.search(x > -1)`.
                if field == "title" and value.search(title_separators) > -1
                  value = longest value.split(title_separators)
                metadata[field] = value
                break

            # Stop trying further selectors for this field once one matched.
            break if found

        resolve metadata
      catch failure
        reject failure
# Builds (but does not throw) an Error describing a missing page.
#
# message - text for the error; defaults to a generic "not found" sentence.
#
# Returns an Error whose `status` is 404 and whose `message` is `message`.
# Both are assigned after construction, in that order.
NotFound: (message = "The requested page could not be found.") ->
  failure = new Error()
  [failure.status, failure.message] = [404, message]
  failure
# Builds (but does not throw) an Error describing missing input.
#
# message - text for the error; defaults to a generic "missing fields"
#           sentence.
#
# Returns an Error whose `status` is 400 and whose `message` is `message`.
# Both are assigned after construction, in that order.
InputError: (message = "One or more required input fields were missing.") ->
  failure = new Error()
  [failure.status, failure.message] = [400, message]
  failure
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment