Created
October 18, 2010 15:44
-
-
Save kwellman/632442 to your computer and use it in GitHub Desktop.
lxml_readability.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""An lxml port of Nirmal Patel's port (http://nirmalpatel.com/fcgi/hn.py) of
Arc90's Readability to Python.
"""
import re | |
from lxml.html import fromstring, tostring | |
from lxml.html.clean import Cleaner | |
# Substrings of a class/id value that suggest boilerplate rather than content.
NEGATIVE = re.compile('comment|meta|footer|footnote|foot')
# Substrings of a class/id value that suggest the main article content.
POSITIVE = re.compile('post|hentry|entry|content|text|body|article')
# Two consecutive <br> tags (optionally self-closed, optionally separated by
# whitespace). HTML tag names are case-insensitive, so <BR><BR> must match too.
BREAKS = re.compile(r'<br */? *>\s*<br */? *>', re.IGNORECASE)
def _hint_score(element):
    """Return the score adjustment implied by *element*'s class and id.

    Each of the ``class`` and ``id`` attributes contributes -50 when it
    contains a NEGATIVE hint and +25 when it contains a POSITIVE hint.
    """
    score = 0
    for attr_name in ('class', 'id'):
        # Element.get() returns None for a missing attribute; the ``in``
        # operator on an element tests for child elements, not attributes.
        value = element.get(attr_name)
        if not value:
            continue
        # search(), not match(): the hint may appear anywhere in the value
        # (e.g. "main-content", "site-footer").
        if NEGATIVE.search(value):
            score -= 50
        if POSITIVE.search(value):
            score += 25
    return score


def extract(link, html):
    """Return the text of the element most likely to hold the main content.

    Pairs of ``<br>`` tags are first converted into paragraph boundaries, the
    markup is parsed and cleaned, and every element that is the direct parent
    of a ``<p>`` is scored:

    * class/id naming hints (see ``_hint_score``), applied once per parent;
    * +1 for each child paragraph whose text is longer than 10 characters;
    * +1 for every comma in each child paragraph's text.

    Args:
        link: Base URL passed to ``make_links_absolute`` on the winning
            element; a falsy value skips that step.
        html: The page markup as a string.

    Returns:
        ``text_content()`` of the highest scoring parent (the first one
        encountered wins ties), or ``None`` when no paragraph with a parent
        element exists.
    """
    # convert text separated by breaks into individual paragraph elements
    html = BREAKS.sub('</p><p>', html)
    doc = fromstring(html)

    # remove style, meta, page structure, script, form elements
    doc = Cleaner(style=True).clean_html(doc)

    parents = []   # candidate parents in first-seen order (for tie-breaking)
    scores = {}    # parent element -> accumulated score

    # traverse paragraph elements
    for paragraph in doc.iter('p'):
        parent = paragraph.getparent()
        if parent is None:
            # a <p> that is itself the document root has nothing to score
            continue

        if parent not in scores:
            parents.append(parent)
            scores[parent] = _hint_score(parent)

        text = paragraph.text_content()

        # add points for containing a significant amount of text
        if len(text) > 10:
            scores[parent] += 1

        # add points for containing punctuation common in text content
        scores[parent] += text.count(',')

    if not parents:
        return None

    # find highest scoring element; strict '>' keeps the earliest on ties
    topparent = parents[0]
    for candidate in parents[1:]:
        if scores[candidate] > scores[topparent]:
            topparent = candidate

    if link:
        # Kept from the original. This rewrites link attributes in place and
        # does not alter the text returned below.
        topparent.make_links_absolute(link)

    return topparent.text_content()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment