Created
June 5, 2012 18:51
-
-
Save danlamanna/2876920 to your computer and use it in GitHub Desktop.
Scrapy crawling, pyEnchant spell checking.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import urllib
from string import strip

import enchant
from BeautifulSoup import BeautifulSoup, Comment
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

from spellchecker.items import SpellcheckItem
class SpellcheckSpider(CrawlSpider):
    """Crawl intellisites.com and report the misspelled words found on each page."""

    name = "intellisites"
    allowed_domains = ["intellisites.com"]
    start_urls = [
        "http://intellisites.com/",
    ]
    # Words accepted as-is, without consulting the spelling dictionary.
    allowed_words = ["Facebook", "LinkedIn", "BoldChat", "EXEControl"]
    rules = (
        # Follow every link matching the site's URL and hand each page to parse_item.
        Rule(
            SgmlLinkExtractor(allow='http://intellisites.com/'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build a SpellcheckItem for one crawled page.

        Called for every URL matched by ``rules``. The returned item carries
        the page URL under ``"url"`` and, under ``"invalid_words"``, the
        distinct words (in order of first appearance) that are neither in
        ``allowed_words`` nor accepted by the en_US dictionary.
        """
        item = SpellcheckItem()
        item["url"] = response.url

        # Parse the body Scrapy has already downloaded rather than issuing a
        # second, blocking HTTP request for the same URL.
        soup = BeautifulSoup(response.body)
        visible_texts = filter(self.visible, soup.findAll(text=True))

        dictionary = enchant.Dict("en_US")
        invalid_words = []
        seen = set()  # every candidate already examined, valid or not
        for text in visible_texts:
            # split() with no argument breaks on any run of whitespace, so
            # words separated by newlines or tabs are checked individually
            # and no token carries surrounding whitespace.
            for word in text.split():
                if word in seen or not self.isLegit(word):
                    continue
                seen.add(word)
                if word in self.allowed_words or dictionary.check(word):
                    continue
                invalid_words.append(word)

        item["invalid_words"] = invalid_words
        return item

    def isLegit(self, possible_word):
        """Return True if the token is worth spell checking.

        A token qualifies when it is longer than one character and consists
        only of ``\\w`` characters (letters, digits and underscore).
        """
        if len(possible_word) <= 1:
            return False
        return re.match(r"^\w+$", possible_word) is not None

    def visible(self, element):
        """Return True if the text node belongs to the page's visible content.

        Text whose parent is a style, script, head or title tag (or the
        document itself) is excluded, and so are HTML comments.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        # A type check also catches comments spanning several lines, which a
        # single-line regex over the serialised node would miss.
        return not isinstance(element, Comment)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment