@ljos · Created February 9, 2013 13:00

from os import listdir
from os.path import basename, join
from re import match, search
from time import mktime, strptime
from tldextract import extract
from collections import defaultdict
from HTMLParser import HTMLParser


class FindAllAs(HTMLParser):
    """Collect outbound links from <a> tags, normalized to their
    registered domain; filtered-out hrefs are kept in removedLinks."""

    def __init__(self, siteURL):
        HTMLParser.__init__(self)
        self.siteURL = siteURL
        self.links = defaultdict(int)  # domain -> occurrence count
        self.removedLinks = set()      # hrefs rejected by the filters below

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        href = None
        for (attr, val) in attrs:
            if attr == 'href':
                href = val
                break
        if not href:
            return
        # Drop platform chrome (Blogger/Wordpress/Gravatar), hosted media,
        # hrefs containing whitespace, relative archive paths, links back
        # to the site itself (siteURL is treated as a regex pattern),
        # javascript pseudo-links, and anything without a valid suffix.
        if (search(r'www\.blogger', href) or
                match(r'http://wordpress\.com', href) or
                match(r'http://automattic\.com/', href) or
                match(r'https?://gravatar', href) or
                search(r'\d\.bp\.blogspot', href) or
                search(r'v\d\.cache\d\.googlevideo', href) or
                search(r'\s', href) or
                match(r'/\d{4}/\d\d/', href) or
                search(self.siteURL, href) or
                search('javascript', href) or
                not extract(href)[-1]):
            self.removedLinks.add(href)
        else:
            # Normalize to subdomain.domain.suffix, skipping empty parts.
            href = '.'.join(filter(None, extract(href)))
            self.links[href] += 1
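
# A minimal usage sketch for the class above, with hypothetical hrefs
# (assumes tldextract is installed so extract() can resolve suffixes):
#
#   parser = FindAllAs('myblog.blogspot.com')
#   parser.feed(u'<a href="http://example.com/post/1">a post</a>')
#   parser.links           # defaultdict(int, {'example.com': 1})
#   parser.feed(u'<a href="javascript:void(0)">widget</a>')
#   parser.removedLinks    # set(['javascript:void(0)'])
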
def generatePages(sitefile):
    """Yield one dict per scraped page found in sitefile."""
    with open(sitefile, 'r') as content:
        pageURL = ''
        scrapeTime = 0
        pageHTML = []
        previous = ''
        for line in content:
            if match('#http', line):
                # A new page marker: emit the page collected so far.
                if pageURL:
                    yield {'url': pageURL,
                           'scrapeTime': scrapeTime,
                           'html': ' '.join(pageHTML)}
                    pageHTML = []
                pageURL = line[1:].strip()
            elif match('#http', previous):
                # The line directly after a marker is the scrape timestamp.
                t = strptime(line.strip(), '%Y%m%d-%H%M%S')
                scrapeTime = int(mktime(t))
            else:
                pageHTML.append(line)
            previous = line
        # Emit the final page; no marker follows it.
        if pageURL:
            yield {'url': pageURL,
                   'scrapeTime': scrapeTime,
                   'html': ' '.join(pageHTML)}
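
# The scrape-file layout assumed by generatePages, inferred from the
# parsing above rather than documented anywhere: each page opens with a
# '#<url>' marker line, followed by a 'YYYYMMDD-HHMMSS' timestamp line,
# then the raw HTML until the next marker. A hypothetical example:
#
#   #http://myblog.blogspot.com/2013/01/some-post.html
#   20130115-093042
#   <html><head>...</head>
#   <body>...</body></html>
#   #http://myblog.blogspot.com/2013/02/next-post.html
#   ...
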
def siteToDict(sitefile):
    """Build a dict describing one site: its URL, outbound link counts,
    and a lazy generator over its scraped pages."""
    # The site URL is the file name minus its .html extension.
    siteURL = search(r'(.*?)\.html', basename(sitefile)).group(1)
    allLinks = FindAllAs(siteURL)
    with open(sitefile, 'r') as content:
        for line in content:
            allLinks.feed(unicode(line, 'utf-8'))
    return {'url': siteURL,
            'links': allLinks.links,
            'removedLinks': allLinks.removedLinks,
            'pages': generatePages(sitefile)}
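
# A sketch of siteToDict on a single scrape file (hypothetical path):
#
#   site = siteToDict('blog/myblog.blogspot.com.html')
#   site['url']                    # 'myblog.blogspot.com'
#   sorted(site['links'].items())  # (domain, count) pairs
#   for page in site['pages']:     # pages are parsed lazily on demand
#       print page['scrapeTime'], page['url']
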
if __name__ == '__main__':
    folder = 'blog'
    for f in listdir(folder):
        if search(r'\.html$', f):
            site = siteToDict(join(folder, f))
            for page in site['pages']:
                print page['html']