Skip to content

Instantly share code, notes, and snippets.

@thomasjensen
Created January 5, 2012 18:27
Show Gist options
  • Save thomasjensen/1566517 to your computer and use it in GitHub Desktop.
Save thomasjensen/1566517 to your computer and use it in GitHub Desktop.
Extract information from files downloaded with download.py.
from BeautifulSoup import BeautifulSoup
import os
import re
# Scrape phase: read every saved post page and collect its title, author and
# date into `data`, keyed by the file name without its ".html" suffix.

# Directory holding the downloaded pages (per the gist description these are
# produced by download.py).
path = "/Users/thomasjensen/Documents/RBloggersScrape/download"

# Keep only entries whose name contains "post<digits>.html".
listing = [name for name in os.listdir(path) if re.search(r"post\d+\.html", name)]

# The pages are opened by bare file name below, so work from inside `path`.
os.chdir(path)

# Maps post id (e.g. "post12") -> {"title": ..., "author": ..., "date": ...}.
data = {}
for page in listing:
    # Close each page as soon as it has been parsed instead of leaking one
    # open handle per file for the lifetime of the script.
    with open(page, "rb") as site:
        soup = BeautifulSoup(site)

    # Escape the dot and anchor the pattern so only the trailing ".html"
    # extension is removed (the unescaped "." matched any character).
    key = re.sub(r"\.html$", "", page)
    print(key)

    # NOTE(review): if a page lacks the expected markup, `find` presumably
    # returns None and the attribute access below raises AttributeError,
    # aborting the run as before - confirm whether skipping is preferable.
    content = soup.find("div", id="leftcontent")
    data[key] = {
        "title": content.findNext("h1").text,
        "author": content.find("div", {"class": "meta"}).findNext("a").text,
        "date": content.find("div", {"class": "date"}).text,
    }
# Output phase: write one UTF-8 encoded CSV row per scraped post with the
# columns id, date, author, title.
#
# Rows are built by plain comma-joining (no quoting), so commas and newlines
# are stripped from every text field. Previously the author was written
# untouched and the date kept its newlines, which could shift columns or
# split a row.
output_path = "/Users/thomasjensen/Documents/RBloggersScrape/output.csv"
columns = ["id", "date", "author", "title"]

# `with` guarantees the file is flushed and closed even if a write fails.
with open(output_path, "wb") as output:
    output.write((u",".join(columns) + u"\n").encode("utf-8"))

    # Sorted keys give a reproducible row order (dict order is arbitrary).
    for key in sorted(data):
        print(key)
        record = data[key]
        fields = [key]
        for column in columns[1:]:
            # Drop the two characters that would break the unquoted format.
            fields.append(re.sub(r"[,\n]", "", record[column]))
        output.write((u",".join(fields) + u"\n").encode("utf-8"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment