Skip to content

Instantly share code, notes, and snippets.

@macklinu
Created April 24, 2013 01:23
Show Gist options
  • Select an option

  • Save macklinu/5448873 to your computer and use it in GitHub Desktop.

Select an option

Save macklinu/5448873 to your computer and use it in GitHub Desktop.
HTML parsing in Python (output files for an installation piece)
#!/usr/bin/python
# Scrape androgynous (unisex) given names from 20000-names.com and write
# them, one per line, to givennames.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import re
import urllib2

# Pages to scrape and output destination.
urls = [
    "http://www.20000-names.com/androgynous_names_unisex_names.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_02.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_03.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_04.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_05.htm",
]
filename = "givennames.txt"
# Name entries on these pages are rendered in all caps (letters/digits only).
pattern = re.compile(r'^[A-Z\d]+$')

names = []
for url in urls:
    # closing() guarantees the HTTP response is released even on error
    # (the original leaked the response object).
    with closing(urllib2.urlopen(url)) as page:
        soup = BeautifulSoup(page.read())
    # Name entries are anchors without an href; navigation links have one.
    for anchor in soup.findAll('a', href=False):
        text = anchor.get_text()
        if pattern.match(text):
            names.append(text.lower().capitalize())

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one given name per line (set() deduplicates).
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in set(names):
        f.write(u"%s\n" % item)
#!/usr/bin/python
# Scrape the IMDB Top 250 chart and write the movie titles, one per line,
# to movies.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import urllib2

url = "http://www.imdb.com/chart/top"
filename = "movies.txt"

# closing() guarantees the HTTP response is released even on error
# (the original leaked the response object).
with closing(urllib2.urlopen(url)) as page:
    soup = BeautifulSoup(page.read())

# The second <table> on the chart page holds the ranked movie list;
# each title is an anchor inside it.
movies = [entry.get_text() for entry in soup.findAll("table")[1].find_all('a')]

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one movie per line.
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in movies:
        f.write(u"%s\n" % item)
#!/usr/bin/python
# Scrape a list of common surnames from mongabay.com and write them,
# one per line, to surnames.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import re
import urllib2

url = "http://names.mongabay.com/most_common_surnames.htm"
filename = "surnames.txt"
# Surname cells on this page are rendered in all caps (letters only).
pattern = re.compile(r'^[A-Z]+$')

# closing() guarantees the HTTP response is released even on error
# (the original leaked the response object).
with closing(urllib2.urlopen(url)) as page:
    soup = BeautifulSoup(page.read())

# Surnames sit in table cells; the regex filters out non-name cells
# (counts, ranks, mixed-case text).
surnames = []
for cell in soup.findAll('td'):
    text = cell.get_text()
    if pattern.match(text):
        surnames.append(text.lower().capitalize())

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one surname per line.
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in surnames:
        f.write(u"%s\n" % item)
#!/usr/bin/python
# Scrape the UT Austin alphabetical index of universities and write the
# institution names, one per line, to universities.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import urllib2

url = "http://www.utexas.edu/world/univ/alpha/"
filename = "universities.txt"

# closing() guarantees the HTTP response is released even on error
# (the original leaked the response object).
with closing(urllib2.urlopen(url)) as page:
    soup = BeautifulSoup(page.read())

# Each institution is an anchor tagged with class="institution".
universities = [uni.get_text()
                for uni in soup.findAll('a', {'class': 'institution'})]

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one university per line.
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in universities:
        f.write(u"%s\n" % item)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment