Skip to content

Instantly share code, notes, and snippets.

@macklinu
Created April 24, 2013 01:23
Show Gist options
  • Select an option

  • Save macklinu/5448873 to your computer and use it in GitHub Desktop.

Select an option

Save macklinu/5448873 to your computer and use it in GitHub Desktop.
HTML parsing in Python (output files for an installation piece)
#!/usr/bin/python
# Scrape androgynous (unisex) given names from 20000-names.com and write
# them, one per line, to givennames.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import re
import urllib2

# Pages to scrape and output destination.
urls = [
    "http://www.20000-names.com/androgynous_names_unisex_names.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_02.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_03.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_04.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_05.htm",
]
filename = "givennames.txt"
# Name entries on these pages are rendered in all caps (letters/digits only).
pattern = re.compile(r'^[A-Z\d]+$')

names = []
for url in urls:
    # closing() guarantees the HTTP response is released even on error
    # (the original leaked the response object).
    with closing(urllib2.urlopen(url)) as page:
        soup = BeautifulSoup(page.read())
    # Name entries are anchors without an href; navigation links have one.
    for anchor in soup.findAll('a', href=False):
        text = anchor.get_text()
        if pattern.match(text):
            names.append(text.lower().capitalize())

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one given name per line (set() deduplicates).
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in set(names):
        f.write(u"%s\n" % item)
#!/usr/bin/python
# Scrape the IMDB Top 250 chart and write the movie titles, one per line,
# to movies.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import urllib2

url = "http://www.imdb.com/chart/top"
filename = "movies.txt"

# closing() guarantees the HTTP response is released even on error
# (the original leaked the response object).
with closing(urllib2.urlopen(url)) as page:
    soup = BeautifulSoup(page.read())

# The second <table> on the chart page holds the ranked movie list;
# each title is an anchor inside it.
movies = [entry.get_text() for entry in soup.findAll("table")[1].find_all('a')]

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one movie per line.
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in movies:
        f.write(u"%s\n" % item)
#!/usr/bin/python
# Scrape a list of common surnames from mongabay.com and write them,
# one per line, to surnames.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import re
import urllib2

url = "http://names.mongabay.com/most_common_surnames.htm"
filename = "surnames.txt"
# Surname cells on this page are rendered in all caps (letters only).
pattern = re.compile(r'^[A-Z]+$')

# closing() guarantees the HTTP response is released even on error
# (the original leaked the response object).
with closing(urllib2.urlopen(url)) as page:
    soup = BeautifulSoup(page.read())

# Surnames sit in table cells; the regex filters out non-name cells
# (counts, ranks, mixed-case text).
surnames = []
for cell in soup.findAll('td'):
    text = cell.get_text()
    if pattern.match(text):
        surnames.append(text.lower().capitalize())

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one surname per line.
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in surnames:
        f.write(u"%s\n" % item)
#!/usr/bin/python
# Scrape the UT Austin alphabetical index of universities and write the
# institution names, one per line, to universities.txt.
# Requires the BeautifulSoup (bs4) library; written for Python 2 (urllib2).
from bs4 import BeautifulSoup
from contextlib import closing
import codecs
import urllib2

url = "http://www.utexas.edu/world/univ/alpha/"
filename = "universities.txt"

# closing() guarantees the HTTP response is released even on error
# (the original leaked the response object).
with closing(urllib2.urlopen(url)) as page:
    soup = BeautifulSoup(page.read())

# Each institution is an anchor tagged with class="institution".
universities = [uni.get_text()
                for uni in soup.findAll('a', {'class': 'institution'})]

# Mode "w" truncates any existing file, so there is no need to os.remove()
# it first; codecs.open handles UTF-8 output without the global
# reload(sys)/sys.setdefaultencoding hack.
# Write the list to a file, one university per line.
with codecs.open(filename, "w", encoding="utf-8") as f:
    for item in universities:
        f.write(u"%s\n" % item)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment