Created
April 24, 2013 01:23
-
-
Save macklinu/5448873 to your computer and use it in GitHub Desktop.
HTML-parsing Python scripts (they generate the output files for an installation piece)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Get a list of androgynous names and write it to a .txt file.
#
# Every page in `urls` is downloaded, the text of each <a> tag that has no
# href attribute and matches `pattern` is collected, normalised to
# "Capitalized" form, de-duplicated and written one name per line to
# `filename`.
#
# requires BeautifulSoup library

# imports
from bs4 import BeautifulSoup
import contextlib
import io
import sys
import os
import re

try:
    import urllib2
    urlopen = urllib2.urlopen
except ImportError:
    # Python 3: urllib2 was folded into urllib.request
    from urllib.request import urlopen

# global variables
names = []
urls = [
    "http://www.20000-names.com/androgynous_names_unisex_names.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_02.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_03.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_04.htm",
    "http://www.20000-names.com/androgynous_names_unisex_names_05.htm",
]
filename = "givennames.txt"
# a candidate name consists solely of upper-case A-Z letters and digits
pattern = re.compile(r'^[A-Z\d]+$')

# find each name in the html files
for url in urls:
    # close the HTTP response as soon as the body has been read
    with contextlib.closing(urlopen(url)) as page:
        html = page.read()
    # NOTE(review): no parser is named, so bs4 uses whichever one is
    # installed; consider pinning one for reproducible results.
    soup = BeautifulSoup(html)
    # href=False selects the <a> tags that carry no href attribute
    for name in soup.find_all('a', href=False):
        text = name.get_text()
        if pattern.match(text):
            names.append(text.lower().capitalize())

# Write the list to a file, with each given name on a new line.
# The file is opened in "w" mode only after every page was fetched, so an
# existing givennames.txt is replaced on success and left untouched if a
# download fails. The encoding is explicit instead of relying on a patched
# interpreter-wide default, and the names are sorted so the output is
# deterministic.
with io.open(filename, "w", encoding="utf-8") as f:
    for item in sorted(set(names)):
        f.write(u"%s\n" % item)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Get the IMDB Top 250 and write it to a .txt file.
#
# The chart page is downloaded, the text of every <a> tag inside the second
# <table> of the document is collected, and the titles are written one per
# line to `filename`.
#
# requires BeautifulSoup library

# imports
from bs4 import BeautifulSoup
import contextlib
import io
import sys
import os

try:
    import urllib2
    urlopen = urllib2.urlopen
except ImportError:
    # Python 3: urllib2 was folded into urllib.request
    from urllib.request import urlopen

# global variables
url = "http://www.imdb.com/chart/top"
filename = "movies.txt"

# download the IMDB page into memory and close the HTTP response right away
with contextlib.closing(urlopen(url)) as page:
    html = page.read()
# NOTE(review): no parser is named, so bs4 uses whichever one is installed;
# consider pinning one for reproducible results.
soup = BeautifulSoup(html)

# find each movie in the html file
# NOTE(review): the chart is presumably the second <table> on the page; an
# IndexError here means the page layout no longer matches that assumption.
chart = soup.find_all("table")[1]
movies = [entry.get_text() for entry in chart.find_all('a')]

# Write the list to a file, with each movie on a new line.
# "w" mode replaces any existing movies.txt, so no separate delete step is
# needed; the encoding is explicit instead of relying on a patched
# interpreter-wide default.
with io.open(filename, "w", encoding="utf-8") as f:
    for item in movies:
        f.write(u"%s\n" % item)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Get a list of surnames and write it to a .txt file.
#
# The page is downloaded, the text of each <td> cell that matches `pattern`
# is collected, normalised to "Capitalized" form, and written one surname
# per line to `filename`.
#
# requires BeautifulSoup library

# imports
from bs4 import BeautifulSoup
import contextlib
import io
import sys
import os
import re

try:
    import urllib2
    urlopen = urllib2.urlopen
except ImportError:
    # Python 3: urllib2 was folded into urllib.request
    from urllib.request import urlopen

# global variables
surnames = []
url = "http://names.mongabay.com/most_common_surnames.htm"
filename = "surnames.txt"
# a candidate surname consists solely of upper-case A-Z letters
pattern = re.compile(r'^[A-Z]+$')

# download the page into memory and close the HTTP response right away
with contextlib.closing(urlopen(url)) as page:
    html = page.read()
# NOTE(review): no parser is named, so bs4 uses whichever one is installed;
# consider pinning one for reproducible results.
soup = BeautifulSoup(html)

# find each name in the html file
for name in soup.find_all('td'):
    text = name.get_text()
    if pattern.match(text):
        surnames.append(text.lower().capitalize())

# Write the list to a file, with each surname on a new line.
# The file is opened in "w" mode only after the page was fetched, so an
# existing surnames.txt is replaced on success and left untouched if the
# download fails. The encoding is explicit instead of relying on a patched
# interpreter-wide default.
with io.open(filename, "w", encoding="utf-8") as f:
    for item in surnames:
        f.write(u"%s\n" % item)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Get a list of universities and write it to a .txt file.
#
# The page is downloaded, the text of every <a class="institution"> tag is
# collected, and the names are written one per line to `filename`.
#
# requires BeautifulSoup library

# imports
from bs4 import BeautifulSoup
import contextlib
import io
import sys
import os

try:
    import urllib2
    urlopen = urllib2.urlopen
except ImportError:
    # Python 3: urllib2 was folded into urllib.request
    from urllib.request import urlopen

# global variables
url = "http://www.utexas.edu/world/univ/alpha/"
filename = "universities.txt"

# download the UT page into memory and close the HTTP response right away
with contextlib.closing(urlopen(url)) as page:
    html = page.read()
# NOTE(review): no parser is named, so bs4 uses whichever one is installed;
# consider pinning one for reproducible results.
soup = BeautifulSoup(html)

# find each university in the html file
universities = [
    uni.get_text() for uni in soup.find_all('a', {'class': 'institution'})
]

# Write the list to a file, with each university on a new line.
# "w" mode replaces any existing universities.txt, so no separate delete
# step is needed; the encoding is explicit instead of relying on a patched
# interpreter-wide default.
with io.open(filename, "w", encoding="utf-8") as f:
    for item in universities:
        f.write(u"%s\n" % item)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment