Created
July 5, 2013 05:42
-
-
Save leonardreidy/5932185 to your computer and use it in GitHub Desktop.
Simple script to extract the administrator names and school names of contacts from the HTML pages of a certain online directory.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A simple program that extracts administrator names and school names from
# the HTML files of an online directory, then writes one output file for
# the list of names and one for the list of schools, serializing each list
# with json.dumps() to produce simple JSON output.
def extractor(infile, outfile1, outfile2):
    """Extract administrator and school names from one directory page.

    Parses the HTML in ``infile``, collects text from its ``<strong>``
    elements and writes two JSON arrays: the administrator names to
    ``outfile1`` and the school names to ``outfile2``.

    Args:
        infile: Path of the HTML file to read.
        outfile1: Path that receives the JSON list of administrator names.
        outfile2: Path that receives the JSON list of school names.
    """
    # Function-scope import so this block does not rely on a module-level one.
    import json

    # NOTE(review): BeautifulSoup must be importable at module level
    # (e.g. ``from bs4 import BeautifulSoup``) -- confirm which package is used.
    # The document is parsed inside the ``with`` so the input handle is
    # always closed, even if parsing raises.
    with open(infile, 'r') as source:
        soup = BeautifulSoup(source)
    strong_tags = soup('strong')

    # Administrator names: text of the children of every odd-positioned
    # <strong> element. enumerate() supplies the true position; list.index()
    # would return the first *equal* tag and so mis-number repeated elements.
    names = []
    for position, tag in enumerate(strong_tags):
        if position % 2 == 0:
            continue
        for child in tag:
            if child.string is not None and child != '\n':
                # Kept as text (not UTF-8 bytes) so json.dumps can serialize
                # it on Python 3 as well as Python 2.
                names.append(child.string.strip())

    # School names: every <strong> holding a single string, except the
    # "More" link labels.
    schools = [
        tag.string.strip()
        for tag in strong_tags
        if tag.string is not None and tag.string != "More"
    ]

    with open(outfile1, 'w') as names_out:
        names_out.write(json.dumps(names))
    with open(outfile2, 'w') as schools_out:
        schools_out.write(json.dumps(schools))
def main(filelist):
    """Run ``extractor`` on every HTML file in ``filelist``.

    The file at 1-based position ``n`` produces ``p<n>-names.txt`` and
    ``p<n>-schools.txt`` in the current working directory.

    Args:
        filelist: Sequence of paths to the HTML pages to process.
    """
    # enumerate() numbers each entry by its position; list.index() would give
    # a repeated path the number of its first occurrence and overwrite that
    # page's output files.
    for page_number, path in enumerate(filelist, 1):
        extractor(
            path,
            "p" + str(page_number) + "-names.txt",
            "p" + str(page_number) + "-schools.txt",
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment