Created
July 5, 2013 14:07
-
-
Save leonardreidy/5934770 to your computer and use it in GitHub Desktop.
Parse an HTML file with Beautiful Soup, find emails and names, and output them as JSON, ready for ponymailer.rb. Emails are taken from links whose href starts with mailto, and names from inside <strong> tags. The program builds a single list that pairs each email with a name, and then outputs it as JSON, ready for ponymailer to send.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A simple Python script to extract names and emails from
# a certain online directory
import os, json | |
from bs4 import BeautifulSoup | |
# Names of every entry in the current working directory, captured once
# when the module is loaded.
inputfiles = os.listdir(os.getcwd())
def postproc(inputfiles):
    """Run ``preproc`` on every entry of *inputfiles*.

    The entry at zero-based position ``k`` is written to ``out<k>.txt``.

    Args:
        inputfiles: Iterable of input file paths to process, in order.
    """
    # enumerate() hands each entry its own position. list.index() would
    # rescan the list on every iteration and return the first match, so
    # repeated entries would all be written to the same output file.
    for position, infile in enumerate(inputfiles):
        preproc(infile, "out" + str(position) + ".txt")
def preproc(infile, outfile):
    """Extract (email, name) pairs from an HTML file and write them as JSON.

    Emails are the text of ``<a>`` elements whose ``href`` starts with
    ``mailto``. Names are taken from the odd-positioned child nodes of the
    ``<strong>`` elements. Emails and names are paired positionally with
    ``zip``, so the shorter of the two lists determines how many pairs are
    written.

    Args:
        infile: Path of the HTML file to parse.
        outfile: Path of the file that receives the JSON text; it is
            opened in ``'w'`` mode, so existing content is replaced.
    """
    # Read as bytes so Beautiful Soup does the decoding itself, and close
    # the handle as soon as parsing is done.
    with open(infile, 'rb') as handle:
        soup = BeautifulSoup(handle)

    strongs = soup.select('strong')
    mailtos = soup.select('a[href^=mailto]')

    # One entry per child node of every <strong> element; the entry is
    # None when that child has no single string.
    prenames = [child.string for strong in strongs for child in strong]

    # Keep only odd positions. enumerate() supplies the real position,
    # whereas list.index() would report the first occurrence of a
    # repeated value and misjudge its parity.
    names = []
    for position, text in enumerate(prenames):
        if position % 2 == 0:
            continue
        if text is None or text == '\n':
            continue
        names.append(text.strip())

    emails = [
        link.string.strip() for link in mailtos if link.string is not None
    ]

    # Materialise the pairs: json.dumps cannot serialise a lazy zip object,
    # and the values stay as text because bytes are not JSON-serialisable.
    contacts = list(zip(emails, names))

    with open(outfile, 'w') as out:
        out.write(json.dumps(contacts))
# Entry point: process every directory entry captured in `inputfiles`.
postproc(inputfiles)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment