Created
July 28, 2010 21:43
-
-
Save axiak/496433 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
contactinfo.py is a simple script to get contact information from
Facebook.

To use, first install mechanize [1]. Then use the friendstocsv application [2]
on Facebook to export a CSV file with all of your friends. Be sure
to include at least the profile URL.

Afterwards, edit this file to put your email, password, and user agent
string at the top.

Then, run it as follows:

    $ python contactinfo.py <INPUT> <OUTPUT>

Where INPUT and OUTPUT are either filenames or '-'. If they are '-', the
script will use the standard input and the standard output. So the
following is acceptable:

    $ python contactinfo.py - - < friendstocsvoutput.csv > myinfo.csv

1: http://pypi.python.org/pypi/mechanize/
2: http://apps.facebook.com/friendstocsv/
"""
# Script version as a (major, minor, patch) tuple.
__VERSION__ = (0, 0, 1)

# Login credentials submitted to the Facebook login form. These are
# placeholders: edit them before running the script.
FB_EMAIL = "[email protected]"
FB_PASSWORD = ""

# User-Agent header value installed on the mechanize browser.
FB_USERAGENT = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.38 Safari/533.4"
import csv
import os
import random
import re
import sys
import time

import mechanize
# Matches a <br> tag in any spelling (<br>, <br/>, <BR />, ...). The word
# boundary keeps it from matching other tags whose name merely starts with
# "br", and HTML tag names are case-insensitive, hence IGNORECASE.
br_re = re.compile(r"<br\b[^>]*>", re.IGNORECASE)

# Matches any single HTML/XML tag; used to strip markup from extracted text.
html_strip = re.compile(r"<[^>]+>")
def debug(s):
    """Write a diagnostic message to standard error, followed by a newline.

    Args:
        s: The message. Any object is accepted; it is rendered with ``%s``.
    """
    # Wrap the argument in a 1-tuple so that a tuple message is printed as-is
    # instead of being unpacked as several %-format arguments (TypeError).
    sys.stderr.write("%s\n" % (s,))
class PersonRecord(dict):
    """Dictionary describing one person, pre-populated with every known field.

    Every name in ``fields`` is present from construction on, with the empty
    string as its default value, so a record can always be turned into a
    complete CSV row. Keys outside ``fields`` may be stored as well, but they
    are ignored by ``to_row``.
    """

    # Column order used by to_row().
    fields = (
        'uid',
        'last_name',
        'first_name',
        'name',
        'birthday_date',
        'hometown_location',
        'state',
        'country',
        'zip',
        'profile_url',
        'emails',
        'phone',
        'aim',
        'skype',
        'yahoo',
        'address',
        'website',
    )

    def __init__(self, **kwargs):
        """Create a record with all fields blank, then apply ``kwargs``.

        Args:
            **kwargs: Initial values, keyed by field name.
        """
        super(PersonRecord, self).__init__()
        for field in PersonRecord.fields:
            self[field] = ''
        self.update(kwargs)

    def to_row(self):
        """Return the record's values as a list, ordered like ``fields``.

        A real list is returned (rather than a ``map`` object) so the row can
        be indexed, measured and iterated more than once on Python 3 too.
        """
        return [self[field] for field in PersonRecord.fields]
def main():
    """Read the friends CSV, fetch each person's contact info, write the result.

    ``sys.argv[1]`` names the input CSV ('-' for standard input). The output
    destination is resolved by ``open_csv_output``.

    Exits with status 1 and a usage message when no input argument is given.
    """
    # Validate the command line before logging in, so a usage mistake does
    # not cost a network round trip and then die with an IndexError.
    if len(sys.argv) < 2:
        debug("Usage: %s <INPUT> [<OUTPUT>]" % sys.argv[0])
        sys.exit(1)

    browser = facebook_login()
    outcsv, outfile = open_csv_output()
    try:
        for person in read_csv(sys.argv[1]):
            contact_info(browser, person)
            outcsv.writerow(person.to_row())
            # Flush per row so finished rows survive a later failure.
            outfile.flush()
    finally:
        # Close a real output file even on error, but leave stdout open.
        if outfile is not sys.stdout:
            outfile.close()
def facebook_login():
    """Submit the Facebook login form and return the browser that was used.

    The e-mail address and password are taken from the ``FB_EMAIL`` and
    ``FB_PASSWORD`` environment variables when those are set, and from the
    module-level constants of the same names otherwise, so the password does
    not have to be stored in this file.

    Returns:
        The ``mechanize.Browser`` instance that submitted the login form.

    Exits the process with status 2, after logging the reason to stderr, when
    the response page contains a "standard_error" element.
    """
    br = mechanize.Browser()
    br.open("http://www.facebook.com/")
    br.select_form(nr=0)
    # NOTE(review): these headers are installed after the first page has been
    # fetched, so that first request goes out with mechanize's defaults.
    # Order kept as in the original; confirm whether that is intended.
    br.addheaders = [
        ("User-agent", FB_USERAGENT),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ("Accept-Language", "en-us,en;q=0.5"),
        ("Accept-Charset", "ISO-8859-1,utf-8;q=0.5"),
    ]
    br["email"] = os.environ.get("FB_EMAIL", FB_EMAIL)
    br["pass"] = os.environ.get("FB_PASSWORD", FB_PASSWORD)
    response = br.submit()
    content = response.read()
    # On Python 3 the body may arrive as bytes; the substring test and the
    # regex below need text. On Python 2 bytes is str, so this is a no-op.
    if isinstance(content, bytes) and not isinstance(content, str):
        content = content.decode("utf-8", "replace")

    if '"standard_error"' in content:
        m = re.search(r'id="standard_error">(.+?)</h2>', content)
        if m:
            # Turn <br> into newlines, then drop any remaining markup.
            msg = html_strip.sub('', br_re.sub('\n', m.group(1)))
        else:
            msg = "Invalid username and or password."
        debug("ERROR: %s" % msg)
        sys.exit(2)
    return br
def random_wait(mean=1):
    """Sleep for an exponentially distributed random amount of time.

    Args:
        mean: Average sleep duration in seconds. A value of zero or less
            means "do not wait" and returns immediately, instead of raising
            ZeroDivisionError or requesting a negative sleep.
    """
    if mean <= 0:
        return
    # expovariate takes the rate (1 / mean), not the mean itself.
    time.sleep(random.expovariate(1 / float(mean)))
def read_csv(arg):
    """Yield one ``PersonRecord`` per data row of a CSV file.

    The first row is taken as the header; each later row is stored in a fresh
    ``PersonRecord`` under the corresponding header names. Cells beyond the
    last header column are ignored. An input with no rows yields nothing.

    Args:
        arg: Path of the CSV file to read, or '-' to read standard input.

    Yields:
        PersonRecord: The record built from each data row.
    """
    source = sys.stdin if arg == '-' else open(arg, 'r')
    try:
        reader = csv.reader(source)
        # next() with a default works on Python 2.6+ and 3, and keeps an empty
        # input from leaking StopIteration out of this generator.
        header = next(reader, None)
        if header is None:
            return
        for row in reader:
            person = PersonRecord()
            # zip stops at the shorter sequence, so a row with more cells than
            # the header cannot raise a lookup error.
            person.update(zip(header, row))
            yield person
    finally:
        # Runs on exhaustion, on error and when the generator is closed
        # early; standard input is left open for the caller.
        if source is not sys.stdin:
            source.close()
def open_csv_output():
    """Open the CSV destination and write the header row.

    The destination is ``sys.argv[2]``; standard output is used when that
    argument is absent or is '-'.

    Returns:
        A ``(writer, stream)`` pair: the ``csv.writer`` (quoting every field)
        and the underlying file object it writes to.
    """
    to_stdout = len(sys.argv) <= 2 or sys.argv[2] == '-'
    if to_stdout:
        sink = sys.stdout
    else:
        sink = open(sys.argv[2], 'w')

    writer = csv.writer(sink, quoting=csv.QUOTE_ALL)
    # The header row is simply the field names, in column order.
    writer.writerow(PersonRecord.fields)
    return writer, sink
def contact_info(browser, person):
    """Download a person's info page and merge the parsed data into ``person``.

    Args:
        browser: Object whose ``open(url)`` returns a response with a
            ``read()`` method (the browser returned by ``facebook_login``).
        person: Mapping with a 'profile_url' entry; updated in place with
            whatever ``get_data`` extracts from the page.

    A person without a profile URL is skipped with a log message. The download
    is attempted up to 10 times, waiting a random interval after each failure;
    if every attempt fails the process exits with status 3.
    """
    url = person['profile_url']
    if not url:
        # Nothing to download; retrying ten times and then aborting the whole
        # export because of one incomplete row would be pointless.
        debug("INFO: No profile URL for %s, skipping." % (person.get('name', ''),))
        return

    # Ask for the "info" tab, respecting any existing query string.
    url += ('&v=info' if '?' in url else '?v=info')

    random_wait()
    debug("INFO: Downloading %s" % url)

    for _ in range(10):
        try:
            response = browser.open(url)
        except Exception as e:
            # Retry boundary: any download failure is logged and retried.
            debug("ERROR: Browser download error %s" % e)
            debug("INFO: Waiting to redownload...")
            random_wait()
        else:
            break
    else:
        # Reached only when no attempt succeeded (the loop never hit break).
        debug("ERROR: Was unable to download from facebook too many times.")
        sys.exit(3)

    person.update(get_data(response.read()))
# Label/data cell pairs. The pattern expects quotes and slashes in the page
# text to be backslash-escaped (\" and \/). Compiled once at import time
# rather than on every call.
_PIECES_RE = re.compile(r"class=\\\"label\\\">(.+?)<\\\/th.*?class=\\\"data\\\">(.+?)<\\\/td>")

# Page labels (lower-cased, trailing colon removed) whose PersonRecord field
# name differs from the label itself.
_LABEL_TRANSFORM = {
    'mobile number': 'phone',
    'email': 'emails',
    'contact info': 'emails',
}


def get_data(info):
    """Extract known contact fields from the text of an info page.

    Args:
        info: Page content, as text or as bytes (bytes are decoded as UTF-8
            with undecodable bytes replaced).

    Returns:
        dict: Maps ``PersonRecord`` field names to the extracted values, with
        ``<br>`` tags turned into '|' and all other markup removed. Labels
        that do not correspond to a ``PersonRecord`` field are dropped; when
        several labels map to the same field, the last one wins.
    """
    # On Python 3 a response body may be bytes, which a text pattern cannot
    # search. On Python 2 bytes is str, so this branch is never taken.
    if isinstance(info, bytes) and not isinstance(info, str):
        info = info.decode("utf-8", "replace")

    new_info = {}
    for match in _PIECES_RE.finditer(info):
        label, data = match.groups()
        label = label.rstrip(':').strip().lower()
        label = _LABEL_TRANSFORM.get(label, label)
        data = html_strip.sub('', br_re.sub('|', data))
        if label not in PersonRecord.fields:
            continue
        if label == 'emails' and data.startswith('Email:'):
            # Drop the "Email:" caption that precedes the address.
            data = data[len('Email:'):]
        new_info[label] = data
    return new_info
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment