Skip to content

Instantly share code, notes, and snippets.

@gelendir
Forked from ruel/infb.py
Created November 27, 2010 18:35
Show Gist options
  • Save gelendir/718149 to your computer and use it in GitHub Desktop.
Save gelendir/718149 to your computer and use it in GitHub Desktop.
A script to scrape information from your Facebook friends
/*
Ruel Pagayon (c) 2010 - [email protected]
Cascading Style Sheet for InFB Log Output.
*/

/* Page defaults: light text on a dark grey background. */
body {
    background-color: #3C3C3C;
    color: #FFF;
    margin-top: 50px;
    margin-left: 25px;
    font-size: xx-small;
    /* "sans-serif" is the generic family keyword; a bare "sans" is not one. */
    font-family: Calibri, Arial, sans-serif;
}

/* Wrapper <div class="rby"> around the results table. */
.rby {
    text-align: center;
    font-size: xx-small;
}

table {
    text-align: center;
}

/* Table cells: padded and left-aligned. */
td {
    padding-top: 0.5em;
    padding-bottom: 0.5em;
    padding-left: 1em;
    padding-right: 1em;
    text-align: left;
    font-size: small;
}

/* Cell written with class "num" (the friend's numeric id). */
td.num {
    color: #CCC;
}

/* Cell written with class "cnum" (the phone number). */
td.cnum {
    color: #AFAFAF;
}

/* Links: bold white, underlined only while hovered. */
a:active, a:visited, a:link {
    color: #FFF;
    font-weight: bold;
    text-decoration: none;
}

a:hover {
    color: #FFF;
    font-weight: bold;
    text-decoration: underline;
}
#!/usr/bin/python
#
# InFB - Information Facebook
# Usage: infb.py [email protected] password
# http://ruel.me
#
# Copyright (c) 2010, Ruel Pagayon - [email protected]
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of ruel.me nor the names of its contributors
# may be used to endorse or promote products derived from this
# script without specific prior written permission.
#
# THIS SCRIPT IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL RUEL PAGAYON BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SCRIPT, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys, re, urllib, urllib2, cookielib, HTMLParser, getpass
class FormScraper(HTMLParser.HTMLParser):
    """
    Scrapes the Facebook login page for form values that need to be submitted on login.
    Necessary because the form values change each time the login page is loaded.

    Only hidden <input> elements located inside the <form id="login_form">
    element are collected. Each one is stored in ``values`` as a
    ``(name, value)`` tuple, in document order. Inputs without a ``name`` are
    skipped; a missing or empty ``value`` is stored as an empty string.

    Usage:
    form_scraper = FormScraper()
    form_scraper.feed(html_from_facebook)
    form_values = form_scraper.values
    """

    def __init__(self, *args, **kwargs):
        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        # True while the parser is between <form id="login_form"> and </form>
        self.in_form = False
        # Collected (name, value) pairs of the hidden inputs
        self.values = []

    def handle_starttag(self, tag, attrs):
        """Track entry into the login form and record its hidden inputs."""
        tag = tag.lower()
        attrs = dict(attrs)
        # Use .get() throughout: other forms on the page may have no id, and
        # inputs may lack type/name/value, which must not abort the parse.
        if tag == 'form':
            if attrs.get('id') == 'login_form':
                self.in_form = True
        elif self.in_form and tag == 'input':
            # Attribute values are case-insensitive in HTML; a bare attribute
            # (no "=value") is reported as None by the parser.
            input_type = (attrs.get('type') or '').lower()
            name = attrs.get('name')
            if input_type == 'hidden' and name:
                self.values.append((name, attrs.get('value') or ''))

    def handle_endtag(self, tag):
        """Leave the login form when its closing tag is reached."""
        if tag.lower() == 'form' and self.in_form:
            self.in_form = False
def main():
if len(sys.argv) < 2:
usage()
user = sys.argv[1]
if len(sys.argv) < 3:
passw = getpass.getpass("Enter password: ")
else:
passw = sys.argv[2]
# Set needed modules
CHandler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
browser = urllib2.build_opener(CHandler)
browser.addheaders = [('User-agent', 'InFB - [email protected] - http://ruel.me')]
urllib2.install_opener(browser)
#Retrieve login form data and initialize the cookies
print 'Initializing..'
res = browser.open('https://www.facebook.com/login.php')
#Determine string encoding
content_type = res.info()['Content-Type'].split('; ')
encoding = 'utf-8'
if len(content_type) > 1 and content_type[1].startswith('charset'):
encoding = content_type[1].split('=')[1]
html = unicode( res.read(), encoding=encoding )
res.close()
#scrape form for hidden inputs, add email and password to values
form_scraper = FormScraper()
form_scraper.feed(html)
form_data = form_scraper.values
form_data.extend( [('email', user), ('pass', passw)] )
#HACK: urlencode doesn't like strings that aren't encoded with the 'encode' function.
#Using html.encode(encoding) doesn't help either. why ??
form_data = [ ( x.encode(encoding), y.encode(encoding) ) for x,y in form_data ]
data = urllib.urlencode(form_data)
# Login
print 'Logging in to account ' + user
res = browser.open('https://login.facebook.com/login.php?login_attempt=1', data)
rcode = res.code
print rcode
print res.url
if not re.search('home\.php$', res.url):
print 'Login Failed'
exit(2)
res.close()
# Get Emails and Phone Numbers
print "Getting Info..\n"
flog = open(user + '.html', 'a')
flog.write("<html>\n\t<head>\n\t\t<title>InFB - " + user + "</title>\n\t\t<link href=\"infb.css\" rel=\"stylesheet\" type=\"text/css\" />\n\t</head>\n\t<body>\n\t\t<div class=\"rby\">\n\t\t\t<table class=\"flist\">\n\t\t\t\t")
page = 0
while True:
res = browser.open('http://m.facebook.com/friends.php?a&f=' + str(page))
parp = res.read()
m = re.findall('"\/friends\.php\?id=([0-9]+)&', parp)
res.close()
for i in m:
prof = 'http://m.facebook.com/profile.php?id=' + i + '&v=info'
res = browser.open(prof)
cont = res.read()
res.close()
prof = prof.replace('m.', 'www.')
ms = re.search('<div id="body"><div><div>(.*?)<\/div>', cont)
if ms:
name = ms.group(1)
else:
continue
ms = re.search('href="tel:(.*?)"', cont)
if ms:
tel = ms.group(1)
else:
tel = ''
ms = re.search('Emails?:<\/div><\/td><td valign="top"><div>(.*?)<\/div>', cont)
if ms:
email = re.sub('<br \/>', ', ', ms.group(1)).replace('&#64;', '@')
else:
continue
print name + ' : ' + email + ' ' + tel
flog.write("<tr class=\"lbreak\">\n\t\t\t\t\t<td class=\"num\">" + i + "</td><td class=\"fname\"><a href=\"" + prof + "\" title=\"" + name + "\">" + name + "</a></td><td class=\"fmail\">" + email + "</td></td><td class=\"cnum\">" + tel + "</td>\n\t\t\t\t\t</tr>\n\t\t\t\t")
if re.search('Next', parp):
page += 10
else:
break
flog.write("\n\t\t\t</table>\n\t\t</div>\n\t</body>\n</html>")
flog.close()
def usage():
    """
    Write the command-line synopsis to stderr and exit with status 1.

    The message is an error report for a bad invocation, so it goes to stderr
    rather than stdout.
    """
    sys.stderr.write('Usage: ' + sys.argv[0] + ' [email protected] [password]\n')
    sys.exit(1)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment