scrape thread info using mechanize in python
# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
C:\Users\chengwang6\Desktop\WinPython-64bit-2.7.6.4\settings\.spyder2\.temp.py
"""
import mechanize
import cookielib
from bs4 import BeautifulSoup
from random import randint
from time import sleep
import csv
def screen_login():
    # log into the forum and return a cookie-enabled mechanize browser
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    # browser settings
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # br.set_debug_http(True)
    # spoof a regular browser User-Agent
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Chrome/17.0.963.56 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Open the login page and submit the login form
    br.open('http://members.lovingfromadistance.com/login.php?do=login')
    br.select_form(nr=0)  # find the login form
    br['vb_login_username'] = 'your_name'  # set the form values
    br['vb_login_password'] = 'your_pwd'
    br.submit()  # submit the form
    return br
# collect thread URLs and author profile links from the forum index pages
def get_urls(total_page_num):
    # uses the logged-in global browser `br`
    urls = []
    profiles = []
    web_url = 'http://members.lovingfromadistance.com/forumdisplay.php?18-Closing-the-Distance/page'
    for page_num in range(1, total_page_num + 1):
        browser = br.open(web_url + str(page_num))
        # sleep briefly between requests
        sleep(0.01 * randint(1, 7))
        print page_num
        # browser = br.open('http://members.lovingfromadistance.com/forumdisplay.php?18-Closing-the-Distance/page1')
        # parse the index page
        soup = BeautifulSoup(browser)
        titles = soup.find_all('a', {'class': 'title'})
        users = soup.find_all('a', {'class': 'siteicon_profile'})
        for i in range(len(titles)):
            title = titles[i]['href']  # .split('&')[0]
            user = users[i]['href']
            title = 'http://members.lovingfromadistance.com/' + title
            profiles.append(user)
            urls.append(title)
            print title
    return urls, profiles
br = screen_login()
thread_urls, user_profiles_all = get_urls(41)
user_profiles = list(set(user_profiles_all))  # remove duplicate users
#seq_id = user_profiles_all.index(u'member.php?557-agentholli')
#thread_urls[seq_id]
#http://members.lovingfromadistance.com/forumdisplay.php?18-Closing-the-Distance/page34
#http://members.lovingfromadistance.com/showthread.php?11305-Moving-from-England-to-Republic-Of-Ireland
# 27 people are guests or have changed their names!
# [1] "9152-BabyDimples" "0-Lina" "0-NaNi" "6409-kaylamcquaig" "3018-FadedSunrise" "11103-Schlafmütze" "782-rsvpnj" "0-Madge"
# [9] "0-Brandie" "0-books" "0-Mio" "8211-Tooki" "15461-Krissy12512" "10382-staystrong12" "9594-LoveL" "0-BJL_Sweetheart1109"
# [17] "557-agentholli" "0-Èternity" "11883-jlcsgirl" "9132-rach92g" "0-Bluestars" "8953-lademoiselle" "1468-Élan" "664-Caitlin2009"
# [25] "8871-MeganK"
""""
'''Thread info'''
"""
## bigger collection:
cp1252 = {
    # cp1252 byte -> intended unicode character
    # (trailing comment = UTF-8 byte sequence of that character)
    u"\x80": u"\u20AC",  # e282ac
    u"\x81": u"\uFFFD",  # efbfbd
    u"\x82": u"\u201A",  # e2809a
    u"\x83": u"\u0192",  # c692
    u"\x84": u"\u201E",  # e2809e
    u"\x85": u"\u2026",  # e280a6
    u"\x86": u"\u2020",  # e280a0
    u"\x87": u"\u2021",  # e280a1
    u"\x88": u"\u02C6",  # cb86
    u"\x89": u"\u2030",  # e280b0
    u"\x8a": u"\u0160",  # c5a0
    u"\x8b": u"\u2039",  # e280b9
    u"\x8c": u"\u0152",  # c592
    u"\x8d": u"\uFFFD",  # efbfbd
    u"\x8e": u"\u017D",  # c5bd
    u"\x8f": u"\uFFFD",  # efbfbd
    u"\x90": u"\uFFFD",  # efbfbd
    u"\x91": u"\u2018",  # e28098
    u"\x92": u"\u2019",  # e28099
    u"\x93": u"\u201C",  # e2809c
    u"\x94": u"\u201D",  # e2809d
    u"\x95": u"\u2022",  # e280a2
    u"\x96": u"\u2013",  # e28093
    u"\x97": u"\u2014",  # e28094
    u"\x98": u"\u02DC",  # cb9c
    u"\x99": u"\u2122",  # e284a2
    u"\x9a": u"\u0161",  # c5a1
    u"\x9b": u"\u203A",  # e280ba
    u"\x9c": u"\u0153",  # c593
    u"\x9d": u"\uFFFD",  # efbfbd
    u"\x9e": u"\u017E",  # c5be
    u"\x9f": u"\u0178",  # c5b8
    u"\xa0": u"\u00A0",  # c2a0
    u"\xa1": u"\u00A1",  # c2a1
    u"\xa2": u"\u00A2",  # c2a2
    u"\xa3": u"\u00A3",  # c2a3
    u"\xa4": u"\u00A4",  # c2a4
    u"\xa5": u"\u00A5",  # c2a5
    u"\xa6": u"\u00A6",  # c2a6
    u"\xa7": u"\u00A7",  # c2a7
    u"\xa8": u"\u00A8",  # c2a8
    u"\xa9": u"\u00A9",  # c2a9
    u"\xaa": u"\u00AA",  # c2aa
    u"\xab": u"\u00AB",  # c2ab
    u"\xac": u"\u00AC",  # c2ac
    u"\xad": u"\u00AD",  # c2ad
    u"\xae": u"\u00AE",  # c2ae
    u"\xaf": u"\u00AF",  # c2af
    u"\xb0": u"\u00B0",  # c2b0
    u"\xb1": u"\u00B1",  # c2b1
    u"\xb2": u"\u00B2",  # c2b2
    u"\xb3": u"\u00B3",  # c2b3
    u"\xb4": u"\u00B4",  # c2b4
    u"\xb5": u"\u00B5",  # c2b5
    u"\xb6": u"\u00B6",  # c2b6
    u"\xb7": u"\u00B7",  # c2b7
    u"\xb8": u"\u00B8",  # c2b8
    u"\xb9": u"\u00B9",  # c2b9
    u"\xba": u"\u00BA",  # c2ba
    u"\xbb": u"\u00BB",  # c2bb
    u"\xbc": u"\u00BC",  # c2bc
    u"\xbd": u"\u00BD",  # c2bd
    u"\xbe": u"\u00BE",  # c2be
    u"\xbf": u"\u00BF",  # c2bf
    u"\xc0": u"\u00C0",  # c380
    u"\xc1": u"\u00C1",  # c381
    u"\xc2": u"\u00C2",  # c382
    u"\xc3": u"\u00C3",  # c383
    u"\xc4": u"\u00C4",  # c384
    u"\xc5": u"\u00C5",  # c385
    u"\xc6": u"\u00C6",  # c386
    u"\xc7": u"\u00C7",  # c387
    u"\xc8": u"\u00C8",  # c388
    u"\xc9": u"\u00C9",  # c389
    u"\xca": u"\u00CA",  # c38a
    u"\xcb": u"\u00CB",  # c38b
    u"\xcc": u"\u00CC",  # c38c
    u"\xcd": u"\u00CD",  # c38d
    u"\xce": u"\u00CE",  # c38e
    u"\xcf": u"\u00CF",  # c38f
    u"\xd0": u"\u00D0",  # c390
    u"\xd1": u"\u00D1",  # c391
    u"\xd2": u"\u00D2",  # c392
    u"\xd3": u"\u00D3",  # c393
    u"\xd4": u"\u00D4",  # c394
    u"\xd5": u"\u00D5",  # c395
    u"\xd6": u"\u00D6",  # c396
    u"\xd7": u"\u00D7",  # c397
    u"\xd8": u"\u00D8",  # c398
    u"\xd9": u"\u00D9",  # c399
    u"\xda": u"\u00DA",  # c39a
    u"\xdb": u"\u00DB",  # c39b
    u"\xdc": u"\u00DC",  # c39c
    u"\xdd": u"\u00DD",  # c39d
    u"\xde": u"\u00DE",  # c39e
    u"\xdf": u"\u00DF",  # c39f
    u"\xe0": u"\u00E0",  # c3a0
    u"\xe1": u"\u00E1",  # c3a1
    u"\xe2": u"\u00E2",  # c3a2
    u"\xe3": u"\u00E3",  # c3a3
    u"\xe4": u"\u00E4",  # c3a4
    u"\xe5": u"\u00E5",  # c3a5
    u"\xe6": u"\u00E6",  # c3a6
    u"\xe7": u"\u00E7",  # c3a7
    u"\xe8": u"\u00E8",  # c3a8
    u"\xe9": u"\u00E9",  # c3a9
    u"\xea": u"\u00EA",  # c3aa
    u"\xeb": u"\u00EB",  # c3ab
    u"\xec": u"\u00EC",  # c3ac
    u"\xed": u"\u00ED",  # c3ad
    u"\xee": u"\u00EE",  # c3ae
    u"\xef": u"\u00EF",  # c3af
    u"\xf0": u"\u00F0",  # c3b0
    u"\xf1": u"\u00F1",  # c3b1
    u"\xf2": u"\u00F2",  # c3b2
    u"\xf3": u"\u00F3",  # c3b3
    u"\xf4": u"\u00F4",  # c3b4
    u"\xf5": u"\u00F5",  # c3b5
    u"\xf6": u"\u00F6",  # c3b6
    u"\xf7": u"\u00F7",  # c3b7
    u"\xf8": u"\u00F8",  # c3b8
    u"\xf9": u"\u00F9",  # c3b9
    u"\xfa": u"\u00FA",  # c3ba
    u"\xfb": u"\u00FB",  # c3bb
    u"\xfc": u"\u00FC",  # c3bc
    u"\xfd": u"\u00FD",  # c3bd
    u"\xfe": u"\u00FE",  # c3be
    u"\xff": u"\u00FF",  # c3bf
}
import re

def killgremlins(text):
    # map cp1252 gremlins to real unicode characters
    if re.search(u"[\x80-\xff]", text):
        def fixup(m):
            s = m.group(0)
            return cp1252.get(s, s)
        if isinstance(text, type("")):
            # make sure we have a unicode string
            text = unicode(text, "iso-8859-1")
        text = re.sub(u"[\x80-\xff]", fixup, text)
    return text
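# Illustrative check (not part of the original script): a byte string holding
# cp1252 "smart quotes" (0x93/0x94) should come back as proper unicode curly
# quotes after killgremlins(). The sample string below is hypothetical.
# sample = 'he said \x93hi\x94'
# print repr(killgremlins(sample))   # -> u'he said \u201chi\u201d'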
def clean_repliers(participants):
    persons = []
    for person in participants:
        person = person['href'].replace('member.php?', '')
        persons.append(person)
    return persons
def remove_img_tags(data):
    # strip <img .../> smiley tags from a post and collect their titles as emotions
    p = re.compile(r'<img.*?/>')
    img = re.findall(p, data)
    emotions = []
    for i in img:
        emotion = i.split('title=')[1].split('/')[0]
        emotions.append(emotion)
    emotions = ','.join(emotions)
    return p.sub('', data), emotions
#test2 = u'I love you<img alt="" border="0" class="inlineimg" src="images/smilies/smile.png" title="Smile"/>I hate bad men <img alt="" border="0" class="inlineimg" src="images/smilies/mad.png" title="Mad"/>'
#a = remove_img_tags(test2)
def parse_webpage(url):
    # fetch a thread page and extract author, dates, repliers, emoticons and text
    browser = br.open(url)
    # sleep briefly between requests
    sleep(0.01 * randint(1, 7))
    # browser = br.open(thread_urls[670])
    soup = BeautifulSoup(browser)
    # thread body
    thread = soup.find('blockquote').contents
    thread = ''.join(unicode(tag).replace('<br/>', '') for tag in thread)
    thread = thread.strip()
    thread = thread.replace(u'\uf04a', '?')
    thread = thread.replace(u'\uf04c', '?')
    thread = thread.replace(u'\u2665', 'love')
    thread = thread.replace('\r', '').replace('\n', '')
    thread = thread.replace('<i>', '').replace('</i>', '')
    thread = thread.replace('<b>', '').replace('</b>', '')
    thread = killgremlins(thread)
    thread, thread_emotion = remove_img_tags(thread)
    thread_url = url
    # dates, reply count, author and repliers
    datetime = soup.find_all('span', {'class': 'date'})
    reply_num = len(datetime) - 1
    thread_date = datetime[0].contents[0]
    thread_time = datetime[0].contents[1].contents[0].encode('utf-8')
    participants = soup.find_all('a', {'class': 'siteicon_profile'})
    author_type = soup.find('span', {'class': 'usertitle'}).contents[0].strip()
    if author_type == 'Guest':
        thread_author = 'guest'
        repliers = clean_repliers(participants)
        repliers = ','.join(unicode(tag).replace('<br/>', ' ') for tag in repliers).strip()
    else:
        thread_author = participants[0]['href'].replace('member.php?', '')
        repliers = clean_repliers(participants[1:])
        repliers = ','.join(unicode(tag).replace('<br/>', ' ') for tag in repliers)
    return (thread_author, thread_url, thread_date, thread_time, reply_num, repliers, thread_emotion, thread)
#<span class="username guest">prinzeza87</span>
br = screen_login()
# save one tab-separated record per thread
thread_save = open("C:/Users/chengwang6/Desktop/WinPython-64bit-2.7.6.4/thread_info.csv", 'wb')
seq_num = 0
for url in thread_urls:
    print seq_num, round(float(seq_num)/820, 3), url
    result = parse_webpage(url)
    # result = parse_webpage(thread_urls[330])
    line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % result
    print >>thread_save, line.encode('utf-8')  # encode unicode fields before writing
    seq_num += 1
thread_save.close()
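# Illustrative sketch (assumption, not part of the original run): since the
# output above is tab-separated with no quoting, the imported csv module can
# read it back in. The path simply mirrors the one used when writing.
# with open("C:/Users/chengwang6/Desktop/WinPython-64bit-2.7.6.4/thread_info.csv", "rb") as f:
#     rows = list(csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE))
#     # each row: author, url, date, time, reply_num, repliers, emotions, text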