Created
June 2, 2014 04:58
-
-
Save chengjun/d19bdce3de3250165b78 to your computer and use it in GitHub Desktop.
scrape thread info using mechanize in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
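# -*- coding: utf-8 -*-
r"""
Scrape thread info (author, URL, date, time, reply count, repliers, smilies
and text) from the "Closing the Distance" board of
members.lovingfromadistance.com using mechanize and BeautifulSoup.

Originally written in the Spyder editor as the temporary script file
C:\Users\chengwang6\Desktop\WinPython-64bit-2.7.6.4\settings\.spyder2\.temp.py
"""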
import mechanize | |
import cookielib | |
from bs4 import BeautifulSoup | |
from random import randint | |
from time import sleep | |
import csv | |
def screen_login(username='your_name', password='your_pwd'):
    """Log in to the forum and return the mechanize browser used to do so.

    Args:
        username: Value submitted in the ``vb_login_username`` form field.
        password: Value submitted in the ``vb_login_password`` form field.

    Returns:
        The ``mechanize.Browser`` instance after the login form has been
        submitted. The response to the submission is not inspected, so a
        rejected login is not detected here.
    """
    br = mechanize.Browser()
    # Keep the session cookies set by the login for all later requests.
    br.set_cookiejar(cookielib.LWPCookieJar())

    # Browser behaviour: honour http-equiv headers, redirects and referers,
    # but do not consult robots.txt.
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Present a regular desktop browser User-Agent.
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Chrome/17.0.963.56 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    # Follow refreshes of 0 but do not hang on refresh > 0.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # NOTE: credentials are supplied by the caller. Never keep real ones in
    # the source, not even in commented-out code.
    # NOTE(review): the login is posted over plain http; switch the URL to
    # https if the site supports it.
    br.open('http://members.lovingfromadistance.com/login.php?do=login')
    br.select_form(nr=0)  # the login form is the first form on the page
    br['vb_login_username'] = username
    br['vb_login_password'] = password
    br.submit()
    return br
# read data | |
def get_urls(total_page_num):
    """Collect thread URLs and author profile links from the board index.

    Index pages 1..total_page_num are fetched through the module-level
    browser ``br``, with a random 10-70 ms pause after each request. The page
    number and every thread URL found are printed as progress output.

    Args:
        total_page_num: Number of index pages to walk. Zero or a negative
            number fetches nothing.

    Returns:
        A ``(urls, profiles)`` tuple of two parallel lists: the absolute URL
        of each thread and the ``href`` of the profile link paired with it.
    """
    site_root = 'http://members.lovingfromadistance.com/'
    page_url = site_root + 'forumdisplay.php?18-Closing-the-Distance/page'
    urls = []
    profiles = []
    for page_num in range(1, total_page_num + 1):
        response = br.open(page_url + str(page_num))
        sleep(0.01 * randint(1, 7))
        print(page_num)
        soup = BeautifulSoup(response)
        # attrs must be a dict; the former set literal {'class', 'title'} made
        # BeautifulSoup match class "class" OR class "title".
        titles = soup.find_all('a', {'class': 'title'})
        users = soup.find_all('a', {'class': 'siteicon_profile'})
        # NOTE(review): titles and users are paired purely by position. If a
        # page lists fewer profile links than titles this raises IndexError,
        # and any thread without a profile link shifts the pairing - confirm
        # against the page markup.
        for i, title_tag in enumerate(titles):
            thread_url = site_root + title_tag['href']
            profiles.append(users[i]['href'])
            urls.append(thread_url)
            print(thread_url)
    return urls, profiles
# Log in once, then walk the 41 index pages of the board.
br = screen_login()
thread_and_user = get_urls(41)
thread_urls, user_profiles_all = thread_and_user
# Collapse repeated authors; going through set() does not keep their order.
user_profiles = list(set(user_profiles_all))
#seq_id = user_profiles_all.index(u'member.php?557-agentholli') | |
#thread_urls[seq_id] | |
#http://members.lovingfromadistance.com/forumdisplay.php?18-Closing-the-Distance/page34 | |
#http://members.lovingfromadistance.com/showthread.php?11305-Moving-from-England-to-Republic-Of-Ireland | |
# 27 people are guests or change their names! | |
# [1] "9152-BabyDimples" "0-Lina" "0-NaNi" "6409-kaylamcquaig" "3018-FadedSunrise" "11103-Schlafmütze" "782-rsvpnj" "0-Madge" | |
# [9] "0-Brandie" "0-books" "0-Mio" "8211-Tooki" "15461-Krissy12512" "10382-staystrong12" "9594-LoveL" "0-BJL_Sweetheart1109" | |
# [17] "557-agentholli" "0-Èternity" "11883-jlcsgirl" "9132-rach92g" "0-Bluestars" "8953-lademoiselle" "1468-Élan" "664-Caitlin2009" | |
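# [25] "8871-MeganK"

# ---------------------------------------------------------------------------
# Thread info
# ---------------------------------------------------------------------------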
## bigger collection: | |
# Maps every code point U+0080..U+00FF to the character that the same byte
# value denotes in Windows-1252. Only 0x80-0x9F actually differ from Latin-1
# (e.g. U+0080 -> EURO SIGN, U+0092 -> RIGHT SINGLE QUOTATION MARK); the five
# bytes Windows-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to
# U+FFFD REPLACEMENT CHARACTER, and 0xA0-0xFF map to themselves.
# The table is derived from the standard cp1252 codec instead of being typed
# out by hand; bytearray is used so the expression yields text strings on
# both Python 2 and Python 3.
cp1252 = {
    bytearray([byte]).decode("latin-1"): bytearray([byte]).decode("cp1252", "replace")
    for byte in range(0x80, 0x100)
}
import re | |
def killgremlins(text):
    """Replace cp1252 "gremlins" in *text* with the real unicode characters.

    Characters in the range \\x80-\\xff are looked up in the module-level
    ``cp1252`` table; characters missing from the table are kept as they
    are. Text without any such character is returned untouched. A byte
    string that contains them is first decoded as iso-8859-1.
    """
    gremlins = u"[\x80-\xff]"
    if re.search(gremlins, text) is None:
        return text
    if isinstance(text, bytes):
        # make sure we have a unicode string before substituting
        text = text.decode("iso-8859-1")
    return re.sub(
        gremlins,
        lambda found: cp1252.get(found.group(0), found.group(0)),
        text,
    )
def clean_repliers(participants):
    """Return the ``href`` of each profile link without its 'member.php?' part.

    Args:
        participants: Iterable of items supporting ``item['href']`` (the
            caller passes the profile ``<a>`` tags found on a thread page).

    Returns:
        A list with one string per participant, in input order, e.g.
        'member.php?557-agentholli' becomes '557-agentholli'.
    """
    return [person['href'].replace('member.php?', '') for person in participants]
# Matches one self-closing <img .../> tag (non-greedy, single line).
_IMG_TAG_RE = re.compile(r'<img.*?/>')
# Captures the value of the title attribute together with its quotes.
_IMG_TITLE_RE = re.compile(r'''\stitle=("[^"]*"|'[^']*')''')


def remove_img_tags(data):
    """Strip ``<img .../>`` tags from *data* and collect their titles.

    Args:
        data: HTML text that may contain self-closing ``<img>`` tags.

    Returns:
        A ``(text, emotions)`` tuple: *data* with every ``<img .../>`` tag
        removed, and the comma-joined title attribute values of those tags.
        Each title keeps its surrounding quotes (``"Smile","Mad"``), which is
        the format this function has always produced. Images without a title
        attribute contribute nothing; previously they raised IndexError.
    """
    emotions = []
    for img_tag in _IMG_TAG_RE.findall(data):
        title = _IMG_TITLE_RE.search(img_tag)
        if title is not None:
            emotions.append(title.group(1))
    return _IMG_TAG_RE.sub('', data), ','.join(emotions)
#test2 = u'I love you<img alt="" border="0" class="inlineimg" src="images/smilies/smile.png" title="Smile"/>I hate bad men <img alt="" border="0" class="inlineimg" src="images/smilies/mad.png" title="Mad"/>' | |
#a = remove_img_tags(test2) | |
def _clean_post_html(nodes):
    """Flatten a post's child nodes into (one-line text, smilie titles)."""
    text = ''.join(unicode(node).replace('<br/>', '') for node in nodes)
    text = text.strip()
    # Two private-use glyphs become '?', the heart symbol becomes 'love'.
    for private_use_char in (u'\uf04a', u'\uf04c'):
        text = text.replace(private_use_char, '?')
    text = text.replace(u'\u2665', 'love')
    # Drop line breaks first, then italic/bold markup (same order as before).
    for unwanted in ('\r', '\n', '<i>', '</i>', '<b>', '</b>'):
        text = text.replace(unwanted, '')
    text = killgremlins(text)
    return remove_img_tags(text)


def parse_webpage(url):
    """Fetch one thread page and extract its summary fields.

    The page is opened through the module-level browser ``br``, followed by a
    random 10-70 ms pause.

    Args:
        url: Absolute URL of the thread page.

    Returns:
        An 8-tuple ``(thread_author, thread_url, thread_date, thread_time,
        reply_num, repliers, thread_emotion, thread)`` where

        * ``thread_author`` is 'guest' when the first user title on the page
          is 'Guest', otherwise the first profile link with 'member.php?'
          removed;
        * ``thread_date`` / ``thread_time`` come from the first
          ``<span class="date">`` (the time is UTF-8 encoded);
        * ``reply_num`` is the number of date spans on the page minus one;
        * ``repliers`` is the comma-joined list of the remaining profile links;
        * ``thread_emotion`` / ``thread`` are the smilie titles and cleaned
          text of the first ``<blockquote>`` on the page.

    Raises:
        ValueError: If the page has no ``<blockquote>`` element.
    """
    response = br.open(url)
    sleep(0.01 * randint(1, 7))
    soup = BeautifulSoup(response)

    first_post = soup.find('blockquote')
    if first_post is None:
        # Previously surfaced as an opaque AttributeError on None.
        raise ValueError('no <blockquote> post body found at %s' % url)
    thread, thread_emotion = _clean_post_html(first_post.contents)
    thread_url = url

    # attrs must be dicts; the former set literals ({'class', 'date'}) made
    # BeautifulSoup match class "class" as well.
    date_spans = soup.find_all('span', {'class': 'date'})
    # NOTE(review): only the fetched page is counted - presumably threads
    # spanning several pages are under-counted; confirm.
    reply_num = len(date_spans) - 1
    first_date = date_spans[0]
    thread_date = first_date.contents[0]
    thread_time = first_date.contents[1].contents[0].encode('utf-8')

    participants = soup.find_all('a', {'class': 'siteicon_profile'})
    author_type = soup.find('span', {'class': 'usertitle'}).contents[0].strip()
    if author_type == 'Guest':
        # No profile link for the author here: every link is treated as a replier.
        thread_author = 'guest'
        replier_links = participants
    else:
        thread_author = participants[0]['href'].replace('member.php?', '')
        replier_links = participants[1:]
    # clean_repliers already returns plain href strings, so the old per-item
    # unicode()/'<br/>' replacement was a no-op and is dropped.
    repliers = ','.join(clean_repliers(replier_links))
    return (thread_author, thread_url, thread_date, thread_time, reply_num, repliers, thread_emotion, thread)
#<span class="username guest">prinzeza87</span> | |
# Log in again for a fresh session, then write one tab-separated line per
# thread (fields in the order returned by parse_webpage).
br = screen_login()
thread_save_path = "C:/Users/chengwang6/Desktop/WinPython-64bit-2.7.6.4/thread_info.csv"
total_threads = len(thread_urls)  # replaces the hard-coded 820 in the progress ratio
# 'with' closes the file even when a page fails to parse half-way through.
with open(thread_save_path, 'wb') as thread_save:
    for seq_num, url in enumerate(thread_urls):
        print("%s %s %s" % (seq_num, round(float(seq_num) / total_threads, 3), url))
        result = parse_webpage(url)
        line = u"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % result
        # Encode explicitly: printing unicode to a file opened with open()
        # falls back to ASCII and fails on the first non-ASCII character.
        thread_save.write(line.encode("utf-8"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment