# encoding: utf-8
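u"""Scrape today's game-currency deal prices for 桃园将星录 from 5173.com.

Walks the latest-deals listing page by page, collects each server's deal
prices for today, and prints/appends a GBK-encoded report (deal count,
lowest price, highest price per server) to result.txt.
"""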
import re
import urllib2
import httplib
from datetime import datetime, date, timedelta

from bs4 import BeautifulSoup

prices_of_deals = {}  # server : price_list
today = date.today()
yesterday = today - timedelta(days=1)

# Report label for each aggregate applied to a server's price list.
titles_of_funcs = {
    len: u'游戏币交易量',  # "game currency trade volume"
    min: u'游戏币低价位',  # "game currency lowest price"
    max: u'游戏币高价位',  # "game currency highest price"
}

region_of_servers = {}

# Group tag prepended to a server's name in the report.
prefix_of_servers = {
    u'黄巾起义': u'【桃】',
    u'桃园结义': u'【桃】',
    u'天下无双': u'【桃】',
    u'五关六将': u'【桃】',
    u'锦囊妙计': u'【桃】',
    u'三顾茅庐': u'【三足】',
    u'千里单骑': u'【桃】',
    u'赤壁之战': u'【三足】',
    u'赤兔宝马': u'【三足】',
    u'卧龙凤雏': u'【三足】',
    u'奇门遁甲': u'【三足】',
    u'对酒当歌': u'【三足】',
    u'草船借箭': u'【三足】',
    u'舌战群儒': u'【三足】',
    u'方天画戟': u'【倚】',
    u'倚天青釭': u'【倚】',
    u'羽扇纶巾': u'【倚】',
    u'丈八蛇矛': u'【倚】',
    u'青龙偃月': u'【倚】',
    u'三足鼎立': u'【三足】',
    u'群雄逐鹿': u'【三足】',
    u'月明星稀': u'【月】',
    u'戎马一生': u'【月】',
    u'江山如画': u'【江山】',
    u'洛神赋': u'【江山】',
    u'白马义从': u'【义从】',
}

orders = (u'黄巾起义',
          u'桃园结义',
          u'天下无双',
          u'五关六将',
          u'三顾茅庐',
          u'赤壁之战',
          u'锦囊妙计',
          u'千里单骑',
          u'赤兔宝马',
          u'卧龙凤雏',
          u'奇门遁甲',
          u'对酒当歌',
          u'草船借箭',
          u'舌战群儒',
          u'方天画戟',
          u'倚天青釭',
          u'羽扇纶巾',
          u'丈八蛇矛',
          u'青龙偃月',
          u'三足鼎立',
          u'群雄逐鹿',
          u'月明星稀',
          u'江山如画',
          u'洛神赋',
          u'戎马一生',
          u'白马义从',
          u'白马跃溪',
          u'龙胆银枪',
          u'乱世之战',
          u'青春飞扬',
          u'斗转星移',
          u'武魂降临',
          u'梦回三国',
          )

def print_and_save_to_file(prices_dict, filename='result.txt'):
    with open(filename, 'a') as f:
        def p(l):
            # Print to the console and append a GBK-encoded copy to the file.
            print l
            f.write(l.encode('GBK'))
            f.write('\n')

        # Divider line ("I am a gorgeous divider") stamped with the current time.
        l = u'------我是华丽丽的分割线---%s---\n' % datetime.now()
        p(l)
        for server in orders:
            if server not in prices_dict:
                l = u'服务器 %s 找不到\n\n' % server  # "server %s not found"
                p(l)
                continue
            prices = prices_dict[server]
            region = region_of_servers[server]
            prefix = prefix_of_servers.get(server, u'')
            for func in (len, min, max):
                # Report a fixed 18:00 timestamp for today's snapshot.
                time = today.strftime('%Y/%m/%d 18:00')
                l = u'%s %s%s "%s" %s %s' % \
                    (region, prefix, server, time, titles_of_funcs[func], func(prices))
                p(l)
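
# A report line looks like this (region name and price are illustrative,
# not real output):
#   电信一区 【桃】黄巾起义 "2013/07/25 18:00" 游戏币低价位 1.23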

def get_soup(url):
    t1 = datetime.now()
    print 'Loading:', url
    try:
        html = urllib2.urlopen(url).read()
    except httplib.IncompleteRead as e:
        # Keep whatever was received; a partial page is still parseable.
        print 'IncompleteRead exception, parsing the partially loaded page.'
        html = e.partial
    print 'Took %s to load.' % (datetime.now() - t1,)
    # 5173.com serves GBK-encoded pages.
    return BeautifulSoup(html.decode('GBK', errors='replace'))

def append_price_for_servers(region, servers_str, price_node, prices_dict):
    price = float(price_node.string.split(u'元')[0])  # e.g. u'1.23元/金' -> 1.23
    # A listing may cover several servers, separated by '/'.
    for s in servers_str.split('/'):
        region_of_servers[s] = region
        prices_dict.setdefault(s, []).append(price)
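
# Sketch of the expected inputs (region name is hypothetical):
#   append_price_for_servers(u'电信一区', u'黄巾起义/桃园结义', price_node, d)
# appends the deal price to the price lists of both servers and records
# u'电信一区' as their region.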

def extract_prices_of_deals(url):
    """Return True while this page still contains today's deals;
    return False once yesterday's records are reached."""
    soup = get_soup(url)
    date_nodes = soup.find_all(class_='li5')
    todays_record_found = False
    for node in date_nodes:
        if node.li:
            date_str = node.li.string.split(' ')[0]
            item_date = datetime.strptime(date_str, '%Y-%m-%d').date()
            if item_date != today:
                # Listings are sorted by time, so everything after an
                # older record is older too; stop here.
                print '%s record found.' % item_date
                return todays_record_found
            todays_record_found = True
        price_node = node.parent.find(text=re.compile(u'^[0-9]+\\.[0-9]+元/金'))
        if price_node:
            region_servers_str = node.parent.find(
                'li', text=re.compile(u'^游戏')).string.split(u'桃园将星录/', 1)[1]
            region, servers_str = region_servers_str.split('/', 1)
            append_price_for_servers(region, servers_str, price_node, prices_of_deals)
    return True

def extract_deals():
    # gm= appears to identify the game on 5173.com; pg= is the page number.
    template_url = 'http://trading.5173.com/list/viewlastestdeallist.aspx?ts=&gm=7e46ecc864a745a69294d597f71a55ec&sort=LastRefresh_DESC&pg=%d'
    page = 1
    # Walk the paginated deal listing until a page no longer has today's deals.
    while extract_prices_of_deals(template_url % page):
        page += 1
    print 'Total %d page(s).' % page
    return prices_of_deals

prices_of_sales = {}  # server : price_list

def extract_prices_of_items(url):
    """Return True while the current sales page still lists items."""
    soup = get_soup(url)
    # Remove the "hot recommend" ad block so its prices are not counted.
    hot_recommend_node = soup.find(class_='hot_recommend')
    if hot_recommend_node:
        hot_recommend_node.clear()
    item_nodes = soup.find_all(class_='sin_pdlbox')
    print '%d item(s) on this page.' % len(item_nodes)
    if not item_nodes:
        return False
    for item_node in item_nodes:
        region_and_server_links = item_node.find(
            text=re.compile(u'游戏/区/服')).parent.find_all('a')
        region = region_and_server_links[1].string
        servers_str = region_and_server_links[2].string
        price_node = item_node.find(text=re.compile(u'元/金'))
        append_price_for_servers(region, servers_str, price_node, prices_of_sales)
    return True

def extract_prices_for_sales():
    template_url = 'http://s.5173.com/search/7e46ecc864a745a69294d597f71a55ec-%d.shtml?=&cate=af21c660f670427181b5487c3ab850d0&sort=itemprice_asc'
    page = 1
    while extract_prices_of_items(template_url % page):
        page += 1
    print 'Total %d page(s).' % page
    return prices_of_sales

print_and_save_to_file(extract_deals())
# print_and_save_to_file(extract_prices_for_sales())
raw_input('Press Enter to exit.')