Last active
August 29, 2015 14:07
-
-
Save t0mst0ne/c4b8f7fe64a0b3bf0056 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: UTF-8
"""Scrape auction listings for one court from aomp.judicial.gov.tw into a CSV.

The script walks the site's query pages in order (WHD2A00 -> WHD2A01 ->
WHD2A02 -> WHD2A03), carrying the form fields each step expects, then pages
through the WHD2A03 result list and writes every row to ``<COURT>_<date>.csv``.

The code is written to run on both Python 2.7 and Python 3.
"""
import datetime
import json
import re

import pandas as pd
import requests

BASE_URL = 'http://aomp.judicial.gov.tw/abbs/wkw/'
COURT = 'HLD'
PROPERTY_TYPE = 'C51'  # C52: buildings, C51: land
SALE_TYPE = '1'
PAGE_SIZE = 15  # rows per result page; matches data['pageSize'] below
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled server hangs forever

# Output columns, in the order the row dictionaries are filled.
COLUMNS = ['num', 'item', 'auction_date', 'times', 'county',
           'address', 'area', 'price', 'court', 'check']

# NOTE(review): these headers were captured from a browser session. The Cookie
# and Content-Length values are specific to that session/request and are
# presumably stale -- confirm whether the site still accepts them.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '64',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'JSESSIONID=0000Cm83BTiaoUqt0riqIACFS3X:-1; NCES_JSESSIONID=2OcKWTq9NErn2OnflYqMI1lxHMjNy3VFQbPNbXl6oxlrMTGggVhXDo+9xFPb/bgCY3ux9UN5Mh19sEZ4ckezH993dWNfB1ByvogzG4hb/kG5pGi7QP06lQ==; NCIS_EntryControl=zWYFYZmim4JS9N2i2y3EyXX8ggf3xpL6VnqM+n0WuiKRbWYwrqcuEsI57ILG/vVntLNaZNCYass4sTI4UYkVuA==',
    'DNT': '1',
    'Host': 'aomp.judicial.gov.tw',
    'Origin': 'http://aomp.judicial.gov.tw',
    'Referer': 'http://aomp.judicial.gov.tw/abbs/wkw/WHD2A00.jsp',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36',
}

# Base form fields sent with every query. (The original literal listed
# 'order' twice with the same value; the duplicate is dropped.)
data = {
    'pageTotal': '100',
    'pageSize': '15',
    'rowStart': '16',
    'order': 'odcrm',
    'query_typeX': 'session',
    'saleno': '',
    'hsimun': 'all',
    'ctmd': 'all',
    'sec': 'all',
    'crmyy': '',
    'crmid': '',
    'crmno': '',
    'dpt': '',
    'saledate1': '',
    'saledate2': '',
    'minprice1': '',
    'minprice2': '',
    'sumprice1': '',
    'sumprice2': '',
    'area1': '',
    'area2': '',
    'registeno': '',
    'checkyn': 'all',
    'emptyyn': 'all',
    'owner1': '',
    'landkd': '',
    'comm_yn': '',
}

# Patterns are compiled once. Raw strings avoid invalid-escape warnings; the
# one pattern containing non-ASCII text uses a u'' literal with escaped
# backslashes so it stays a text pattern on Python 2 as well.
NCCHARSET_RE = re.compile(r'name="nccharset" value="(.+?)"')
HIDDEN_FIELD_RE = re.compile(r'"hidden" name="(.+?)" value="(.+?)">')
TOTAL_CASES_RE = re.compile(u'<td>\\s+\u5408\u8a08\u4ef6\u6578: (\\d+?) \u4ef6')
NUM_RE = re.compile(r'<div align="center">(\d+?)</div>')
ITEM_RE = re.compile(r'<td width="15%">(.+?)<br>')
AUCTION_RE = re.compile(r'<div align="center">(\d+\/\d+\/\d+?)<br>\s+(.+?)\s+</div>')
COUNTY_RE = re.compile(r'<div align="center">(.+)<br>\s+(.+)?<')
ADDRESS_RE = re.compile(r'blank">\s+(.+)\s+</a>\s+<br>\s+(\d+)\s+(.+)\s+<BR>(.+)\r\n')
PRICE_RE = re.compile(r'<td>\s+(.+?)\s+</td>')
CHECK_RE = re.compile(r'6%">\s+<div align="center">(.+?)</div>')


def _post(page, **kwargs):
    """POST to ``page`` under BASE_URL with the shared timeout applied."""
    return requests.post(BASE_URL + page, timeout=REQUEST_TIMEOUT, **kwargs)


def count_pages(total_cases, page_size=PAGE_SIZE):
    """Return how many result pages are needed to show ``total_cases`` rows."""
    # Ceiling division using integer arithmetic only, so the result is an int
    # on both Python 2 and Python 3.
    return (total_cases + page_size - 1) // page_size


def parse_hidden_fields(html_text):
    """Return the page's hidden form inputs as a ``{name: value}`` dict.

    Name and value are captured by a single pattern so the two can never get
    out of step with each other.
    """
    return dict(HIDDEN_FIELD_RE.findall(html_text))


def parse_total_cases(html_text):
    """Return the total case count shown on a result page.

    Raises:
        RuntimeError: if the total cannot be found in ``html_text``.
    """
    matches = TOTAL_CASES_RE.findall(html_text)
    if not matches:
        raise RuntimeError('could not find the total case count in the response')
    return int(matches[0])


def parse_listing_page(html_text, court):
    """Extract the listing rows from one result page.

    Each column is pulled out with its own pattern and the columns are joined
    by position; the number of ``num`` matches decides how many rows there are.

    Args:
        html_text: decoded HTML of a WHD2A03.jsp result page.
        court: court code stored in every row's ``court`` field.

    Returns:
        A list of dicts keyed by the names in ``COLUMNS``.

    Raises:
        IndexError: if any column matched fewer times than ``num`` did.
    """
    nums = NUM_RE.findall(html_text)
    items = ITEM_RE.findall(html_text)
    auctions = AUCTION_RE.findall(html_text)
    counties = COUNTY_RE.findall(html_text)
    addresses = ADDRESS_RE.findall(html_text)
    prices = PRICE_RE.findall(html_text)
    checks = CHECK_RE.findall(html_text)

    rows = []
    for i, num in enumerate(nums):
        address = addresses[i]
        rows.append({
            'num': num,
            'item': items[i],
            'auction_date': auctions[i][0],
            'times': auctions[i][1],
            # The county matches are read one position ahead of the other
            # columns, as in the original script -- presumably the pattern's
            # first match is a non-data cell; TODO confirm against the page.
            'county': counties[i + 1][1],
            'address': address[0],
            'area': address[1] + address[2] + address[3],
            'price': prices[i],
            'court': court,
            'check': checks[i],
        })
    return rows


def main():
    """Run the query sequence, collect every result page and write the CSV."""
    form = dict(data)  # work on a copy; leave the module-level defaults alone

    # Step 1: entry page; it supplies the 'nccharset' form value.
    entry_page = _post('WHD2A00.jsp')
    # findall() returns a list; requests sends one form value per element.
    form['nccharset'] = NCCHARSET_RE.findall(entry_page.text)
    form['court'] = COURT

    # Step 2: submit the court.
    court_page = _post('WHD2A01.jsp', data=form, headers=headers)
    form['nccharset'] = NCCHARSET_RE.findall(court_page.text)
    form['court'] = COURT
    form['proptype'] = PROPERTY_TYPE
    form['saletype'] = SALE_TYPE

    # Step 3: submit the property and sale type.
    type_page = _post('WHD2A02.jsp', data=form, headers=headers)
    # WHD2A03.jsp is sent the same three settings under X-suffixed names.
    form['courtX'] = COURT
    form['proptypeX'] = PROPERTY_TYPE
    form['saletypeX'] = SALE_TYPE
    form.update(parse_hidden_fields(type_page.text))

    # Step 4: first result page, used only to learn the total row count.
    first_page = _post('WHD2A03.jsp', verify=False, headers=headers, data=form)
    total_cases = parse_total_cases(first_page.text)
    page_count = count_pages(total_cases)
    print('%s %s' % (total_cases, page_count))

    rows = []
    for page_now in range(1, page_count + 1):
        form['pageNow'] = page_now
        page = _post('WHD2A03.jsp', verify=False, headers=headers, data=form)
        rows.extend(parse_listing_page(page.text, form['court']))

    # Build the frame once instead of appending one row at a time.
    listings = pd.DataFrame(rows, columns=COLUMNS)
    datestr = datetime.date.today().strftime('%Y%m%d')
    # Rows are kept as text and encoded once here, at the output boundary.
    listings.to_csv('%s_%s.csv' % (COURT, datestr), encoding='utf-8')


if __name__ == '__main__':
    main()
res = requests.post('http://aomp.judicial.gov.tw/abbs/wkw/WHD2A02.jsp', data={'proptype':'C51', 'saletype':'1'}, headers=headers)
ks = re.findall('"hidden" name="." value="(.+?)">', res.text)
vs = re.findall('"hidden" name="(.+?)" value=".">', res.text)
d ={ k:v for k,v in zip(ks,vs) }
data.update(d)
Use `headers` as your headers and `data` as your form data.
(1) Minor update
ks = re.findall('"hidden" name="(.+?)" value=".+?">', res.text)
vs = re.findall('"hidden" name=".+?" value="(.+?)">', res.text)
(2) The form-data field names differ between WHD2A02.jsp and WHD2A03.jsp:
courtX, proptypeX,saletypeX in WHD2A03.jsp
court,saletype, proptype in WHD2A02.jsp
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
'BCBC9440416E9EF00BDB80F199411869':'4E552F2AA089FD3A045C1D5A350C3275'
=> It seems to change when the browser is closed.