Skip to content

Instantly share code, notes, and snippets.

@t0mst0ne
Last active August 29, 2015 14:07
Show Gist options
  • Save t0mst0ne/c4b8f7fe64a0b3bf0056 to your computer and use it in GitHub Desktop.
Save t0mst0ne/c4b8f7fe64a0b3bf0056 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#coding:UTF-8
import requests
import re
import json
import pandas as pd
import datetime
# HTTP request headers replayed from a captured browser session; they are
# passed to the WHD2A01.jsp, WHD2A02.jsp and WHD2A03.jsp form POSTs.
#
# 'Content-Length' is intentionally not listed: requests derives it from the
# encoded form body, and the body size changes as fields are added to `data`,
# so a fixed value can never be right.
#
# NOTE(review): the Cookie value is a captured session and presumably expires;
# refresh it from a live browser session if the server stops answering.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'JSESSIONID=0000Cm83BTiaoUqt0riqIACFS3X:-1; NCES_JSESSIONID=2OcKWTq9NErn2OnflYqMI1lxHMjNy3VFQbPNbXl6oxlrMTGggVhXDo+9xFPb/bgCY3ux9UN5Mh19sEZ4ckezH993dWNfB1ByvogzG4hb/kG5pGi7QP06lQ==; NCIS_EntryControl=zWYFYZmim4JS9N2i2y3EyXX8ggf3xpL6VnqM+n0WuiKRbWYwrqcuEsI57ILG/vVntLNaZNCYass4sTI4UYkVuA==',
    'DNT': '1',
    'Host': 'aomp.judicial.gov.tw',
    'Origin': 'http://aomp.judicial.gov.tw',
    'Referer': 'http://aomp.judicial.gov.tw/abbs/wkw/WHD2A00.jsp',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36',
}
# Base form fields posted to the WHD2A0x.jsp pages. Further keys (nccharset,
# court, proptype, saletype, the hidden inputs of WHD2A02.jsp, pageNow) are
# added to this dict as the script progresses.
#
# The original literal listed 'order' twice with the same value; a dict keeps
# only one entry per key, so the duplicate line was dead and has been dropped.
data = {
    'pageTotal': '100',
    'pageSize': '15',
    'rowStart': '16',
    'order': 'odcrm',
    'query_typeX': 'session',
    'saleno': '',
    'hsimun': 'all',
    'ctmd': 'all',
    'sec': 'all',
    'crmyy': '',
    'crmid': '',
    'crmno': '',
    'dpt': '',
    'saledate1': '',
    'saledate2': '',
    'minprice1': '',
    'minprice2': '',
    'sumprice1': '',
    'sumprice2': '',
    'area1': '',
    'area2': '',
    'registeno': '',
    'checkyn': 'all',
    'emptyyn': 'all',
    'owner1': '',
    'landkd': '',
    'comm_yn': '',
}
# Walk the query pages WHD2A00 -> WHD2A01 -> WHD2A02 -> WHD2A03 in order,
# carrying the accumulated form fields in `data`, then work out how many
# result pages have to be fetched. Only the 'HLD' court is queried.

BASE_URL = 'http://aomp.judicial.gov.tw/abbs/wkw/'
PAGE_SIZE = 15  # rows per result page; same value as data['pageSize']


def _apply_nccharset(form, page_text):
    """Copy the first ``nccharset`` input value found in *page_text* into *form*.

    ``re.findall`` returns a list, and posting a list sends the field once per
    match; only the first value is kept here. When the page carries no such
    input the key is removed from *form*, so the field is not posted at all
    (which is what posting an empty list amounted to).
    """
    found = re.search(u'name="nccharset" value="(.+?)"', page_text)
    if found:
        form['nccharset'] = found.group(1)
    else:
        form.pop('nccharset', None)


WHD2A00 = requests.post(BASE_URL + 'WHD2A00.jsp')
_apply_nccharset(data, WHD2A00.text)
data['court'] = 'HLD'

WHD2A01 = requests.post(BASE_URL + 'WHD2A01.jsp', data=data, headers=headers)
_apply_nccharset(data, WHD2A01.text)
data['proptype'] = 'C51'  # C52: buildings, C51: land
data['saletype'] = '1'

WHD2A02 = requests.post(BASE_URL + 'WHD2A02.jsp', data=data, headers=headers)
# WHD2A03.jsp takes the X-suffixed names; WHD2A02.jsp takes the plain ones.
data['courtX'] = 'HLD'
data['proptypeX'] = 'C51'
data['saletypeX'] = '1'
# A single two-group pattern keeps every hidden input's name paired with its
# own value instead of zipping two independent scans together.
d = dict(re.findall('"hidden" name="(.+?)" value="(.+?)">', WHD2A02.text))
data.update(d)

html = requests.post(BASE_URL + 'WHD2A03.jsp', verify=False, headers=headers, data=data)
total_cases = re.findall(u'<td>\\s+合計件數:&nbsp;(\\d+?)&nbsp;件', html.text)
if not total_cases:
    # Without the total there is nothing to paginate over.
    raise RuntimeError('WHD2A03.jsp response did not contain the total case count')

# `pages` is the exclusive upper bound for range(1, pages). Ceiling division
# avoids requesting an extra page when the total is an exact multiple of
# PAGE_SIZE, and `//` keeps the result an int on Python 3 as well.
pages = (int(total_cases[0]) + PAGE_SIZE - 1) // PAGE_SIZE + 1
print('%s %s' % (total_cases, pages))
# Fetch every result page of WHD2A03.jsp, pull the listing fields out of the
# HTML with regular expressions, and write all rows to HLD_<YYYYMMDD>.csv.

RESULT_URL = 'http://aomp.judicial.gov.tw/abbs/wkw/WHD2A03.jsp'
COLUMNS = ['num', 'item', 'auction_date', 'times', 'county',
           'address', 'area', 'price', 'court', 'check']

# Compiled once; the same patterns are applied to every page.
NUM_RE = re.compile(u'<div align="center">(\\d+?)</div>')
ITEM_RE = re.compile(u'<td width="15%">(.+?)<br>')
AUCTION_RE = re.compile(u'<div align="center">(\\d+\\/\\d+\\/\\d+?)<br>\\s+(.+?)\\s+</div>')
COUNTY_RE = re.compile(u'<div align="center">(.+)<br>\\s+(.+)?<')
ADDRESS_RE = re.compile(u'blank">\\s+(.+)\\s+</a>\\s+<br>\\s+(\\d+)\\s+(.+)\\s+<BR>(.+)\r\n')
PRICE_RE = re.compile(u'<td>\\s+(.+?)\\s+</td>')
CHECK_RE = re.compile(u'6%">\\s+<div align="center">(.+?)</div>')

foreclosure = {}
rows = []  # one dict per listing; the DataFrame is built once at the end
for x in range(1, pages):
    # NOTE(review): only pageNow changes between requests; rowStart keeps the
    # value already in `data` - confirm the server paginates on pageNow.
    data['pageNow'] = x
    html = requests.post(RESULT_URL, verify=False, headers=headers, data=data)
    page_text = html.text

    num = NUM_RE.findall(page_text)
    item = ITEM_RE.findall(page_text)
    auction_date = AUCTION_RE.findall(page_text)
    county = COUNTY_RE.findall(page_text)
    address = ADDRESS_RE.findall(page_text)
    price = PRICE_RE.findall(page_text)
    check = CHECK_RE.findall(page_text)

    for i, case_num in enumerate(num):
        # Values stay as text here; encoding happens once, in to_csv.
        foreclosure = {
            'num': case_num,
            'item': item[i],
            'auction_date': auction_date[i][0],
            'times': auction_date[i][1],
            # County matches are read one position ahead of the row index;
            # presumably the first match is not a data row - TODO confirm.
            'county': county[i + 1][1],
            'address': address[i][0],
            # Concatenation of the three groups that follow the address text.
            'area': u''.join(address[i][1:]),
            'price': price[i],
            'court': data['court'],
            'check': check[i],
        }
        rows.append(foreclosure)

# Building the frame once avoids the quadratic per-row DataFrame.append,
# which newer pandas versions no longer provide.
df2 = pd.DataFrame(rows, columns=COLUMNS)
datestr = datetime.date.today().strftime('%Y%m%d')
df2.to_csv('HLD_%s.csv' % datestr, encoding='utf-8')
@t0mst0ne
Copy link
Author

t0mst0ne commented Oct 7, 2014

'BCBC9440416E9EF00BDB80F199411869':'4E552F2AA089FD3A045C1D5A350C3275'
=> This key/value pair seems to change whenever the browser is closed.

@lanfon72
Copy link

res = requests.post('http://aomp.judicial.gov.tw/abbs/wkw/WHD2A02.jsp', data={'proptype':'C51', 'saletype':'1'}, headers=headers)
ks = re.findall('"hidden" name=".*" value="(.+?)">', res.text)
vs = re.findall('"hidden" name="(.+?)" value=".*">', res.text)
d ={ k:v for k,v in zip(ks,vs) }
data.update(d)

Here `headers` and `data` are the same dicts as in your script.

@t0mst0ne
Copy link
Author

(1) Minor update
ks = re.findall('"hidden" name="(.+?)" value=".+?">', res.text)
vs = re.findall('"hidden" name=".+?" value="(.+?)">', res.text)
(2) The form field names differ between WHD2A02.jsp and WHD2A03.jsp:
WHD2A03.jsp expects courtX, proptypeX, saletypeX;
WHD2A02.jsp expects court, saletype, proptype.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment