Last active
May 24, 2016 21:53
-
-
Save taoalpha/9478aa3503b5cfb03a754e0c542071f8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import json
import os
import re
import shutil
import string
import xml.etree.ElementTree as ET
import zipfile

import requests
# Maps the pinyin-initial abbreviation typed at the prompt to the Chinese
# display name of the sensor type; the display name is what the script then
# looks up in the name -> id map built from the server's response.
nameMap = {
    "fsfxy": "风速风向仪",
    "dqwdj": "大气温度计",
    "tjgwdj": "砼结构温度计",
    "gwdj": "钢温度计",
    "zxjsdj": "纵向加速度计",
    # The vertical (竖向) and three-axis (三向) accelerometers share the same
    # pinyin initials. As a duplicate dict key the vertical entry used to be
    # silently overwritten and could never be selected, so it gets a key of
    # its own. "sxjsdj" keeps resolving to the three-axis sensor, as before.
    # NOTE(review): "shuxjsdj" is a newly chosen key - confirm the preferred spelling.
    "shuxjsdj": "竖向加速度计",
    "hxjsdj": "横向加速度计",
    "sxjsuj": "双向加速度计",
    "sxjsdj": "三向加速度计",
    "wyj": "位移计",
    "gzxybj": "钢纵向应变计",
    "ghxybj": "钢横向应变计",
    "tzxybj": "砼纵向应变计",
    "tsxybj": "砼竖向应变计",
    "slj": "索力计",
    "zxplj": "纵向疲劳计",
    "hxplj": "横向疲劳计",
    "gpsz": "GPS站",
    "jlszy": "静力水准仪",
    "qxy": "倾斜仪",
    "gjj": "钢筋计",
    "wbywdj": "温补用温度计",
    "tylj": "土压力计",
    "sxwyj": "三向位移计",
    "yjt": "阳极梯",
    "gjylj": "钢筋应力计",
    "cjq": "车检器",
    "znq": "阻尼器",
    "dtczxt": "动态称重系统",
    "yjbhxt": "阴极保护系统",
    "csj": "除湿机",
    "hntylj": "混凝土应力计",
    "sxplj": "竖向疲劳计",
}
# Headers attached to the requests so that they look like they come from a
# desktop browser that navigated from the data-file tree page.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "referer": "http://health.shcjsq.com/web/m/m04/channel-data-file-tree?treetype=type",
}

# Endpoint that returns the tree nodes (names and ids) below a given node.
treeDataUrl = "http://health.shcjsq.com/web/m/m04/channel-data-file-tree!treedata?treetype=type"

# Form fields posted to the tree endpoint. The same field name is repeated,
# which a dict cannot express, so the data is kept as a list of (name, value)
# tuples; per the original author this is needed to pass the server's check.
treePostData = [
    ("imageUrl", icon)
    for icon in (
        "/web/themes/default/images/icons/tree_doc.gif",
        "/web/themes/default/images/icons/componenttypeblue.gif",
        "/web/themes/default/images/icons/contract.gif",
        "/web/themes/default/images/icons/office_supplies.gif",
        "/web/themes/default/images/icons/qbs_subbid1.gif",
        "/web/themes/default/images/icons/tree_ssjc_td.gif",
        "/web/themes/default/images/icons/flow.gif",
    )
]
# One shared requests session, passed to every helper below so that the state
# established by the login request is reused by the later calls.
s = requests.Session()
def login(s, person, password):
    '''
    Post the login form so the given session becomes an authenticated one.
    The response is not inspected; nothing is returned.
    @param {object} s - requests session used to send the login POST
    @param {string} person - username sent as the "loginname" field
    @param {string} password - password sent as the "loginpass" field
    '''
    logon_url = "http://health.shcjsq.com/web/logon"
    # "logon" carries the caption of the site's login button.
    form = dict(loginname=person, loginpass=password, logon="登 录")
    s.post(logon_url, data=form, headers=headers)
def downloadZip(s, filepath, downloadUrl):
    '''
    Stream the content behind a url into a local file.
    @param {object} s - session keeping requests object
    @param {string} filepath - path of the file to write (opened in binary mode, overwritten)
    @param {string} downloadUrl - download url of the zip file
    @return {string} filepath - the same path that was passed in
    '''
    block_size = 1024
    r = s.get(downloadUrl, stream=True)
    try:
        with open(filepath, 'wb') as f:
            # One progress line is printed per received chunk, numbered from 1.
            for count, chunk in enumerate(r.iter_content(chunk_size=block_size), 1):
                print("Downloading... " + str(count))
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # A streamed response keeps its connection checked out until it is
        # closed, so release it even when writing the file fails.
        r.close()
    return filepath
def getTheId(s, id):
    '''
    Query the tree-data endpoint for a node and build a name -> id map from the reply.
    The reply is scanned with two regular expressions (text:'...' and id:'...');
    the n-th text match is paired with the n-th id match.
    @param {object} s - session keeping requests object
    @param {string} id - node id, sent as both the "id" and the "node" form field
    @return {dict} idMap - name-id map
    '''
    # Copy the shared form fields so the module-level list is never mutated.
    form = list(treePostData) + [("id", id), ("node", id)]
    body = s.post(treeDataUrl, data=form, headers=headers).text
    names = re.findall(r'text:\'(.*?)\'', body)
    node_ids = re.findall(r'id:\'(.*?)\'', body)
    return dict(zip(names, node_ids))
def getTheXML(s, ids, date):
    '''
    Request the per-day file listing for a set of channels.
    The channel codes and the day go into the query string; the paging/column
    options are sent as form data.
    @param {object} s - session keeping requests object
    @param {list} ids - channel codes (strings), joined with "," in the url
    @param {string} date - the day to list, appended to the url as given
    @return {string} xml - text of the response body
    '''
    form = {"start": 0, "limit": 1000, "columns": "id"}
    channel_codes = ",".join(ids)
    xmlUrl = (
        "http://health.shcjsq.com/web/m/m04/channel-data-file-grid!griddataByDay?channelCode=("
        + channel_codes
        + ")&day="
        + date
    )
    print(xmlUrl)
    response = s.post(xmlUrl, data=form, headers=headers)
    return response.text
def getXmlId(s):
    '''
    Extract the file ids from an XML listing.
    @param {string} s - the XML document text (a string, not a session object)
    @return {list} ids - text of the <id> child of each <row> directly under the
                         root, in document order; rows whose <id> is missing
                         or empty are skipped
    Raises xml.etree.ElementTree.ParseError when the text is not well-formed XML.
    '''
    # Drop the XML declaration before parsing.
    # NOTE(review): presumably a workaround for parsing already-decoded text
    # that still declares an encoding - confirm it is still needed.
    xml_doc = s.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    root = ET.fromstring(xml_doc)
    # findtext gives None for a missing <id> and "" for an empty one; both are
    # filtered out so callers can safely join the result.
    candidates = (row.findtext('id') for row in root.findall('row'))
    return [file_id for file_id in candidates if file_id]
def create_folder(directory):
    '''
    Make sure a folder exists, creating missing parent folders as well.
    @param {string} directory - path to the folder
    @return {string} directory - the same path that was passed in
    Raises FileExistsError when the path exists but is not a directory.
    '''
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(directory, exist_ok=True)
    return directory
def arrangeFile(file_path):
    '''
    Group files into sub-folders named after the part of the file name before "#".
    A directory is walked recursively. A file "<prefix>#<rest>" is moved to
    "<its folder>/<prefix>/<prefix>#<rest>". Files whose name has no "#", or
    starts with "#", are left where they are.
    @param {string} file_path - file or directory to arrange
    '''
    if os.path.isdir(file_path):
        # NOTE: sub-folders that already exist are walked too, so running this
        # again on an arranged tree nests the files one level deeper.
        for file_name in os.listdir(file_path):
            arrangeFile(os.path.join(file_path, file_name))
    elif os.path.isfile(file_path):
        dir_path, file_name = os.path.split(file_path)
        prefix, separator, _rest = file_name.partition("#")
        if not separator or not prefix:
            # Without a "#" prefix there is nothing to group by. (Slicing with
            # the -1 from str.find would wrongly use the name minus its last
            # character as the folder name.)
            return
        new_dir_path = create_folder(os.path.join(dir_path, prefix))
        shutil.move(file_path, os.path.join(new_dir_path, file_name))
    else:
        print("It's not a file directory path or the path is invalid")
def UnzipFile(file_path):
    '''
    Extract a zip archive into a sibling folder and delete the archive.
    The folder is named after the archive without its last extension, e.g.
    "dir/data.2016.zip" is extracted into "dir/data.2016/".
    @param {string} file_path - path to the zip file
    The archive is only removed after extraction succeeded.
    '''
    dir_path, file_name = os.path.split(file_path)
    # splitext only strips the final extension, so names with several dots
    # (or none) no longer break the unpacking of a two-way split on ".".
    new_file_name, _suffix = os.path.splitext(file_name)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(dir_path, new_file_name))
    os.remove(file_path)
# Script body: ask for credentials, a sensor category and a date range, then
# download one zip per (sub-category, day) into
# <root>/<sensor name>/<sub-category>/<YYYY-MM-DD>/<sub-category>.zip
person = input('Enter the username: ')
password = input('Enter the password: ')
cate = input('Enter the category you want to download: ')
startDate = input('Enter the start date: ')
endDate = input('Enter the end date: ')
root = input('Enter the folder you want to store the data(default as current dir): ')

# Dates must be typed as YYYY-MM-DD; strptime raises ValueError otherwise.
start_date = datetime.datetime.strptime(startDate, "%Y-%m-%d")
end_date = datetime.datetime.strptime(endDate, "%Y-%m-%d")
totalDelta = end_date - start_date
totalDays = totalDelta.days

login(s, person, password)
deviceIdMap = getTheId(s, "device-type|<3000>")
# `cate` is the abbreviation from nameMap; a KeyError here means either an
# unknown abbreviation or a sensor name missing from the server's reply.
crawlId = deviceIdMap[nameMap[cate]]
if root != "":
    root = create_folder(root)
dataPath = create_folder(os.path.join(root, nameMap[cate]))
categories = getTheId(s, crawlId)
for j in categories:
    channels = getTheId(s, categories[j])
    catePath = create_folder(os.path.join(dataPath, j))
    # The channel code is the text between "<" and the final character of the
    # node id (ids look like "...<code>"). It does not depend on the day, so
    # it is computed once per sub-category.
    channelCodes = [x.split("<")[1][:-1] for x in channels.values()]
    # NOTE(review): the range is half-open, so the end date itself is not
    # downloaded - confirm whether it should be inclusive.
    for i in range(0, totalDays):
        # Derive each day from the fixed start date. Advancing start_date in
        # place made every sub-category after the first start at the end date.
        day = (start_date + datetime.timedelta(days=i)).strftime("%Y-%m-%d")
        xml = getTheXML(s, channelCodes, day)
        ids = getXmlId(xml)
        if not ids:
            # Nothing listed for this day: do not request an empty pack.
            print("No files found for " + j + " on " + day + ", skipping")
            continue
        # create the date folder for this day
        dateTimePath = create_folder(os.path.join(catePath, day))
        # download all listed files of the day as one pack
        downloadUrl = "http://health.shcjsq.com/web/m/m04/channel-data-file!downloadall?downloadType=pack&ids=" + ",".join(ids)
        downloadZip(s, os.path.join(dateTimePath, j + ".zip"), downloadUrl)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment