Skip to content

Instantly share code, notes, and snippets.

@taoalpha
Last active May 24, 2016 21:53
Show Gist options
  • Save taoalpha/9478aa3503b5cfb03a754e0c542071f8 to your computer and use it in GitHub Desktop.
Save taoalpha/9478aa3503b5cfb03a754e0c542071f8 to your computer and use it in GitHub Desktop.
import datetime
import json
import os
import re
import shutil
import string
import xml.etree.ElementTree as ET
import zipfile

import requests
# name mapper: maps a pinyin abbreviation (what the user types at the
# category prompt) to the Chinese sensor-category name used by the site
nameMap = {
    "fsfxy":"风速风向仪",
    "dqwdj":"大气温度计",
    "tjgwdj":"砼结构温度计",
    "gwdj":"钢温度计",
    "zxjsdj":"纵向加速度计",
    # NOTE(review): duplicate key -- this entry is shadowed by the later
    # "sxjsdj" (三向加速度计) entry below, so 竖向加速度计 can never be
    # selected. The correct disambiguating key needs to be confirmed.
    "sxjsdj":"竖向加速度计",
    "hxjsdj":"横向加速度计",
    # NOTE(review): "sxjsuj" looks like a deliberate (or accidental)
    # disambiguation of the colliding "sxjsdj" pinyin -- confirm with the site
    "sxjsuj":"双向加速度计",
    "sxjsdj":"三向加速度计",
    "wyj":"位移计",
    "gzxybj":"钢纵向应变计",
    "ghxybj":"钢横向应变计",
    "tzxybj":"砼纵向应变计",
    "tsxybj":"砼竖向应变计",
    "slj":"索力计",
    "zxplj":"纵向疲劳计",
    "hxplj":"横向疲劳计",
    "gpsz":"GPS站",
    "jlszy":"静力水准仪",
    "qxy":"倾斜仪",
    "gjj":"钢筋计",
    "wbywdj":"温补用温度计",
    "tylj":"土压力计",
    "sxwyj":"三向位移计",
    "yjt":"阳极梯",
    "gjylj":"钢筋应力计",
    "cjq":"车检器",
    "znq":"阻尼器",
    "dtczxt":"动态称重系统",
    "yjbhxt":"阴极保护系统",
    "csj":"除湿机",
    "hntylj":"混凝土应力计",
    "sxplj":"竖向疲劳计"
}
# headers sent with every request, to simulate a real browser session
headers = {
    "user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "referer":"http://health.shcjsq.com/web/m/m04/channel-data-file-tree?treetype=type"
}
# tree-data api entry: returns the categories' ids and other information
treeDataUrl = "http://health.shcjsq.com/web/m/m04/channel-data-file-tree!treedata?treetype=type"
# post data: a list of tuples instead of a dict because the same key
# ("imageUrl") must be repeated, which the server-side verification expects
treePostData = [
    ("imageUrl","/web/themes/default/images/icons/tree_doc.gif"),
    ("imageUrl","/web/themes/default/images/icons/componenttypeblue.gif"),
    ("imageUrl","/web/themes/default/images/icons/contract.gif"),
    ("imageUrl","/web/themes/default/images/icons/office_supplies.gif"),
    ("imageUrl","/web/themes/default/images/icons/qbs_subbid1.gif"),
    ("imageUrl","/web/themes/default/images/icons/tree_ssjc_td.gif"),
    ("imageUrl","/web/themes/default/images/icons/flow.gif")
]
# one shared session so the login cookie persists across all requests
s = requests.Session()
def login(s, person, password):
    '''
    Log into the monitoring site so the session carries the auth cookie.

    @param {object} s - session keeping requests object
    @param {string} person - username to login the system
    @param {string} password - password
    '''
    credentials = {
        "loginname": person,
        "loginpass": password,
        # label of the submit button; the form expects this exact value
        "logon": "登 录",
    }
    s.post("http://health.shcjsq.com/web/logon", data=credentials, headers=headers)
def downloadZip(s, filepath, downloadUrl):
    '''
    Download a (zip) file through the given url, streaming it to disk.

    @param {object} s - session keeping requests object
    @param {string} filepath - destination path for the downloaded file
    @param {string} downloadUrl - download url of the zip file
    @return {string} filepath - path of the file we downloaded
    '''
    r = s.get(downloadUrl, stream=True)
    block_size = 1024
    try:
        with open(filepath, 'wb') as f:
            for count, chunk in enumerate(r.iter_content(chunk_size=block_size), start=1):
                print("Downloading... " + str(count))
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # a streamed response must be closed explicitly, otherwise the
        # pooled connection is never released back to the session
        r.close()
    return filepath
def getTheId(s, id):
    '''
    Fetch one tree node from the server and build a name -> id map
    out of the javascript tree fragment it returns.

    @param {object} s - session keeping requests object
    @param {string} id - the node id we pass to the server to get its children
    @return {dict} idMap - name-id map of the child nodes
    '''
    # copy the shared post tuples, then append the node selector twice
    # (the endpoint reads both "id" and "node")
    payload = list(treePostData)
    payload += [("id", id), ("node", id)]
    response = s.post(treeDataUrl, data=payload, headers=headers)
    body = response.text
    # the response is a javascript object literal, not JSON -- scrape the
    # text:'...' and id:'...' fields with regexes and pair them up
    names = re.findall(r'text:\'(.*?)\'', body)
    node_ids = re.findall(r'id:\'(.*?)\'', body)
    return dict(zip(names, node_ids))
def getTheXML(s, ids, date):
    '''
    Retrieve the XML listing of data files for a set of channels on one day.

    @param {object} s - session keeping requests object
    @param {list} ids - the channel ids we want the files of
    @param {string} date - the day (YYYY-MM-DD) of the files we want
    @return {string} xml - the raw XML text of the response
    '''
    channel_list = ",".join(ids)
    xmlUrl = "http://health.shcjsq.com/web/m/m04/channel-data-file-grid!griddataByDay?channelCode=(" + channel_list + ")&day=" + date
    print(xmlUrl)
    # simple paging parameters; 1000 is assumed to cover one day's files
    grid_form = {"start": 0, "limit": 1000, "columns": "id"}
    response = s.post(xmlUrl, data=grid_form, headers=headers)
    return response.text
def getXmlId(s):
    '''
    Extract the data-file ids from the grid XML returned by getTheXML.

    @param {string} s - the raw XML text (note: NOT the session object,
        despite the parameter name kept for compatibility)
    @return {list} id_list - the text of every <row>/<id> element
    '''
    # strip the XML declaration: ET.fromstring on a str rejects a
    # declaration that names an encoding
    xml_doc = s.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    root = ET.fromstring(xml_doc)
    # each <row> describes one downloadable data file
    # (the original also parsed root[0].text as an int into an unused
    # local, which could raise on non-numeric content -- dropped)
    return [row.find('id').text for row in root.findall('row')]
def create_folder(directory):
    '''
    Make sure the folder exists on disk, creating it (and any missing
    parents) when absent.

    @param {string} directory - path to the folder
    @return {string} directory - the same path, handy for chaining
    '''
    try:
        os.makedirs(directory)
    except FileExistsError:
        # already present -- nothing to do
        pass
    return directory
def arrangeFile(file_path):
    '''
    Recursively group downloaded data files into per-device folders.

    Data files are named like "<device>#<rest>"; each one is moved into a
    sibling folder named after its "<device>" prefix. Directories are
    walked recursively; anything else is reported and skipped.

    @param {string} file_path - file or directory path to arrange
    '''
    if os.path.isdir(file_path):
        # recurse into every entry of the directory
        for file_name in os.listdir(file_path):
            arrangeFile(os.path.join(file_path, file_name))
    elif os.path.isfile(file_path):
        dir_path, file_name = os.path.split(file_path)
        pos = file_name.find("#")
        if pos < 0:
            # no device prefix: the original code would have moved the file
            # into a folder named after the file name minus its last
            # character (find() returned -1) -- skip it instead
            print("Skipping (no '#' in name): " + file_path)
            return
        # same effect as create_folder, inlined to keep this self-contained
        new_dir_path = os.path.join(dir_path, file_name[:pos])
        os.makedirs(new_dir_path, exist_ok=True)
        shutil.move(file_path, os.path.join(new_dir_path, file_name))
    else:
        print("It's not a file directory path or the path is invalid")
def UnzipFile(file_path):
    '''
    Extract a zip archive into a sibling folder named after the archive
    (without the .zip suffix), then delete the archive.

    @param {string} file_path - path to the .zip file
    '''
    dir_path, file_name = os.path.split(file_path)
    # splitext only strips the last extension; the original
    # file_name.split('.') raised ValueError on names like "a.b.zip"
    new_file_name = os.path.splitext(file_name)[0]
    # context manager guarantees the archive handle is closed before removal
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(dir_path, new_file_name))
    os.remove(file_path)
# ---- interactive entry point: crawl one sensor category over a date range ----
person = input('Enter the username: ')
password = input('Enter the password: ')
cate = input('Enter the category you want to download: ')  # pinyin key of nameMap
startDate = input('Enter the start date: ')  # expected format: YYYY-MM-DD
endDate = input('Enter the end date: ')
root = input('Enter the folder you want to store the data(default as current dir): ')
start_date = datetime.datetime.strptime(startDate, "%Y-%m-%d")
end_date = datetime.datetime.strptime(endDate, "%Y-%m-%d")
totalDelta = end_date - start_date
# NOTE(review): half-open range -- the end date itself is never downloaded;
# confirm whether an inclusive range (days + 1) was intended
totalDays = totalDelta.days
login(s, person, password)
# "device-type|<3000>" is the id of the device-type root node in the tree api
deviceIdMap = getTheId(s,"device-type|<3000>")
count = 1  # NOTE(review): unused from here on
crawlId = deviceIdMap[nameMap[cate]]
if root != "":
    root = create_folder(root)
dataPath = create_folder(os.path.join(root, nameMap[cate]))
categories = getTheId(s,crawlId)
for j in categories:
    # createTheFolderWithJ
    channels = getTheId(s,categories[j])
    catePath = create_folder(os.path.join(dataPath,j))
    for i in range(0,totalDays):
        # channel ids look like "name<ID>"; keep only the ID between the angle brackets
        xml = getTheXML(s,[x.split("<")[1][:-1] for x in list(channels.values())], start_date.strftime("%Y-%m-%d"))
        ids = getXmlId(xml)
        # create the date folder for this
        dateTimePath = create_folder(os.path.join(catePath,start_date.strftime("%Y-%m-%d")))
        #DOWNLOAD THE FILES
        downloadUrl="http://health.shcjsq.com/web/m/m04/channel-data-file!downloadall?downloadType=pack&ids="+",".join(ids)
        filepath = downloadZip(s, os.path.join(dateTimePath, j+".zip"), downloadUrl)
        # NOTE(review): start_date keeps advancing across iterations of the
        # outer loop -- after the first category finishes, every later
        # category queries dates past end_date. It should probably be reset
        # to the original start for each j; confirm intended behavior.
        start_date = start_date + datetime.timedelta(days=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment