Skip to content

Instantly share code, notes, and snippets.

@taoalpha
Last active May 24, 2016 21:53
Show Gist options
  • Save taoalpha/9478aa3503b5cfb03a754e0c542071f8 to your computer and use it in GitHub Desktop.
Save taoalpha/9478aa3503b5cfb03a754e0c542071f8 to your computer and use it in GitHub Desktop.
import datetime
import json
import os
import re
import shutil
import string
import xml.etree.ElementTree as ET
import zipfile

import requests
# name mapper: maps a pinyin abbreviation (what the user types at the
# category prompt) to the Chinese sensor-category name used by the site
nameMap = {
    "fsfxy":"风速风向仪",
    "dqwdj":"大气温度计",
    "tjgwdj":"砼结构温度计",
    "gwdj":"钢温度计",
    "zxjsdj":"纵向加速度计",
    # NOTE(review): duplicate key -- this entry is shadowed by the later
    # "sxjsdj" (三向加速度计) entry below, so 竖向加速度计 can never be
    # selected. The correct disambiguating key needs to be confirmed.
    "sxjsdj":"竖向加速度计",
    "hxjsdj":"横向加速度计",
    # NOTE(review): "sxjsuj" looks like a deliberate (or accidental)
    # disambiguation of the colliding "sxjsdj" pinyin -- confirm with the site
    "sxjsuj":"双向加速度计",
    "sxjsdj":"三向加速度计",
    "wyj":"位移计",
    "gzxybj":"钢纵向应变计",
    "ghxybj":"钢横向应变计",
    "tzxybj":"砼纵向应变计",
    "tsxybj":"砼竖向应变计",
    "slj":"索力计",
    "zxplj":"纵向疲劳计",
    "hxplj":"横向疲劳计",
    "gpsz":"GPS站",
    "jlszy":"静力水准仪",
    "qxy":"倾斜仪",
    "gjj":"钢筋计",
    "wbywdj":"温补用温度计",
    "tylj":"土压力计",
    "sxwyj":"三向位移计",
    "yjt":"阳极梯",
    "gjylj":"钢筋应力计",
    "cjq":"车检器",
    "znq":"阻尼器",
    "dtczxt":"动态称重系统",
    "yjbhxt":"阴极保护系统",
    "csj":"除湿机",
    "hntylj":"混凝土应力计",
    "sxplj":"竖向疲劳计"
}
# headers sent with every request, to simulate a real browser session
headers = {
    "user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "referer":"http://health.shcjsq.com/web/m/m04/channel-data-file-tree?treetype=type"
}
# tree-data api entry: returns the categories' ids and other information
treeDataUrl = "http://health.shcjsq.com/web/m/m04/channel-data-file-tree!treedata?treetype=type"
# post data: a list of tuples instead of a dict because the same key
# ("imageUrl") must be repeated, which the server-side verification expects
treePostData = [
    ("imageUrl","/web/themes/default/images/icons/tree_doc.gif"),
    ("imageUrl","/web/themes/default/images/icons/componenttypeblue.gif"),
    ("imageUrl","/web/themes/default/images/icons/contract.gif"),
    ("imageUrl","/web/themes/default/images/icons/office_supplies.gif"),
    ("imageUrl","/web/themes/default/images/icons/qbs_subbid1.gif"),
    ("imageUrl","/web/themes/default/images/icons/tree_ssjc_td.gif"),
    ("imageUrl","/web/themes/default/images/icons/flow.gif")
]
# one shared session so the login cookie persists across all requests
s = requests.Session()
def login(s, person, password):
    '''
    Log into the monitoring site so the session carries the auth cookie.

    @param {object} s - session keeping requests object
    @param {string} person - username to login the system
    @param {string} password - password
    '''
    credentials = {
        "loginname": person,
        "loginpass": password,
        # label of the submit button; the form expects this exact value
        "logon": "登 录",
    }
    s.post("http://health.shcjsq.com/web/logon", data=credentials, headers=headers)
def downloadZip(s, filepath, downloadUrl):
    '''
    Download a (zip) file through the given url, streaming it to disk.

    @param {object} s - session keeping requests object
    @param {string} filepath - destination path for the downloaded file
    @param {string} downloadUrl - download url of the zip file
    @return {string} filepath - path of the file we downloaded
    '''
    r = s.get(downloadUrl, stream=True)
    block_size = 1024
    try:
        with open(filepath, 'wb') as f:
            for count, chunk in enumerate(r.iter_content(chunk_size=block_size), start=1):
                print("Downloading... " + str(count))
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # a streamed response must be closed explicitly, otherwise the
        # pooled connection is never released back to the session
        r.close()
    return filepath
def getTheId(s, id):
    '''
    Fetch one tree node from the server and build a name -> id map
    out of the javascript tree fragment it returns.

    @param {object} s - session keeping requests object
    @param {string} id - the node id we pass to the server to get its children
    @return {dict} idMap - name-id map of the child nodes
    '''
    # copy the shared post tuples, then append the node selector twice
    # (the endpoint reads both "id" and "node")
    payload = list(treePostData)
    payload += [("id", id), ("node", id)]
    response = s.post(treeDataUrl, data=payload, headers=headers)
    body = response.text
    # the response is a javascript object literal, not JSON -- scrape the
    # text:'...' and id:'...' fields with regexes and pair them up
    names = re.findall(r'text:\'(.*?)\'', body)
    node_ids = re.findall(r'id:\'(.*?)\'', body)
    return dict(zip(names, node_ids))
def getTheXML(s, ids, date):
    '''
    Retrieve the XML listing of data files for a set of channels on one day.

    @param {object} s - session keeping requests object
    @param {list} ids - the channel ids we want the files of
    @param {string} date - the day (YYYY-MM-DD) of the files we want
    @return {string} xml - the raw XML text of the response
    '''
    channel_list = ",".join(ids)
    xmlUrl = "http://health.shcjsq.com/web/m/m04/channel-data-file-grid!griddataByDay?channelCode=(" + channel_list + ")&day=" + date
    print(xmlUrl)
    # simple paging parameters; 1000 is assumed to cover one day's files
    grid_form = {"start": 0, "limit": 1000, "columns": "id"}
    response = s.post(xmlUrl, data=grid_form, headers=headers)
    return response.text
def getXmlId(s):
    '''
    Extract the data-file ids from the grid XML returned by getTheXML.

    @param {string} s - the raw XML text (note: NOT the session object,
        despite the parameter name kept for compatibility)
    @return {list} id_list - the text of every <row>/<id> element
    '''
    # strip the XML declaration: ET.fromstring on a str rejects a
    # declaration that names an encoding
    xml_doc = s.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    root = ET.fromstring(xml_doc)
    # each <row> describes one downloadable data file
    # (the original also parsed root[0].text as an int into an unused
    # local, which could raise on non-numeric content -- dropped)
    return [row.find('id').text for row in root.findall('row')]
def create_folder(directory):
    '''
    Make sure the folder exists on disk, creating it (and any missing
    parents) when absent.

    @param {string} directory - path to the folder
    @return {string} directory - the same path, handy for chaining
    '''
    try:
        os.makedirs(directory)
    except FileExistsError:
        # already present -- nothing to do
        pass
    return directory
def arrangeFile(file_path):
    '''
    Recursively group downloaded data files into per-device folders.

    Data files are named like "<device>#<rest>"; each one is moved into a
    sibling folder named after its "<device>" prefix. Directories are
    walked recursively; anything else is reported and skipped.

    @param {string} file_path - file or directory path to arrange
    '''
    if os.path.isdir(file_path):
        # recurse into every entry of the directory
        for file_name in os.listdir(file_path):
            arrangeFile(os.path.join(file_path, file_name))
    elif os.path.isfile(file_path):
        dir_path, file_name = os.path.split(file_path)
        pos = file_name.find("#")
        if pos < 0:
            # no device prefix: the original code would have moved the file
            # into a folder named after the file name minus its last
            # character (find() returned -1) -- skip it instead
            print("Skipping (no '#' in name): " + file_path)
            return
        # same effect as create_folder, inlined to keep this self-contained
        new_dir_path = os.path.join(dir_path, file_name[:pos])
        os.makedirs(new_dir_path, exist_ok=True)
        shutil.move(file_path, os.path.join(new_dir_path, file_name))
    else:
        print("It's not a file directory path or the path is invalid")
def UnzipFile(file_path):
    '''
    Extract a zip archive into a sibling folder named after the archive
    (without the .zip suffix), then delete the archive.

    @param {string} file_path - path to the .zip file
    '''
    dir_path, file_name = os.path.split(file_path)
    # splitext only strips the last extension; the original
    # file_name.split('.') raised ValueError on names like "a.b.zip"
    new_file_name = os.path.splitext(file_name)[0]
    # context manager guarantees the archive handle is closed before removal
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(dir_path, new_file_name))
    os.remove(file_path)
# ---- interactive entry point: crawl one sensor category over a date range ----
person = input('Enter the username: ')
password = input('Enter the password: ')
cate = input('Enter the category you want to download: ')  # pinyin key of nameMap
startDate = input('Enter the start date: ')  # expected format: YYYY-MM-DD
endDate = input('Enter the end date: ')
root = input('Enter the folder you want to store the data(default as current dir): ')
start_date = datetime.datetime.strptime(startDate, "%Y-%m-%d")
end_date = datetime.datetime.strptime(endDate, "%Y-%m-%d")
totalDelta = end_date - start_date
# NOTE(review): half-open range -- the end date itself is never downloaded;
# confirm whether an inclusive range (days + 1) was intended
totalDays = totalDelta.days
login(s, person, password)
# "device-type|<3000>" is the id of the device-type root node in the tree api
deviceIdMap = getTheId(s,"device-type|<3000>")
count = 1  # NOTE(review): unused from here on
crawlId = deviceIdMap[nameMap[cate]]
if root != "":
    root = create_folder(root)
dataPath = create_folder(os.path.join(root, nameMap[cate]))
categories = getTheId(s,crawlId)
for j in categories:
    # createTheFolderWithJ
    channels = getTheId(s,categories[j])
    catePath = create_folder(os.path.join(dataPath,j))
    for i in range(0,totalDays):
        # channel ids look like "name<ID>"; keep only the ID between the angle brackets
        xml = getTheXML(s,[x.split("<")[1][:-1] for x in list(channels.values())], start_date.strftime("%Y-%m-%d"))
        ids = getXmlId(xml)
        # create the date folder for this
        dateTimePath = create_folder(os.path.join(catePath,start_date.strftime("%Y-%m-%d")))
        #DOWNLOAD THE FILES
        downloadUrl="http://health.shcjsq.com/web/m/m04/channel-data-file!downloadall?downloadType=pack&ids="+",".join(ids)
        filepath = downloadZip(s, os.path.join(dateTimePath, j+".zip"), downloadUrl)
        # NOTE(review): start_date keeps advancing across iterations of the
        # outer loop -- after the first category finishes, every later
        # category queries dates past end_date. It should probably be reset
        # to the original start for each j; confirm intended behavior.
        start_date = start_date + datetime.timedelta(days=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment