A Python script for getting the administrative-division (location) data from the National Bureau of Statistics of the People's Republic of China.
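The script walks the Bureau's 2013 division-code pages level by level (province, city, district/county, town/street) and prints one SQL INSERT statement per region to stdout. It targets Python 2 and the old BeautifulSoup 3 API (the BeautifulSoup import and the fromEncoding argument). Each statement inserts into a table named TB_IP_LocationCode; the concrete values below are only illustrative, but every generated line follows this shape:

INSERT INTO `TB_IP_LocationCode` VALUES (1,1, 11, 1, '北京市', 0, 1418608560000, 'B');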
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
import time
import sys
import socket
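# Scraping order: province index page -> city pages -> district/county pages -> town/street pages.
# Each fetcher below retries its HTTP request once if the first attempt fails or times out.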
def getProvice(base_url):
    try:
        #time.sleep(5)   # optional politeness delay between requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # retry once so that 'page' is defined even if the first request fails
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    pro_nodes = soup.findAll(attrs = {"class" : "provincetr"})
    pro_data = []
    for pro_list in pro_nodes:
        for pro in pro_list.contents:
            proname = pro.a.contents[0]
            prolink = pro.a.attrs[0][1]   # relative link such as "11.html"
            code = prolink[0 : -5]        # drop ".html" to keep the 2-digit province code
            data = (proname, code, prolink)
            pro_data.append(data)
    return pro_data
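# getCity: parse a province page; every "citytr" row carries the 12-digit
# statistical code and the name of one prefecture-level city, plus a link to its page.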
def getCity(base_url):
    try:
        #time.sleep(5)   # optional politeness delay between requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # retry once so that 'page' is defined even if the first request fails
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    city_nodes = soup.findAll(attrs = {"class" : "citytr"})
    city_data = []
    for city in city_nodes:
        fullcode = city.a.contents[0]               # full 12-digit statistical code
        code = fullcode[0 : 4]                      # first 4 digits identify the city
        cityname = city.contents[1].a.contents[0]
        link = city.a.attrs[0][1]
        data = (cityname, code, link)
        city_data.append(data)
    return city_data
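# getDistrict: parse a city page; "countytr" rows carry district/county codes and names.
# If the page has no such rows the city skips the district level, and the towns listed
# there are returned instead, flagged with False.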
def getDistrict(base_url):
    try:
        #time.sleep(5)   # optional politeness delay between requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # retry once so that 'page' is defined even if the first request fails
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    district_nodes = soup.findAll(attrs = {"class" : "countytr"})
    district_data = []
    if len(district_nodes) > 0:
        for district in district_nodes:
            if district.a:
                # normal row: linked code and name cells
                fullcode = district.a.contents[0]
                code = fullcode[0 : 6]              # first 6 digits identify the district/county
                districtname = district.contents[1].a.contents[0]
                link = district.a.attrs[0][1]
                data = (districtname, code, link, True)
                district_data.append(data)
            else:
                # rows without a link (e.g. 市辖区): keep code and name only
                fullcode = district.contents[0].contents[0]
                code = fullcode[0 : 6]
                districtname = district.contents[1].contents[0]
                link = None
                data = (districtname, code, link, True)
                district_data.append(data)
    else:
        # city page with no district level: it lists towns/streets directly,
        # so record them here and flag them with False
        data_street = getStreet(base_url)
        for street in data_street:
            data = (street[0], street[1][0 : 9], None, False)
            #print data
            district_data.append(data)
    return district_data
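# getStreet: parse a district/county page; "towntr" rows carry the 9-digit
# town/street codes and names.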
def getStreet(base_url):
    sys.stderr.write(base_url + "\n")   # progress log
    try:
        #time.sleep(5)   # optional politeness delay between requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # retry once so that 'page' is defined even if the first request fails
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    street_nodes = soup.findAll(attrs = {"class" : "towntr"})
    street_data = []
    for street in street_nodes:
        fullcode = street.a.contents[0]
        code = fullcode[0 : 9]                      # first 9 digits identify the town/street
        streetname = street.contents[1].a.contents[0]
        data = (streetname, code)
        street_data.append(data)
    return street_data
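# genSQL: print one INSERT statement for the TB_IP_LocationCode table.
# head_map maps a 2-digit province code to the pinyin initial of the province
# name (e.g. "11" -> "B" for 北京); non-province codes get NULL instead.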
def genSQL(table, name, code, count, parent, level, time):
    count = str(count)
    head = "NULL"
    head_map = {"11" : "B", "12" : "T", "13" : "H", "14" : "S",
                "15" : "N", "21" : "L", "22" : "J", "23" : "H", "31" : "S",
                "32" : "J", "33" : "Z", "34" : "A", "35" : "F", "36" : "J",
                "37" : "S", "41" : "H", "42" : "H", "43" : "H", "44" : "G",
                "45" : "G", "46" : "H", "50" : "C", "51" : "S", "52" : "G",
                "53" : "Y", "54" : "X", "61" : "S", "62" : "G", "63" : "Q",
                "64" : "L", "65" : "X", "71" : "T", "81" : "X", "82" : "A"}
    if code in head_map:
        head = head_map[code]
    sql = r"INSERT INTO `" + table + "` VALUES (" + count + ",1, " + code
    sql += (r", " + level + ", '" + name + r"', " + parent + r", " + time + r", ")
    if head == "NULL":
        sql += r"NULL);"
    else:
        sql += (r"'" + head + r"');")
    print sql.encode('gbk')
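# main: walk province -> city -> district -> street, numbering every region with a
# running counter and emitting one INSERT statement per region as it goes.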
def main():
    pro_base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/"
    table = 'TB_IP_LocationCode'
    count = 0
    timec = str(int(1000 * time.time()))   # millisecond timestamp shared by every row
    pro_data = getProvice(pro_base_url)
    # NOTE: only a slice of the province list is processed per run;
    # adjust or remove this line to scrape the whole country in one go
    pro_data = pro_data[35 : 40]
    for pro in pro_data:
        count += 1
        genSQL(table, pro[0], pro[1], count, "0", "1", timec)
        prolink = pro[2]
        city_data = getCity(pro_base_url + prolink)
        for city in city_data:
            count += 1
            genSQL(table, city[0], city[1], count, pro[1], "2", timec)
            citylink = city[2]
            district_data = getDistrict(pro_base_url + citylink)
            for district in district_data:
                if district[3]:   # a real district/county row
                    count += 1
                    genSQL(table, district[0], district[1], count, city[1], "3", timec)
                    district_link = district[2]
                    if district_link:
                        street_data = getStreet(pro_base_url + pro[1] + '/' + district_link)
                        for street in street_data:
                            count += 1
                            genSQL(table, street[0], street[1], count, district[1], "4", timec)
                else:
                    # city without a district level: insert a placeholder district (u"区"),
                    # then the town/street row under it
                    new_node = district[1][0 : 6]
                    sys.stderr.write(new_node + "\n")
                    count += 1
                    genSQL(table, u"区", new_node, count, city[1], "3", timec)
                    count += 1
                    genSQL(table, district[0], district[1], count, new_node, "4", timec)

if __name__ == "__main__":
    main()
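To run it (the file name below is only an example), use Python 2 with BeautifulSoup 3 installed and redirect stdout, which carries the GBK-encoded INSERT statements, into a .sql file; progress and retry messages go to stderr:

python get_location_code.py > location_code.sql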