Created
October 29, 2015 04:26
-
-
Save LuoZijun/a6b95437e6a43c41b1d2 to your computer and use it in GitHub Desktop.
工商营业执照报备信息 抓取脚本
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#coding: utf8 | |
import json, re | |
import requests | |
try: | |
from bs4 import BeautifulSoup | |
except: | |
import BeautifulSoup | |
""" | |
工商营业执照报备信息 抓取脚本 | |
也可以使用百度的 Site 命令来查询(这里以上海工商局为例): | |
https://www.baidu.com/s | |
?wd=site%3A%20www.sgs.gov.cn | |
&pn=10&oq=site%3A%20www.sgs.gov.cn | |
&ie=utf-8 | |
&usm=1 | |
&rsv_idx=1 | |
&rsv_pq=eebb46df0000c4c9 | |
&rsv_t=9ad1PvQgC5mDQX%2BEDu%2F%2Bk6LbiKQ802nBP2ksQkMM4dHpuIwRTpXO2uJEOX0 | |
&rsv_page=1 | |
Note: | |
不同的地区,查询的地址亦不一样,这里使用的是上海工商局查询地址。 | |
""" | |
def trim(s):
    """Collapse every run of whitespace in *s* into a single space.

    Tabs, carriage returns and newlines all count as whitespace, so the
    result never contains a line break.  Leading and trailing whitespace is
    collapsed to one space, not removed.

    Args:
        s: The text to normalise.

    Returns:
        The text with each whitespace run replaced by one ``" "``.
    """
    # One pass is enough: "\s+" already covers tabs, "\r\n" and "\n", so the
    # separate tab/CRLF substitutions and the newline strip were no-ops.
    # The raw string keeps "\s" a regex escape instead of an invalid string
    # escape.
    return re.sub(r"\s+", " ", s)
def query(eid, timeout=30):
    """Fetch the detail page for one enterprise id and parse it.

    Sends a form POST to the Shanghai AIC ``viewDetail`` endpoint with the
    id in the ``etpsId`` field and hands the raw response body to
    ``parse``.

    Args:
        eid: Enterprise id; converted with ``str()`` before sending
            (e.g. 260000012015080700015).
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without one, a stalled connection would block forever.

    Returns:
        Whatever ``parse`` returns for the response body.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
    """
    url = "http://www.sgs.gov.cn/lz/etpsInfo.do?method=viewDetail"
    form = {"etpsId": str(eid)}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36",
        "Referer": "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch",
    }
    response = requests.post(url, data=form, headers=headers, timeout=timeout)
    return parse(response.content)
def _collect_pairs(rows, target):
    """Copy label/value cell pairs from *rows* into the dict *target*.

    Each row's ``<td>`` cells are read as (label, value, label, value, ...).

    Returns:
        False as soon as a row with an odd number of ``<td>`` cells is met
        (nothing further is copied), True when every row was consumed.
    """
    for row in rows:
        cells = row.find_all("td")
        if len(cells) % 2 != 0:
            return False
        for label_cell, value_cell in zip(cells[0::2], cells[1::2]):
            key = trim(label_cell.string).replace("\n", "").replace(" ", "").replace(":", "")
            value = trim(value_cell.string).replace("\n", "")
            target[key] = value
    return True


def parse(content):
    """Extract company and annual-inspection data from a detail page.

    The second ``<table class="list">`` of the page is searched for two
    nested tables: ``resultTbInfo`` (basic company information) and
    ``resultTbAnnl`` (annual inspection information).  Keys of the result
    are the labels found on the page itself.

    Args:
        content: HTML of the detail page.

    Returns:
        A dict shaped like the example below.  ``{}`` is returned when the
        page has fewer than two ``list`` tables or the company section
        cannot be read.  A partial dict is returned when the annual
        inspection section cannot be read or a row has an odd number of
        cells.

        {
            "工商年检信息":{
                "年检年度":"年检状态或结果"
            },
            "企业基本信息":{
                "企业法人营业执照":{
                    "注册资本":"1000.000000 万人民币 ",
                    "经营范围":"从事信息科技领域内的技术开发、技术咨询、技术服务 ....",
                    "公司类型":"有限责任公司(国内合资)",
                    "成立日期":" 2015年08月07日 ",
                    "注册号":"310120002873260",
                    "受理机关":"奉贤区市场监管局",
                    "住所":"上海市奉贤区青村镇光明东街1号第20幢2140室",
                    "法定代表人姓名":"彭铭海",
                    "企业状态":"确立",
                    "营业期限":"2015年08月07日 至2045年08月06日",
                    "名称":"上海萌萃信息科技有限公司",
                    "登记机关":"奉贤区市场监管局"
                }
            }
        }
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    dom = BeautifulSoup(content, 'html.parser')
    tables = dom.find_all("table", class_="list")
    if len(tables) < 2:
        print("ERROR: HTML 无法解析.")
        # Same empty value as the other failure path, so callers always
        # receive a dict.
        return {}
    table = tables[1]
    result = {}

    # Basic company information.
    try:
        company_info = table.find("table", id="resultTbInfo")
        nodes = company_info.findChildren()
        section_title = trim(nodes[0].find("td", class_="info_title").string).replace(" ", "")
        section_content = nodes[1].find_next().find_next()
        title = trim(section_content.find("center", class_="info_table_head").string).replace(" ", "")
        rows = section_content.find("table", class_="list_boder").findChildren()
    except (AttributeError, TypeError, IndexError):
        # A missing element yields None (AttributeError on the next call,
        # TypeError inside trim) or too few children (IndexError).
        print("WARN: 解析异常.")
        return {}
    details = {}
    result[section_title] = {title: details}
    if not _collect_pairs(rows, details):
        return result

    # Annual inspection information.
    try:
        section = table.find("table", id="resultTbAnnl")
        children = section.findChildren()
        section_title = trim(children[0].find("td", class_="info_title").string).replace(" ", "")
        section_content = children[1].find_next().find_next()
        rows = section_content.find("table", class_="list_boder").findChildren()
    except (AttributeError, TypeError, IndexError):
        # Keep the company data already collected instead of crashing.
        print("WARN: 解析异常.")
        return result
    inspections = {}
    result[section_title] = inspections
    _collect_pairs(rows, inspections)
    return result
def report(result):
    """Print a parsed record to stdout as an indented tree.

    Every key is printed as a ``"<key> :"`` header line.  Nested dicts are
    descended into (up to three levels); a non-dict value is printed on an
    extra ``key: value`` line after its header.

    Args:
        result: The dict produced by ``parse``.  Anything that is not a
            dict (for example the empty string ``parse`` returns when a
            page cannot be read) is ignored.

    Returns:
        None.
    """
    if not isinstance(result, dict):
        # Failed parses are not dicts; previously this raised
        # AttributeError on ``.keys()``.
        return
    for key, section in result.items():
        print(u"%s :" % (key,))
        if not isinstance(section, dict):
            print(u"%s : %s" % (key, section))
            continue
        for skey, group in section.items():
            print(u"\t %s :" % (skey,))
            if not isinstance(group, dict):
                # Two-level sections hold plain values here; print them
                # instead of silently dropping them.
                print(u"\t\t %s: %s" % (skey, group))
                continue
            for tkey, value in group.items():
                print(u"\t\t %s :" % (tkey,))
                print(u"\t\t\t %s: %s" % (tkey, value))
# etpsId values are 21-digit numbers, e.g. 260000012015080700015.
FIRST_ETPS_ID = 260000012015080700015
LAST_ETPS_ID = 260000012015080700300  # exclusive upper bound


def main():
    """Query and print every enterprise id in [FIRST_ETPS_ID, LAST_ETPS_ID)."""
    for eid in range(FIRST_ETPS_ID, LAST_ETPS_ID):
        report(query(str(eid)))


# Only crawl when run as a script, so importing this module for its
# helpers does not fire off network requests.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment