Skip to content

Instantly share code, notes, and snippets.

@LuoZijun
Created October 29, 2015 04:26
Show Gist options
  • Save LuoZijun/a6b95437e6a43c41b1d2 to your computer and use it in GitHub Desktop.
Save LuoZijun/a6b95437e6a43c41b1d2 to your computer and use it in GitHub Desktop.
工商营业执照报备信息 抓取脚本
#!/usr/bin/env python
#coding: utf8
import json, re
import requests
try:
from bs4 import BeautifulSoup
except:
import BeautifulSoup
"""
工商营业执照报备信息 抓取脚本
也可以使用百度的 Site 命令来查询(这里以上海工商局为例):
https://www.baidu.com/s
?wd=site%3A%20www.sgs.gov.cn
&pn=10&oq=site%3A%20www.sgs.gov.cn
&ie=utf-8
&usm=1
&rsv_idx=1
&rsv_pq=eebb46df0000c4c9
&rsv_t=9ad1PvQgC5mDQX%2BEDu%2F%2Bk6LbiKQ802nBP2ksQkMM4dHpuIwRTpXO2uJEOX0
&rsv_page=1
Note:
不同的地区,查询的地址亦不一样,这里使用的是上海工商局查询地址。
"""
def trim(s):
    """Collapse every run of whitespace in *s* into a single space.

    Tabs, carriage returns and newlines all count as whitespace, so the
    result never contains a newline.  A leading or trailing run is reduced
    to one space rather than removed, which is what callers have always
    received from this function.

    :param s: text to normalise.  ``None`` is accepted and treated as an
        empty string, because callers pass BeautifulSoup ``.string`` values
        and those can be ``None``.
    :returns: the normalised text.
    """
    if s is None:
        return ""
    # A single pass suffices: "\t" -> " " and "\r\n" -> "\n" only swap one
    # whitespace character for another, and once "\s+" has run there are no
    # newlines left for a trailing "^\n+|\n+$" strip to remove.
    return re.sub(r"\s+", " ", s)
def query(eid, timeout=30):
    """Fetch and parse the registration detail page for one enterprise.

    Sends *eid* as the ``etpsId`` form field in a POST to the
    www.sgs.gov.cn detail endpoint and passes the raw response body to
    ``parse``.

    :param eid: enterprise id, e.g. ``260000012015080700015``; it is
        converted with ``str()`` before being sent.
    :param timeout: seconds to wait for the server.  Without a timeout
        ``requests`` waits indefinitely, so one stalled connection would
        hang the whole crawl.
    :returns: whatever ``parse`` returns for the response body.
    :raises requests.exceptions.RequestException: on network failure or
        timeout.
    """
    url = "http://www.sgs.gov.cn/lz/etpsInfo.do?method=viewDetail"
    form = {"etpsId": str(eid)}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36",
        # The request is made to look as if it came from the search page.
        "Referer": "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch",
    }
    response = requests.post(url, data=form, headers=headers, timeout=timeout)
    return parse(response.content)
def _cell_text(tag):
    """Return the whitespace-normalised text of a BeautifulSoup tag."""
    text = tag.string
    if text is None:
        # ``.string`` is None when the tag holds more than one child node;
        # fall back to the concatenated text so trim() never receives None.
        text = tag.get_text()
    return trim(text)


def _collect_pairs(rows, target):
    """Copy label/value ``<td>`` pairs found in *rows* into the dict *target*.

    Returns False as soon as an element with an odd number of ``<td>``
    cells is met (nothing further is copied), True otherwise.
    """
    for row in rows:
        cells = row.find_all("td")
        if len(cells) % 2 != 0:
            return False
        # Cells alternate label, value, label, value, ...
        for label, content in zip(cells[0::2], cells[1::2]):
            key = _cell_text(label).replace("\n", "").replace(" ", "").replace(":", "")
            target[key] = _cell_text(content).replace("\n", "")
    return True


def parse(content):
    """Extract company registration data from a detail page.

    The second ``<table class="list">`` of the page is expected to contain
    two sections: ``resultTbInfo`` (basic company information) and
    ``resultTbAnnl`` (annual inspection records).  Each section contributes
    one entry to the result, keyed by the section title found in the page.

    Shape of a complete result (keys come from the page itself)::

        {
            "企业基本信息": {
                "企业法人营业执照": {
                    "名称": "...",
                    "注册号": "...",
                    "注册资本": "...",
                    "成立日期": "...",
                    ...
                }
            },
            "工商年检信息": {
                "年检年度": "年检状态或结果"
            }
        }

    :param content: raw HTML of the detail page.
    :returns: a dict as shown above.  An empty dict is returned (after
        printing a message) when the page has fewer than two ``list``
        tables or when the first section's heading or data table is
        missing.  If a row with an odd number of cells is met, whatever was
        collected so far is returned.

    Other structural surprises (for example a page without the
    ``resultTbInfo`` table) are not handled here and surface as
    ``AttributeError``.
    """
    dom = BeautifulSoup(content, 'html.parser')
    tables = dom.find_all("table", class_="list")
    if len(tables) < 2:
        print("ERROR: HTML 无法解析.")
        # Always hand back a dict so callers can treat every result alike.
        return {}
    table = tables[1]
    result = {}

    # Section 1: basic company information.
    company_info = table.find("table", id="resultTbInfo")
    nodes = company_info.findChildren()
    section_title = _cell_text(nodes[0].find("td", class_="info_title")).replace(" ", "")
    section_content = nodes[1].find_next().find_next()
    try:
        title = _cell_text(section_content.find("center", class_="info_table_head")).replace(" ", "")
        rows = section_content.find("table", class_="list_boder").findChildren()
    except (AttributeError, TypeError):
        # find() yields None when the element is absent, so the attribute
        # access above fails; treat that as an unparseable page.
        print("WARN: 解析异常.")
        return {}
    details = {}
    result[section_title] = {title: details}
    if not _collect_pairs(rows, details):
        return result

    # Section 2: annual inspection records.
    section = table.find("table", id="resultTbAnnl")
    children = section.findChildren()
    section_title = _cell_text(children[0].find("td", class_="info_title")).replace(" ", "")
    section_content = children[1].find_next().find_next()
    rows = section_content.find("table", class_="list_boder").findChildren()
    inspections = {}
    result[section_title] = inspections
    _collect_pairs(rows, inspections)
    return result
def report(result):
    """Print a parsed record as a tab-indented tree, up to three levels deep.

    Every top-level key is printed as a header line.  Dict values are
    expanded one level per tab; scalar values are printed as ``key: value``.

    :param result: nested dict to print.  Anything that is not a dict (for
        example an empty string signalling a failed parse) prints nothing.
    :returns: None.
    """
    if not isinstance(result, dict):
        return
    for key, value in result.items():
        print(u"%s :" % (key,))
        if not isinstance(value, dict):
            print(u"%s : %s" % (key, value))
            continue
        for skey, svalue in value.items():
            print(u"\t %s :" % (skey,))
            if isinstance(svalue, dict):
                for tkey, tvalue in svalue.items():
                    print(u"\t\t %s :" % (tkey,))
                    print(u"\t\t\t %s: %s" % (tkey, tvalue))
            else:
                # Second-level scalars used to get a header but no value.
                print(u"\t\t %s: %s" % (skey, svalue))
"""
etpsId:
length: 21
260000012015080700015
"""
# Range of etpsId values to crawl; END_ETPS_ID itself is not queried.
FIRST_ETPS_ID = 260000012015080700015
END_ETPS_ID = 260000012015080700300


def main():
    """Query and print every etpsId in [FIRST_ETPS_ID, END_ETPS_ID)."""
    for etps_id in range(FIRST_ETPS_ID, END_ETPS_ID):
        report(query(str(etps_id)))


# Only crawl when run as a script, so that importing this module (e.g. to
# reuse parse()) does not fire off network requests.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment