Skip to content

Instantly share code, notes, and snippets.

@imweijh
Last active November 26, 2020 14:06
Show Gist options
  • Save imweijh/54c79fa815c39f1e5bfb5f249c68b82e to your computer and use it in GitHub Desktop.
szfdc project web scraping huarun4
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Scrape the Shenzhen szfdc real-estate site: starting from one project-detail
# page, follow two levels of "building" links, then every "housedetail" link,
# and print one numbered CSV row per house.
#
# Fixes over the original flat script:
#   * the loop variable `link` and collection `all_links` were reused across
#     all three nesting levels (it only worked because `all_links[:]` copied
#     the list before rebinding) -- each level now has its own names;
#   * every urlopen() response was left unclosed (socket leak over many
#     requests) -- responses are now closed via `with`;
#   * the URL-join + find_all + loop pattern was duplicated three times.

BASE_URL = "http://zjj.sz.gov.cn/ris/bol/szfdc/"
PROJECT_URL = BASE_URL + "projectdetail.aspx?id=49916"
# Other project ids previously scraped: 49818, 49633, 49574


def _fetch_soup(url):
    """Fetch *url* and return its parsed BeautifulSoup tree.

    Uses `with` so the HTTP response is closed as soon as parsing is done.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response, 'lxml')


def _matching_links(soup, href_pattern):
    """Absolute URLs of every <a> whose href matches *href_pattern*."""
    anchors = soup.find_all("a", href=re.compile(href_pattern))
    return [BASE_URL + a.get("href") for a in anchors]


def _house_csv_row(house_soup, seq):
    """One CSV line for a house-detail page.

    Column order matches the original scrape: a zero-padded sequence
    number followed by the <td> cells at indices 1,3,9,11,7,15,17,19,13.
    """
    tds = house_soup.find_all('td')
    cells = [format(seq, '04d')]
    cells += [tds[i].text.strip() for i in (1, 3, 9, 11, 7, 15, 17, 19, 13)]
    return ",".join(cells)


count = 1
for building_url in _matching_links(_fetch_soup(PROJECT_URL), 'building'):
    for floor_url in _matching_links(_fetch_soup(building_url), 'building'):
        for house_url in _matching_links(_fetch_soup(floor_url), 'housedetail'):
            print(_house_csv_row(_fetch_soup(house_url), count))
            count += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment