Created
April 27, 2019 11:15
-
-
Save gaiar/981e4b848f81627639373b5299508fac to your computer and use it in GitHub Desktop.
Pulling the list of all birds inhabiting Berlin from berlin.de website
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import csv | |
import random | |
# Pool of desktop browser User-Agent strings. One entry is chosen at random
# on each run so the request identifies itself as a regular browser.
user_agent_list = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer (MSIE / Trident)
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
]
user_agent = random.choice(user_agent_list)

# HTTP request headers carrying the randomly chosen User-Agent.
headers = {'User-Agent': user_agent}
# Page on berlin.de that is scraped for the bird entries.
URL = "https://www.berlin.de/ba-charlottenburg-wilmersdorf/verwaltung/aemter/umwelt-und-naturschutzamt/naturschutz/pflanzen-artenschutz/artikel.112976.php"

# The headers must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so passing the dict positionally would put the
# User-Agent into the query string and never into the request headers.
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(URL, headers=headers, timeout=30)
# Stop on an HTTP error status rather than parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

# Remove the header elements from the parsed tree before collecting the
# sections (presumably so header links are not picked up by the per-entry
# lookups further down -- confirm against the page markup).
header_links = soup.find_all(class_='html5-header header')
for header_link in header_links:
    header_link.decompose()

# One element per text/image section of the page; each is handled as one entry.
birds_list = soup.find_all(class_='html5-section block modul-text_bild float')
# Write one CSV row per entry in birds_list.
# newline='' is required by the csv module so it controls line endings itself
# (otherwise blank rows appear on Windows); the explicit UTF-8 encoding keeps
# non-ASCII characters in the names independent of the system locale.
with open('berlin-birds.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Name', 'Latin name', 'Image URL'])
    for bird in birds_list:
        try:
            # The href is site-relative, so prefix the host to make it absolute.
            link = "https://www.berlin.de" + bird.div.a['href']
            # Commas are replaced with spaces in both text fields, as the
            # original script did, so the produced values stay the same.
            name = bird.find(class_="textile").p.a.get_text().replace(",", " ")
            # The Latin name is read from the third child node of the first
            # paragraph in the section body.
            latin_name = bird.find(class_="html5-section body").div.div.p.contents[2].replace(",", " ")
        except (AttributeError, IndexError, KeyError, TypeError) as e:
            # A section that lacks the expected structure (missing tag,
            # missing href, too few child nodes) is reported and skipped.
            print(e)
        else:
            writer.writerow([name, latin_name, link])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment