Last active
April 26, 2021 06:24
-
-
Save georgeblck/f6d57905436517153f9f8926fc42fd7e to your computer and use it in GitHub Desktop.
Download the "Windzone" (wind zone) for all German zip codes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "4f9444e2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import pandas as pd\n", | |
"import bs4\n", | |
"import random\n", | |
"import pathlib\n", | |
"import numpy as np\n", | |
"\n", | |
"def read_windzone(plz):\n", | |
" # Download\n", | |
" temp_text = requests.get(base_url + plz).text\n", | |
" # Parse\n", | |
" temp_return = bs4.BeautifulSoup(temp_text,\"html.parser\").select(\"h2 span\")\n", | |
" # Return Windzone if there is one\n", | |
" if len(temp_return) == 2:\n", | |
" return plz, temp_return[1].text\n", | |
" else:\n", | |
" return plz, '0'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "bdd04c14", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"base_url = 'https://www.dehn.de/windzonen/?zip='\n", | |
"windzonen_path = \"windzonen.csv\"\n", | |
"\n", | |
"# Download PLZ (postal code) data from GitHub (unofficial source)\n", | |
"plz_full = pd.read_csv('https://raw.githubusercontent.com/zauberware/postal-codes-json-xml-csv/master/data/DE/zipcodes.de.csv',\n", | |
" dtype={'zipcode': str})\n", | |
"\n", | |
"# Get unique PLZ codes and shuffle them (fixed seed, so the order is reproducible)\n", | |
"plz_data = plz_full.drop_duplicates(subset='zipcode')[['zipcode']].sample(frac=1,random_state=1888).reset_index(drop=True)\n", | |
"\n", | |
"# If the output file doesn't exist yet, write a header-only CSV that results will be appended to\n", | |
"if not pathlib.Path(windzonen_path).exists():\n", | |
" plz_out = pd.DataFrame(columns = [\"zipcode\", \"windzone\"])\n", | |
" plz_out.to_csv(windzonen_path, sep = \";\", index = False)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "7c2db3e0", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read the already-downloaded windzonen so those zip codes can be skipped\n", | |
"existing_plz = pd.read_csv(windzonen_path, sep = \";\", dtype={'zipcode': str})\n", | |
"\n", | |
"# Download windzone\n", | |
"for index, row in plz_data.iterrows():\n", | |
" print(index)\n", | |
" print(row[\"zipcode\"])\n", | |
" if row[\"zipcode\"] in existing_plz.zipcode.values:\n", | |
" print(\"exists already\")\n", | |
" continue\n", | |
" temp_windzone = pd.DataFrame([read_windzone(row[\"zipcode\"])], columns = [\"zipcode\", \"windzone\"] )\n", | |
" temp_windzone.to_csv(windzonen_path, mode = 'a', header = False, sep = \";\", index = False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "c4d6ca14", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"existing_plz = pd.read_csv(windzonen_path, sep = \";\", dtype={'zipcode': str})\n", | |
"# Check distribution of windzonen\n", | |
"print(existing_plz.windzone.value_counts(dropna=False))\n", | |
"# Replace 0 (no Windzone found) with np.nan and rename the zipcode column to plz\n", | |
"existing_plz.windzone.replace(0.0,np.nan,inplace=True)\n", | |
"existing_plz.rename(columns = {'zipcode':'plz'}, inplace = True)\n", | |
"print(existing_plz.windzone.value_counts(dropna=False))\n", | |
"# Save final version\n", | |
"existing_plz.to_csv(\"plz_windzone.csv\", header = True, sep = \";\", index = False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment