Skip to content

Instantly share code, notes, and snippets.

@georgeblck
Last active April 26, 2021 06:24
Show Gist options
  • Save georgeblck/f6d57905436517153f9f8926fc42fd7e to your computer and use it in GitHub Desktop.
Save georgeblck/f6d57905436517153f9f8926fc42fd7e to your computer and use it in GitHub Desktop.
Download "Windzone" for all German zipcodes
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4f9444e2",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd\n",
"import bs4\n",
"import random\n",
"import pathlib\n",
"import numpy as np\n",
"\n",
"def read_windzone(plz, timeout=30):\n",
"    \"\"\"Fetch the page for one zipcode and extract its Windzone entry.\n",
"\n",
"    Requests ``base_url + plz`` (``base_url`` is a notebook-level variable that\n",
"    must be defined before the first call) and selects all ``h2 span``\n",
"    elements from the returned HTML.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    plz : str\n",
"        Zipcode; appended verbatim to ``base_url``.\n",
"    timeout : float, optional\n",
"        Seconds to wait for the server before giving up (default 30).\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(plz, windzone)``. ``windzone`` is the text of the second ``h2 span``\n",
"        element when the page contains exactly two of them (presumably the\n",
"        wind zone label -- verify against the live page), otherwise ``'0'``.\n",
"\n",
"    Raises\n",
"    ------\n",
"    requests.RequestException\n",
"        On connection errors, timeouts and HTTP error statuses.\n",
"    \"\"\"\n",
"    # Download; without a timeout a stalled connection would block the whole run\n",
"    response = requests.get(base_url + plz, timeout=timeout)\n",
"    # Fail loudly on HTTP errors: parsing an error page would yield '0', and a\n",
"    # caller that caches results would then never retry this zipcode\n",
"    response.raise_for_status()\n",
"    # Parse\n",
"    spans = bs4.BeautifulSoup(response.text, \"html.parser\").select(\"h2 span\")\n",
"    # Return Windzone if there is one\n",
"    if len(spans) == 2:\n",
"        return plz, spans[1].text\n",
"    return plz, '0'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdd04c14",
"metadata": {},
"outputs": [],
"source": [
"# Lookup endpoint (the zipcode gets appended) and the CSV that collects the results\n",
"base_url = 'https://www.dehn.de/windzonen/?zip='\n",
"windzonen_path = \"windzonen.csv\"\n",
"\n",
"# Unofficial zipcode list hosted on GitHub; zipcodes are kept as strings\n",
"# instead of letting pandas parse them as numbers\n",
"plz_full = pd.read_csv(\n",
"    'https://raw.githubusercontent.com/zauberware/postal-codes-json-xml-csv/master/data/DE/zipcodes.de.csv',\n",
"    dtype={'zipcode': str},\n",
")\n",
"\n",
"# One row per distinct zipcode, shuffled reproducibly (fixed random_state)\n",
"plz_unique = plz_full.drop_duplicates(subset='zipcode')[['zipcode']]\n",
"plz_shuffled = plz_unique.sample(frac=1, random_state=1888)\n",
"plz_data = plz_shuffled.reset_index(drop=True)\n",
"\n",
"# On the first run, create the results file with only a header row;\n",
"# later cells append one line per zipcode to it\n",
"results_file = pathlib.Path(windzonen_path)\n",
"if not results_file.exists():\n",
"    plz_out = pd.DataFrame(columns=[\"zipcode\", \"windzone\"])\n",
"    plz_out.to_csv(windzonen_path, sep=\";\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c2db3e0",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Read existing windzonen so that a restarted run skips finished zipcodes\n",
"existing_plz = pd.read_csv(windzonen_path, sep=\";\", dtype={'zipcode': str})\n",
"# A set gives constant-time membership checks; scanning the column's array\n",
"# for every zipcode would be quadratic over the whole run\n",
"done_plz = set(existing_plz[\"zipcode\"])\n",
"\n",
"# Download windzone\n",
"for index, zipcode in plz_data[\"zipcode\"].items():\n",
"    print(index)\n",
"    print(zipcode)\n",
"    if zipcode in done_plz:\n",
"        print(\"exists already\")\n",
"        continue\n",
"    temp_windzone = pd.DataFrame([read_windzone(zipcode)], columns=[\"zipcode\", \"windzone\"])\n",
"    # Append right away so an interrupted run keeps everything downloaded so far\n",
"    temp_windzone.to_csv(windzonen_path, mode='a', header=False, sep=\";\", index=False)\n",
"    done_plz.add(zipcode)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4d6ca14",
"metadata": {},
"outputs": [],
"source": [
"existing_plz = pd.read_csv(windzonen_path, sep=\";\", dtype={'zipcode': str})\n",
"# Check distribution of windzonen\n",
"print(existing_plz.windzone.value_counts(dropna=False))\n",
"# Replace 0 with np.nan and rename column.\n",
"# 0 is the placeholder read_windzone returns when no Windzone was found; it is\n",
"# matched both as a number and as the string '0' because the dtype pandas\n",
"# infers for the column depends on the other values in it.\n",
"# Explicit assignment is used because an inplace replace() on the column\n",
"# accessor is a chained operation that may not write back to the frame.\n",
"existing_plz[\"windzone\"] = existing_plz[\"windzone\"].replace([0, \"0\"], np.nan)\n",
"existing_plz = existing_plz.rename(columns={'zipcode': 'plz'})\n",
"print(existing_plz.windzone.value_counts(dropna=False))\n",
"# Save final version\n",
"existing_plz.to_csv(\"plz_windzone.csv\", header=True, sep=\";\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment