Created
June 15, 2016 09:54
-
-
Save ladyrassilon/9549da46fdbb91ddbc775d7eb522bfe8 to your computer and use it in GitHub Desktop.
Import Companies House basic bulk data into compressed hdf5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import datetime | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import requests | |
| import pandas as pd | |
| import zipfile | |
| import StringIO | |
| import numpy as np | |
def download_companies_house(
        url="http://download.companieshouse.gov.uk/en_output.html",
        output_file="company_data_dual_index.h5"):
    """Download the Companies House bulk CSV archives and store them as HDF5.

    Fetches the HTML page at ``url``, follows every anchor whose ``href``
    ends in ``zip``, reads each archive as a zip-compressed CSV into a
    DataFrame indexed on its first two columns, concatenates the frames and
    writes the result to ``output_file`` under the key ``company_data``.

    Args:
        url: Address of the download index page. Archive links found on the
            page are resolved relative to this address.
        output_file: Path of the HDF5 file to write (opened with mode ``w``,
            so an existing file is replaced).

    Returns:
        The concatenated ``pandas.DataFrame`` that was written to disk.

    Raises:
        requests.HTTPError: If the index page or an archive request comes
            back with an error status.
        ValueError: If the index page contains no links ending in ``zip``.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import block. io.BytesIO exists on Python 2 and 3, unlike
    # the Python-2-only StringIO module, and matches the binary payload.
    import io
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    date_columns = [
        'DissolutionDate',
        'IncorporationDate',
        'Accounts.NextDueDate',
        'Accounts.LastMadeUpDate',
        'Returns.NextDueDate',
        'Returns.LastMadeUpDate',
    ]

    def mangle_date(date):
        """Return ``date`` as ``numpy.datetime64``, or NaN if it is not a date."""
        if isinstance(date, np.datetime64):
            return date
        # isinstance (not an exact type comparison) so that datetime
        # subclasses such as pandas.Timestamp are converted instead of being
        # discarded as NaN. NaT is also a datetime subclass, hence the
        # explicit null check.
        if isinstance(date, datetime.datetime) and not pd.isnull(date):
            return np.datetime64(date.isoformat())
        return np.nan

    index_page = requests.get(url)
    # Fail here with a clear HTTP error rather than parsing an error page.
    index_page.raise_for_status()
    soup = BeautifulSoup(index_page.content, "html.parser")

    # href=True skips anchors that have no href attribute; indexing those
    # with link["href"] would raise KeyError.
    archive_hrefs = [
        link["href"]
        for link in soup.find_all("a", href=True)
        if link["href"].endswith("zip")
    ]
    if not archive_hrefs:
        raise ValueError("No zip download links found at {}".format(url))

    frames = []
    for href in archive_hrefs:
        # Resolve against the index page so a non-default ``url`` is honoured
        # and absolute hrefs are not mangled.
        response = requests.get(urljoin(url, href))
        response.raise_for_status()
        frame = pd.read_csv(
            io.BytesIO(response.content),
            index_col=[0, 1],
            compression="zip",
            parse_dates=date_columns,
            infer_datetime_format=True,
            dayfirst=True,
        )
        frames.append(frame)

    company_data = pd.concat(frames)
    # Remove stray whitespace around the column names before they are used
    # as lookup keys below.
    company_data.rename(columns=lambda name: name.strip(), inplace=True)
    company_data['RegAddress.POBox'] = company_data['RegAddress.POBox'].astype('str')

    # Force a uniform datetime64[ns] column; values that are not dates
    # become NaN and therefore NaT after the cast.
    incorporation = company_data["IncorporationDate"].apply(mangle_date)
    company_data["IncorporationDate"] = incorporation.astype("datetime64[ns]")

    company_data.to_hdf(output_file, 'company_data', mode='w', format='table', complevel=9, complib="blosc")
    return company_data
# Script entry point: when this file is executed directly, run the download
# with its default index URL and output path.
if __name__ == "__main__":
    download_companies_house()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment