Skip to content

Instantly share code, notes, and snippets.

@ladyrassilon
Created June 15, 2016 09:54
Show Gist options
  • Select an option

  • Save ladyrassilon/9549da46fdbb91ddbc775d7eb522bfe8 to your computer and use it in GitHub Desktop.

Select an option

Save ladyrassilon/9549da46fdbb91ddbc775d7eb522bfe8 to your computer and use it in GitHub Desktop.
Import Companies House basic bulk data into a compressed HDF5 file
import datetime
import requests
from bs4 import BeautifulSoup
import requests
import pandas as pd
import zipfile
import StringIO
import numpy as np
def download_companies_house(
        url="http://download.companieshouse.gov.uk/en_output.html",
        output_file="company_data_dual_index.h5"):
    """Download the Companies House bulk CSV archives and store them as HDF5.

    The page at ``url`` is scanned for links whose target ends in ``zip``.
    Each archive is downloaded, read with ``pandas.read_csv`` (the first two
    CSV columns become the index), and all parts are concatenated into one
    DataFrame. Column names are stripped of surrounding whitespace, the
    ``RegAddress.POBox`` column is cast to ``str``, and ``IncorporationDate``
    is normalised to ``datetime64[ns]``. The result is written to
    ``output_file`` under the key ``company_data`` in "table" format with
    blosc compression at level 9.

    Args:
        url: Address of the HTML page listing the downloadable archives.
            Archive links are resolved relative to this address.
        output_file: Path of the HDF5 file to create (overwritten if present).

    Returns:
        The combined ``pandas.DataFrame`` that was written to ``output_file``.

    Raises:
        requests.HTTPError: If the listing page or an archive responds with
            an HTTP error status.
        ValueError: If the listing page contains no links ending in ``zip``.
    """
    # Function-scope imports keep this block runnable on Python 2 and 3
    # without relying on the Python-2-only ``StringIO`` module.
    import io
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2
        from urlparse import urljoin

    date_columns = [
        'DissolutionDate',
        'IncorporationDate',
        'Accounts.NextDueDate',
        'Accounts.LastMadeUpDate',
        'Returns.NextDueDate',
        'Returns.LastMadeUpDate',
    ]

    download_page = requests.get(url)
    # Fail early instead of parsing an HTTP error page as the link listing.
    download_page.raise_for_status()
    soup = BeautifulSoup(download_page.content, "html.parser")

    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError on link["href"].
    filenames = [
        link["href"]
        for link in soup.find_all("a", href=True)
        if link["href"].endswith("zip")
    ]
    if not filenames:
        raise ValueError("no zip download links found at {}".format(url))

    frames = []
    for fn in filenames:
        # Resolve the link against the listing page so a non-default ``url``
        # is honoured; for the default page this yields the same address as
        # the previously hard-coded host.
        response = requests.get(urljoin(url, fn))
        response.raise_for_status()
        frame = pd.read_csv(
            io.BytesIO(response.content),  # the archive is binary data
            index_col=[0, 1],
            compression="zip",
            parse_dates=date_columns,
            infer_datetime_format=True,
            dayfirst=True,
        )
        frames.append(frame)

    company_data = pd.concat(frames)
    company_data.rename(columns=lambda x: x.strip(), inplace=True)
    company_data['RegAddress.POBox'] = company_data['RegAddress.POBox'].astype('str')

    # Normalise to datetime64[ns]. Values that cannot be represented (for
    # example dates outside the nanosecond-timestamp range) become NaT.
    # The earlier per-value helper compared exact types, so pandas Timestamp
    # objects (a datetime subclass) were discarded along with the bad values.
    company_data["IncorporationDate"] = pd.to_datetime(
        company_data["IncorporationDate"], errors="coerce", dayfirst=True)

    company_data.to_hdf(
        output_file,
        key='company_data',
        mode='w',
        format='table',
        complevel=9,
        complib="blosc",
    )
    return company_data
if __name__ == "__main__":
    # Run as a script: download with the default listing URL and write to
    # the default output file.
    download_companies_house()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment