buswedg · April 14, 2020 19:47
diff --git a/abs_economic_series_pull.ipynb b/abs_economic_series_pull.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "import shutil\n",
    "import urllib.request as request\n",
    "from contextlib import closing\n",
    "\n",
    "from pandasdmx import Request\n",
    "import xmltodict\n",
    "\n",
    "from datetime import date\n",
    "\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_exists(path):\n",
    "\n",
    "    return os.path.exists(path)\n",
    "\n",
    "\n",
    "def check_or_create_dir(directory):\n",
    "\n",
    "    if not check_exists(directory):\n",
    "        os.makedirs(directory)\n",
    "        \n",
    "        \n",
    "def find_from(s, first):\n",
    "\n",
    "    try:\n",
    "        start = s.index(first) + len(first)\n",
    "\n",
    "        return s[start:]\n",
    "\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        \n",
    "        \n",
    "def copy_url_to_file(url, tmp_file):\n",
    "\n",
    "    try:\n",
    "\n",
    "        with closing(request.urlopen(url)) as r:\n",
    "\n",
    "            with open(tmp_file, 'wb') as f:\n",
    "                shutil.copyfileobj(r, f)\n",
    "\n",
    "    except Exception as e:\n",
    "        print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_dsd(df, dsd_url):\n",
    "    copy_url_to_file(dsd_url, '_temp')\n",
    "\n",
    "    with open('_temp') as f:\n",
    "        dsd = xmltodict.parse(f.read().replace('&', '&'))\n",
    "\n",
    "    os.remove('_temp')\n",
    "\n",
    "    for i, col in enumerate(df.columns):\n",
    "\n",
    "        if col not in ['Time Period', 0]:\n",
    "            dic_val = {}\n",
    "\n",
    "            ls_codes = dsd['message:Structure']['message:CodeLists']['CodeList'][i]['Code']\n",
    "            ls_codes = ls_codes if type(ls_codes) is list else [ls_codes]\n",
    "\n",
    "            for code in ls_codes:\n",
    "                dic_val[code['@value']] = code['Description'][0]['#text']\n",
    "\n",
    "            df = df.replace({col: dic_val})\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# http://stat.data.abs.gov.au/Index.aspx\n",
    "\n",
    "ls_query = []\n",
    "\n",
    "series_cat = \"Economy\"\n",
    "dtype_cat = \"Business Indicators\"\n",
    "dtype_subcat = \"Business Indicators\"\n",
    "dtype_desc = \"Inventories, Current Prices and Chain Volume Measures\"\n",
    "dtype_abb = \"QBIS\"\n",
    "res_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetData/QBIS/50+70.B+C+D+F+G+H+TOTAL.0.99.10+20+30.Q/\"\n",
    "dsd_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetDataStructure/QBIS\"\n",
    "ls_query.append([series_cat, dtype_cat, dtype_subcat, dtype_desc, dtype_abb, res_url, dsd_url])\n",
    "\n",
    "series_cat = \"Health\"\n",
    "dtype_cat = \"Health Conditions and Risk Factors\"\n",
    "dtype_subcat = \"Apparent Consumption of Alcohol, Australia\"\n",
    "dtype_desc = \"Pure Alcohol Available for Consumption\"\n",
    "dtype_abb = \"ALC\"\n",
    "res_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetData/ALC/1.1+2.1+2+3+5+4.6+10+11+12+15+14.A/\"\n",
    "dsd_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetDataStructure/ALC\"\n",
    "ls_query.append([series_cat, dtype_cat, dtype_subcat, dtype_desc, dtype_abb, res_url, dsd_url])\n",
    "\n",
    "\n",
    "df_query = pd.DataFrame(ls_query, columns=['series_cat', 'dtype_cat', 'dtype_subcat', 'dtype_desc', 'dtype_abb', 'res_url', 'dsd_url'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index, row in df_query.iterrows():\n",
    "\n",
    "    agency_code = 'ABS'\n",
    "\n",
    "    ABS = Request(agency_code)\n",
    "    ABS.client.config['timeout'] = 100000\n",
    "\n",
    "    res_url = row['dtype_abb'] + find_from(row['res_url'], row['dtype_abb'])\n",
    "\n",
    "    data_response = ABS.data(resource_id=res_url)\n",
    "\n",
    "    df = data_response.write().unstack().reset_index()\n",
    "    df = apply_dsd(df, row['dsd_url'])\n",
    "    \n",
    "    data_file = row['dtype_abb'] + \".csv.gz\"\n",
    "    \n",
    "    df.to_csv(data_file, sep=';', quotechar='\"', date_format='%Y-%m-%d %H:%M:%S.%f', compression='gzip', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import sys\n",
	"\n",
	"import shutil\n",
	"import urllib.request as request\n",
	"from contextlib import closing\n",
	"\n",
	"from pandasdmx import Request\n",
	"import xmltodict\n",
	"\n",
	"from datetime import date\n",
	"\n",
	"import pandas as pd\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def check_exists(path):\n",
	"\n",
	" return os.path.exists(path)\n",
	"\n",
	"\n",
	"def check_or_create_dir(directory):\n",
	"\n",
	" if not check_exists(directory):\n",
	" os.makedirs(directory)\n",
	" \n",
	" \n",
	"def find_from(s, first):\n",
	"\n",
	" try:\n",
	" start = s.index(first) + len(first)\n",
	"\n",
	" return s[start:]\n",
	"\n",
	" except Exception as e:\n",
	" print(e)\n",
	" \n",
	" \n",
	"def copy_url_to_file(url, tmp_file):\n",
	"\n",
	" try:\n",
	"\n",
	" with closing(request.urlopen(url)) as r:\n",
	"\n",
	" with open(tmp_file, 'wb') as f:\n",
	" shutil.copyfileobj(r, f)\n",
	"\n",
	" except Exception as e:\n",
	" print(e)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"def apply_dsd(df, dsd_url):\n",
	" copy_url_to_file(dsd_url, '_temp')\n",
	"\n",
	" with open('_temp') as f:\n",
	" dsd = xmltodict.parse(f.read().replace('&', '&'))\n",
	"\n",
	" os.remove('_temp')\n",
	"\n",
	" for i, col in enumerate(df.columns):\n",
	"\n",
	" if col not in ['Time Period', 0]:\n",
	" dic_val = {}\n",
	"\n",
	" ls_codes = dsd['message:Structure']['message:CodeLists']['CodeList'][i]['Code']\n",
	" ls_codes = ls_codes if type(ls_codes) is list else [ls_codes]\n",
	"\n",
	" for code in ls_codes:\n",
	" dic_val[code['@value']] = code['Description'][0]['#text']\n",
	"\n",
	" df = df.replace({col: dic_val})\n",
	"\n",
	" return df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# http://stat.data.abs.gov.au/Index.aspx\n",
	"\n",
	"ls_query = []\n",
	"\n",
	"series_cat = \"Economy\"\n",
	"dtype_cat = \"Business Indicators\"\n",
	"dtype_subcat = \"Business Indicators\"\n",
	"dtype_desc = \"Inventories, Current Prices and Chain Volume Measures\"\n",
	"dtype_abb = \"QBIS\"\n",
	"res_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetData/QBIS/50+70.B+C+D+F+G+H+TOTAL.0.99.10+20+30.Q/\"\n",
	"dsd_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetDataStructure/QBIS\"\n",
	"ls_query.append([series_cat, dtype_cat, dtype_subcat, dtype_desc, dtype_abb, res_url, dsd_url])\n",
	"\n",
	"series_cat = \"Health\"\n",
	"dtype_cat = \"Health Conditions and Risk Factors\"\n",
	"dtype_subcat = \"Apparent Consumption of Alcohol, Australia\"\n",
	"dtype_desc = \"Pure Alcohol Available for Consumption\"\n",
	"dtype_abb = \"ALC\"\n",
	"res_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetData/ALC/1.1+2.1+2+3+5+4.6+10+11+12+15+14.A/\"\n",
	"dsd_url = \"http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetDataStructure/ALC\"\n",
	"ls_query.append([series_cat, dtype_cat, dtype_subcat, dtype_desc, dtype_abb, res_url, dsd_url])\n",
	"\n",
	"\n",
	"df_query = pd.DataFrame(ls_query, columns=['series_cat', 'dtype_cat', 'dtype_subcat', 'dtype_desc', 'dtype_abb', 'res_url', 'dsd_url'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"for index, row in df_query.iterrows():\n",
	"\n",
	" agency_code = 'ABS'\n",
	"\n",
	" ABS = Request(agency_code)\n",
	" ABS.client.config['timeout'] = 100000\n",
	"\n",
	" res_url = row['dtype_abb'] + find_from(row['res_url'], row['dtype_abb'])\n",
	"\n",
	" data_response = ABS.data(resource_id=res_url)\n",
	"\n",
	" df = data_response.write().unstack().reset_index()\n",
	" df = apply_dsd(df, row['dsd_url'])\n",
	" \n",
	" data_file = row['dtype_abb'] + \".csv.gz\"\n",
	" \n",
	" df.to_csv(data_file, sep=';', quotechar='\"', date_format='%Y-%m-%d %H:%M:%S.%f', compression='gzip', index=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}