A script to download Jukyo Jusho (residence indication address) data and load it into BigQuery.
Data source: 電子国土基本図(地名情報)「住居表示住所」
It also uses the officially published Chiiki Code (標準地域コード, standard area code) CSV, downloaded from soumu.go.jp inside the script (last update: 2019-03-25).
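Usage: the script dispatches its first argument to the shell function of the same name, so each step can be run independently. A sketch, assuming the script is saved as run.sh (the filename is not part of the original):

  bash run.sh download
  bash run.sh unzipAll
  bash run.sh preprocessAndUpload2
  bash run.sh loadToBq2
  bash run.sh processRaw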
#!/bin/bash

PROJECT_ID=moritani-sandbox-opendata
BUCKET=moritani-sandbox-opendata-japan-sg
REGION=asia-southeast1

OPENDATA_NAME=Jukyo_Jusho
OPENDATA_COUNTRY=Japan
OPENDATA_NAME=$(echo $OPENDATA_NAME | tr '[:upper:]' '[:lower:]')
OPENDATA_COUNTRY=$(echo $OPENDATA_COUNTRY | tr '[:upper:]' '[:lower:]')
BQ_DATASET=japan_${OPENDATA_NAME}

# bq --project_id ${PROJECT_ID} ls ${BQ_DATASET} \
#   || bq --project_id ${PROJECT_ID} mk -d --data_location=${REGION} ${BQ_DATASET}

# create the dataset if it does not exist yet
bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
CREATE SCHEMA IF NOT EXISTS ${BQ_DATASET}
OPTIONS(
  location="$REGION",
  labels=[
    ("country","$OPENDATA_COUNTRY")
  ]
)
EOSQL
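# to verify the dataset afterwards (a sketch, not part of the original flow):
#   bq --project_id ${PROJECT_ID} show --format=prettyjson ${BQ_DATASET}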
# create working directories (mkdir -p is a no-op when they already exist)
mkdir -p ./data/processed
# generate a list of URLs, one for each code in the official code list.
# besides building the list, this step:
# * converts the file from Shift JIS to UTF-8
# * extracts just the code column
# * zero-pads the code so that it is always five digits
# * builds the URL that serves the per-code jukyo jusho data
function genDownloadList {
  local codes=$(curl -s https://www.soumu.go.jp/main_content/000608358.csv | iconv -f SHIFT_JIS -t UTF-8 | cut -f 3 -d "," | grep -v tiiki)
  for code in $codes; do
    # force base 10 so that a code with leading zeros is not read as octal
    local code_padded=$(printf "%05d" "$((10#${code}))")
    if [ ! -f ./data/downloaded/saigai.gsi.go.jp/jusho/download/data/${code_padded}.zip ]; then
      echo https://saigai.gsi.go.jp/jusho/download/data/${code_padded}.zip
    fi
  done
}
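# example output of genDownloadList (first entries only; 01101 is Sapporo-shi
# Chuo-ku, shown purely as an illustration):
#   https://saigai.gsi.go.jp/jusho/download/data/01101.zip
#   https://saigai.gsi.go.jp/jusho/download/data/01102.zip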
# download in parallel with xargs.
# not all URLs will have data, as coverage across codes is partial.
function download {
  genDownloadList | xargs -n 1 -P 0 wget -q -c -x -r -P ./data/downloaded
}
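# note: "-P 0" lets xargs spawn as many wget processes as it can. if the
# server throttles or refuses connections under that load, a bounded variant
# (a sketch; 8 workers is an arbitrary choice, not from the original) is:
#   genDownloadList | xargs -n 1 -P 8 wget -q -c -x -r -P ./data/downloaded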
# unzip all downloaded files
function unzipAll {
  # collect into an array so that ${#files[@]} reports the real count
  local files=($(find ./data/downloaded -name "*.zip"))
  echo ${#files[@]} files
  for file in "${files[@]}" ; do
    local filename=${file##*/}
    local filename_prefix=${filename%.*}
    if [ ! -d ./data/unzipped/$filename_prefix ]; then
      unzip -o $file -d ./data/unzipped/
    else
      echo skipping $file already unzipped
    fi
  done
}
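# hypothetical helper (not part of the original pipeline): count the
# extracted csv files as a quick sanity check after unzipAll
function countUnzipped {
  find ./data/unzipped -name "*.csv" | wc -l
}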
# the unzipped content includes a .shp file and a .csv file per code.
# * the .csv alone is sufficient, so concatenate all files into one and add a header
# * upload the large .csv file to Cloud Storage
# note: only needed when targeting the Datasets API, which handles neither
# multi-file datasets nor header lines.
function preprocessAndUpload {
  cat $(find ./data/unzipped/ -name "*.csv") > ./data/processed/all.csv
  wc -l ./data/processed/all.csv
  # the Datasets API requires "long", so rename the original "lon" column
  echo city_code,jusho1,jusho2,jusho3,jcode1,jcode2,long,lat,seido > ./data/processed/all_with_header.csv
  cat ./data/processed/all.csv >> ./data/processed/all_with_header.csv
  gsutil cp -c ./data/processed/all_with_header.csv gs://${BUCKET}/${OPENDATA_NAME}/
}
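# to confirm the upload (a sketch): list the object and compare its size
# against the local file
#   gsutil ls -l gs://${BUCKET}/${OPENDATA_NAME}/all_with_header.csv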
# the unzipped content includes a .shp file and a .csv file per code.
# * the .csv alone is sufficient, so concatenate all files into one
# * for efficiency, split the single file into multiple files (1M lines each)
# * upload the files to Cloud Storage
function preprocessAndUpload2 {
  cat $(find ./data/unzipped/ -name "*.csv") > ./data/processed/all.csv
  wc -l ./data/processed/all.csv
  if [ -d ./data/processed/split ]; then
    rm -rf ./data/processed/split
  fi
  mkdir -p ./data/processed/split
  pushd ./data/processed/split
  split -d -l 1000000 ../all.csv ${OPENDATA_NAME}_
  popd
  for file in ./data/processed/split/* ; do
    mv -- ${file} ${file}.csv
  done
  gsutil -m rm gs://${BUCKET}/${OPENDATA_NAME}/split/*
  gsutil -m cp -z csv -c ./data/processed/split/* gs://${BUCKET}/${OPENDATA_NAME}/split
}
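# because of "-z csv" the split objects are stored gzip-encoded; to confirm
# (a sketch), inspect one object and look for "Content-Encoding: gzip":
#   gsutil stat gs://${BUCKET}/${OPENDATA_NAME}/split/${OPENDATA_NAME}_00.csv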
# emit the BigQuery JSON schema for the raw table
function writeTableSchema {
cat <<EOF
[
  {"name": "city_code", "type": "STRING", "mode": "REQUIRED"},
  {"name": "jusho1", "type": "STRING", "mode": "NULLABLE"},
  {"name": "jusho2", "type": "STRING", "mode": "NULLABLE"},
  {"name": "jusho3", "type": "STRING", "mode": "NULLABLE"},
  {"name": "jcode1", "type": "STRING", "mode": "NULLABLE"},
  {"name": "jcode2", "type": "STRING", "mode": "NULLABLE"},
  {"name": "lon", "type": "NUMERIC", "mode": "NULLABLE"},
  {"name": "lat", "type": "NUMERIC", "mode": "NULLABLE"},
  {"name": "seido", "type": "NUMERIC", "mode": "NULLABLE"}
]
EOF
}
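# the emitted JSON can be checked locally before loading (a sketch; assumes
# python3 is available, e.g. invoked via the argument dispatcher below):
#   writeTableSchema | python3 -m json.tool > /dev/null && echo schema ok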
# loads the single concatenated file into a BigQuery table.
# since this is one relatively large file, the load can be a little slow.
function loadToBq {
  local CSV_PATH=gs://${BUCKET}/${OPENDATA_NAME}/all_with_header.csv
  writeTableSchema > ./data/schema.json
  bq --project_id ${PROJECT_ID} load \
    --source_format CSV \
    --skip_leading_rows 1 \
    ${BQ_DATASET}.${OPENDATA_NAME}_all_raw ${CSV_PATH} ./data/schema.json
}
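# hypothetical helper (not in the original): row count of the loaded table,
# for comparison against the "wc -l" output from preprocessing
function countRawRows {
  bq --project_id ${PROJECT_ID} query --nouse_legacy_sql \
    "SELECT COUNT(*) AS n FROM ${BQ_DATASET}.${OPENDATA_NAME}_all_raw"
}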
# loads the split files into a BigQuery table.
# since the load is parallelized, it should complete fairly quickly.
function loadToBq2 {
  local CSV_PATH=gs://${BUCKET}/${OPENDATA_NAME}/split/*
  local TABLE_OUTPUT=${OPENDATA_NAME}_raw
  #writeTableSchema
  # bq --project_id ${PROJECT_ID} load --replace --source_format CSV \
  #   ${BQ_DATASET}.${TABLE_OUTPUT} "${CSV_PATH}" ./data/schema.json
  # load the table from the csv files stored on gcs
  bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
LOAD DATA OVERWRITE ${BQ_DATASET}.${TABLE_OUTPUT}
(
  city_code STRING NOT NULL,
  jusho1 STRING,
  jusho2 STRING,
  jusho3 STRING,
  jcode1 STRING,
  jcode2 STRING,
  lon NUMERIC,
  lat NUMERIC,
  seido NUMERIC
)
OPTIONS(
  description="電子国土基本図(地名情報)「住居表示住所」 https://www.gsi.go.jp/kihonjohochousa/jukyo_jusho.html",
  labels=[
    ("country","$OPENDATA_COUNTRY")
  ]
)
FROM FILES (
  format="CSV",
  uris=[
    "${CSV_PATH}"
  ]
)
EOSQL
  # bq --project_id ${PROJECT_ID} update \
  #   --description "電子国土基本図(地名情報)「住居表示住所」
  # source : https://www.gsi.go.jp/kihonjohochousa/jukyo_jusho.html
  # usage : https://www.gsi.go.jp/LAW/2930-index.html" \
  #   ${BQ_DATASET}.${TABLE_OUTPUT}
}
# show the schema of the raw table
function showSchema {
  bq --project_id ${PROJECT_ID} show --schema ${BQ_DATASET}.${OPENDATA_NAME}_raw
}
# enrich the raw data by joining against the chiiki code table
function processRaw {
  local TABLE_OUTPUT=${OPENDATA_NAME}_processed
  local TABLE_RAW=${OPENDATA_NAME}_raw
  bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
CREATE OR REPLACE TABLE ${BQ_DATASET}.${TABLE_OUTPUT}
-- the upper bound of RANGE_BUCKET is exclusive, so 48 keeps ken_code 47 (Okinawa) partitioned
PARTITION BY RANGE_BUCKET(ken_code, GENERATE_ARRAY(1, 48, 1))
CLUSTER BY city_code, jusho1, jusho2, jusho3
OPTIONS (
  description="enriched jukyo jusho data across japan",
  labels=[
    ("country", "japan")
  ]
)
AS (
SELECT
  city_code,
  codes.ken_code,
  codes.ken_code_2,
  codes.sityouson_code,
  codes.sityouson_code_3,
  codes.ken_name, codes.ken_name_yomigana,
  codes.sityouson_name1, codes.sityouson_name1_yomigana,
  # codes.sityouson_name2, codes.sityouson_name2_yomigana,
  codes.sityouson_name3, codes.sityouson_name3_yomigana,
  jusho1,
  jusho2,
  jusho3,
  -- split "1-2-3" style jusho3 values into up to three components
  IF ( CONTAINS_SUBSTR(jusho3,"-"), SPLIT(jusho3,"-")[OFFSET(0)], jusho3 ) AS jusho3_1,
  IF ( CONTAINS_SUBSTR(jusho3,"-") AND ARRAY_LENGTH(SPLIT(jusho3,"-"))>1, SPLIT(jusho3,"-")[OFFSET(1)], NULL ) AS jusho3_2,
  IF ( CONTAINS_SUBSTR(jusho3,"-") AND ARRAY_LENGTH(SPLIT(jusho3,"-"))>2, SPLIT(jusho3,"-")[OFFSET(2)], NULL ) AS jusho3_3,
  lon AS long,
  lat,
  seido,
  ST_GEOGPOINT(lon,lat) AS geometry
FROM
  ${BQ_DATASET}.${TABLE_RAW} AS jukyo_jusho
LEFT JOIN
  japan_chiiki_code.codes codes
ON
  jukyo_jusho.city_code=codes.tiiki_code_5
)
EOSQL
}
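# example (a sketch): number of addresses per prefecture in the processed table
#   bq --project_id ${PROJECT_ID} query --nouse_legacy_sql \
#     "SELECT ken_name, COUNT(*) AS n FROM ${BQ_DATASET}.${OPENDATA_NAME}_processed GROUP BY ken_name ORDER BY n DESC"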
## process data only for Tokyo (ken_code 13, i.e. city_code 13XXX)
function processTokyo {
  bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
CREATE OR REPLACE TABLE ${BQ_DATASET}.${OPENDATA_NAME}_processed_tokyo
AS (
  SELECT * FROM ${BQ_DATASET}.${OPENDATA_NAME}_processed
  WHERE ken_code=13
)
EOSQL
}

# dispatch: run the function named by the first command-line argument
"$@"