Skip to content

Instantly share code, notes, and snippets.

@holgerd77
Forked from stefanw/get_farmsubsidy_data.sh
Last active August 29, 2015 14:01
Show Gist options
  • Save holgerd77/95813c743f71cd7fafa7 to your computer and use it in GitHub Desktop.
Save holgerd77/95813c743f71cd7fafa7 to your computer and use it in GitHub Desktop.
Fixes for Farmsubsidy data files
# Fixes for Farmsubsidy data
# Order of countries is taken form the website:
# http://farmsubsidy.openspending.org/countries/
set -x verbose
#wget -O AT.zip "http://data.farmsubsidy.org/web/at20130501.zip"
#unzip -d AT AT.zip
# Generic fixes/helpers
# HTML encoding fix
sed 's/\&ouml;/ö/g; s/&Ouml;/Ö/g; s/&uuml;/ü/g; s/&Uuml;/Ü/g; s/&auml;/ä/g; s/&Auml;/Ä/g; s/&szlig;/ß/g; s/&quot;/"/g; s/&lt;/</g; s/&gt;/>/g; s/&amp;/\&/g; s/&#[0-9]*;//g' [COUNTRY_CODE]/recipient_bad.txt > [COUNTRY_CODE]/recipient.txt
# Finding DOS formatted files
find DK -type f -print0 | xargs -0 grep -l `printf '\r\n'`
# AT (Austria)
# 2013 Website : Done
# 2013 Data : Done
# HTML coded german umlaute (like "&ouml") leading to wrong column numbers in payment.txt
mv AT/recipient.txt AT/recipient_bad.txt
# Apply generic HTML encoding fix
# BE (Belgium)
# 2013 Website : Done
# 2013 Data : Done
# Some double entries in recipient.txt file
mv BE/recipient.txt BE/recipient_bad.txt
cat BE/recipient_bad.txt | uniq > BE/recipient.txt
# BG (Bulgaria)
# 2013 Website : Done
# 2013 Data : Done
#CY (Cyprus)
# 2013 Website : Done
# 2013 Data : Done
# Tried, went through without errors but no data shown!
# Recipients in DB but no Payments or Schemes
# Add english name in scheme.txt as national language name where entry is missing
# Add some missing globalPaymentIds in payment.txt
mv CY/scheme.txt CY/scheme_bad.txt
sed 's/^\([^;]*\);;\([^;]*\);/\1;\2;\2;/g' CY/scheme_bad.txt > CY/scheme.txt
mv CY/payment.txt CY/payment_bad.txt
sed 's/^\([^;]*\);;/\1;"CY\1";/g' CY/payment_bad.txt > CY/payment.txt
#CZ (Czech Republic)
# 2013 Website : Done
# 2013 Data : Done
# Remove 2014 entries misleading whole DB
mv CZ/payment.txt CZ/payment_bad.txt
awk < CZ/payment_bad.txt -F\; ' ($8 != "\"2014\""){ print $0 }' > CZ/payment.txt
# DK (Denmark)
# 2013 Website : Done
# 2013 Data : Done
# (ignored: negative payments, to be further investigated!)
# payment.txt ---
# payment.txt file coming in two parts, concatenating together
dos2unix DK/payment1.txt DK/payment.txt
tail -n +2 DK/payment1.txt >> DK/payment.txt
# recipient.txt ---
# Some recipients represent empty lat, long values at the end of column as empty string, replacing
cat DK/recipient.txt | dos2unix > DK/recipient_bad.txt
sed 's/;"";""$/;;/g' DK/recipient_bad.txt > DK/recipient.txt
# scheme.txt ---
cat DK/scheme.txt | dos2unix > DK/scheme_bad.txt
# Removing integer id from beginning of lines
sed 's/^[^;]*;\(.*\)/\1/g' DK/scheme_bad.txt > DK/scheme_bad2.txt
# Adding missing "DK" at end of lines
sed 's/\;$/\;"DK"/g' DK/scheme_bad2.txt > DK/scheme.txt
# EE (Estonia)
# 2013 Website : Done
# 2013 Data : Done
# No modification needed.
# FI (Finland)
# 2013 Website : Data missing (only raw.txt recipient file)
# 2013 Data : -
# Coming with a raw.txt recipient file, not clear yet how to procede...
# FR (France)
# 2013 Website : Done
# 2013 Data : Done
# DE (Germany)
# 2013 Website : Done
# 2013 Data : Done
# GR (Greece)
# 2013 Website : Data missing
# 2013 Data : -
# HU (Hungary)
# 2013 Website : Done
# 2013 Data : Done
# IT (Italy)
# 2013 Website : Done
# 2013 Data : Done
# Manual correction of 12 recipient entries (";" char in name)
# IE (Ireland)
# 2013 Website : Done
# 2013 Data : Done
# LV (Latvia)
# 2013 Website : Done
# 2013 Data : Done
# Adds national language name in scheme.txt as english name where entry is missing
mv LV/scheme.txt LV/scheme_bad.txt
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' LV/scheme_bad.txt > LV/scheme.txt
# LT (Lithuania)
# 2013 Website : Done
# 2013 Data : Done
# scheme.txt ---
# Deletes schemes with no IDs from scheme.txt
dos2unix LT/*.txt
mv LT/scheme.txt LT/scheme_bad.txt
grep "^\"" LT/scheme_bad.txt > LT/scheme.txt
# payment.txt ---
# Remove payments with no globalSchemeId
mv LT/payment.txt LT/payment_bad.txt
awk < LT/payment_bad.txt -F \; '($5 != ""){ print $0 }' > LT/payment.txt
# LU (Luxembourg)
# 2013 Website : Done
# 2013 Data : Done
# MA (Malta)
# 2013 Website : Data missing (wrong data files)
# 2013 Data : -
# Files containing data from Ireland
# NL (Netherlands)
# 2013 Website : Done
# 2013 Data : Done
# Copy national language name to english name for empty fields in scheme.txt
mv NL/scheme.txt NL/scheme_bad.txt
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' NL/scheme_bad.txt > NL/scheme.txt
# Super tricky mix of newlines and carriage returns in recipient.txt
# Additionally deleted ; from line 819434 manually
cp NL/recipient.txt NL/recipient_bad.txt
# DON'T COPY PASTE BUT CREATE ^M CHARACTER WITH CTRL-V AND THEN CTRL-M!!!
sed 's/^M/NEWLINEPLACEHOLDER/g' NL/recipient_bad.txt > NL/recipient.txt
dos2unix NL/recipient.txt
tr -d '\n' < NL/recipient.txt > NL/recipient_tmp.txt
sed 's/NEWLINEPLACEHOLDER/\n/g' NL/recipient_tmp.txt > NL/recipient.txt
rm NL/recipient_tmp.txt
# PL (Poland)
# 2013 Website : Done
# 2013 Data : Done
# PT (Portugal)
# 2013 Website : Done
# 2013 Data : Done
# RO (Romania)
# 2013 Website : Done
# 2013 Data : Done
# Went through, but no data, no years are showing, debugging needed!
# recipient.txt ---
# Many HTML encoded strings, generic fix, some manual fixes afterwards
mv RO/recipient.txt RO/recipient_bad.txt
# Apply generic HTML encoding fix
# => recipient_bad2.txt
# Manually edited recipients "RO1340283", "RO1421819" deletion of " inside name
awk < RO/recipient_bad2.txt -F \; '{ field=$5;cnt=gsub(/"/,"\"",field) } (cnt>2){ str=""; for(i=1;i<=NF;i++){ if(i>1){ printf ";"; } if(i==5){ gsub(/"/, "", $i); printf "\""$i"\""; } else { printf "%s", $i; } }; printf "\n" } (cnt<=2){ print $0 }' > RO/recipient.txt
# SK (Slovakia)
# 2013 Website : Done
# 2013 Data : Done
# recipient.txt ---
# Manual normalization of one recipient entry (";" char in name)
# scheme.txt ---
# Copy national language name to english name for empty fields in scheme.txt
mv SK/scheme.txt SK/scheme_bad.txt
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' SK/scheme_bad.txt > SK/scheme.txt
# SI (Slovenia)
# 2013 Website : Done
# 2013 Data : Done
mv SI/scheme.txt SI/scheme_bad.txt
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' SI/scheme_bad.txt > SI/scheme.txt
mv SI/payment.txt SI/payment_bad.txt
# Somehow the country SL (Sierra Leone) appears in there instead of SI
# Fix it by replacing with SI
sed 's/^\([^;]*\);;/\1;"SI\1";/g' SI/payment_bad.txt | sed 's/"SL/"SI/g' > SI/payment.txt
mv SI/recipient.txt SI/recipient_bad.txt
sed 's/"SL/"SI/g' SI/recipient_bad.txt > SI/recipient.txt
# ES (Spain)
# 2013 Website : Done
# 2013 Data : Done
# scheme.txt ---
mv ES/scheme.txt ES/scheme_bad.txt
tr -d '\n' < ES/scheme_bad.txt | tr '\r' '\n' | sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' > ES/scheme.txt
# recipient.txt ---
# Concatenate separated lines
mv ES/recipient.txt ES/recipient_bad.txt
awk < ES/recipient_bad.txt -F \; 'BEGIN { p=0;cl="";} (NF<21 && p+NF<21){ p=p+NF;cl=cl$0; } (NF<21 && p+NF>=21){ print cl$0;p=0;cl="";} (NF>=21) { print $0;p=0;cl=""; }' > ES/recipient_bad2.txt
# Concatenate name entries with one or several unescaped ;
awk < ES/recipient_bad2.txt -F \; '( NF==21 ){ print $0 } ( NF==22){ print $1";"$2";"$3";"$4";"$5 $6";"$7";"$8";"$9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==23){ print $1";"$2";"$3";"$4";"$5 $6 $7";"$8";"$9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==24){ print $1";"$2";"$3";"$4";"$5 $6 $7 $8";"$9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==25){ print $1";"$2";"$3";"$4";"$5 $6 $7 $8 $9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==26){ print $1";"$2";"$3";"$4";"$5 $6 $7 $8 $9 $10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 }' > ES/recipient.txt
# Test for wrong line counts at the end
awk < ES/recipient.txt -F \; '( NF<21 ){ print $0 }'
# SE (Sweden)
# 2013 Website : Done
# 2013 Data : Done
# recipient.txt ---
cat SE/recipient.txt | dos2unix > SE/recipient_bad.txt
# Remove empty string on several fields and entries of recipient file
sed 's/;""/;/g' SE/recipient_bad.txt > SE/recipient.txt
#OLD Remove "" in latitude field of several recipients
#OLD sed 's/;;"";$/;;;/g' SE/recipient_bad.txt > SE/recipient_bad2.txt
#OLD sed 's/;"";""$/;;/g' SE/recipient_bad2.txt > SE/recipient.txt
# scheme.txt ---
mv SE/scheme.txt SE/scheme_bad.txt
# Use scheme ID as english name if no name is provided
sed 's/^\([^;]*\);;;/\1;;\1;/g' SE/scheme_bad.txt > SE/scheme.txt
# GB (United Kingdom)
# 2013 Website : Done
# 2013 Data : Done
# Pre-Processing to get validatable files
cp GB/payments1.txt GB/payment.txt
tail -n +2 GB/payments2.txt >> GB/payment.txt
tail -n +2 GB/payment3.txt >> GB/payment.txt
mv GB/schemes.txt GB/scheme.txt
# payment.txt ---
mv GB/payment.txt GB/payment_bad.txt
dos2unix GB/payment_bad.txt
# Fix doubled country code in payment file
sed 's/;"GBGB/;"GB/g' GB/payment_bad.txt > GB/payment.txt
# recipient.txt ---
# Manually corrected two lines 475111 475112 and 493001 493002 split into two (wrong number of columns)
# scheme.txt ---
mv GB/scheme.txt GB/scheme_bad.txt
# Use scheme ID as english name if no name is provided
sed 's/^\([^;]*\);;;/\1;;\1;/g' GB/scheme_bad.txt | dos2unix > GB/scheme.txt
printf '"GB200450302032";;"Unknown";;"GB"\n' >> GB/scheme.txt
printf '"GB200450302052";;"Unknown";;"GB"\n' >> GB/scheme.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment