-
-
Save holgerd77/95813c743f71cd7fafa7 to your computer and use it in GitHub Desktop.
Fixes for Farmsubsidy data files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fixes for Farmsubsidy data | |
# Order of countries is taken form the website: | |
# http://farmsubsidy.openspending.org/countries/ | |
set -x verbose | |
#wget -O AT.zip "http://data.farmsubsidy.org/web/at20130501.zip" | |
#unzip -d AT AT.zip | |
# Generic fixes/helpers | |
# HTML encoding fix | |
sed 's/\ö/ö/g; s/Ö/Ö/g; s/ü/ü/g; s/Ü/Ü/g; s/ä/ä/g; s/Ä/Ä/g; s/ß/ß/g; s/"/"/g; s/</</g; s/>/>/g; s/&/\&/g; s/&#[0-9]*;//g' [COUNTRY_CODE]/recipient_bad.txt > [COUNTRY_CODE]/recipient.txt | |
# Finding DOS formatted files | |
find DK -type f -print0 | xargs -0 grep -l `printf '\r\n'` | |
# AT (Austria) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# HTML coded german umlaute (like "ö") leading to wrong column numbers in payment.txt | |
mv AT/recipient.txt AT/recipient_bad.txt | |
# Apply generic HTML encoding fix | |
# BE (Belgium) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Some double entries in recipient.txt file | |
mv BE/recipient.txt BE/recipient_bad.txt | |
cat BE/recipient_bad.txt | uniq > BE/recipient.txt | |
# BG (Bulgaria) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
#CY (Cyprus) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Tried, went through without errors but no data shown! | |
# Recipients in DB but no Payments or Schemes | |
# Add english name in scheme.txt as national language name where entry is missing | |
# Add some missing globalPaymentIds in payment.txt | |
mv CY/scheme.txt CY/scheme_bad.txt | |
sed 's/^\([^;]*\);;\([^;]*\);/\1;\2;\2;/g' CY/scheme_bad.txt > CY/scheme.txt | |
mv CY/payment.txt CY/payment_bad.txt | |
sed 's/^\([^;]*\);;/\1;"CY\1";/g' CY/payment_bad.txt > CY/payment.txt | |
#CZ (Czech Republic) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Remove 2014 entries misleading whole DB | |
mv CZ/payment.txt CZ/payment_bad.txt | |
awk < CZ/payment_bad.txt -F\; ' ($8 != "\"2014\""){ print $0 }' > CZ/payment.txt | |
# DK (Denmark) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# (ignored: negative payments, to be further investigated!) | |
# payment.txt --- | |
# payment.txt file coming in two parts, concatenating together | |
dos2unix DK/payment1.txt DK/payment.txt | |
tail -n +2 DK/payment1.txt >> DK/payment.txt | |
# recipient.txt --- | |
# Some recipients represent empty lat, long values at the end of column as empty string, replacing | |
cat DK/recipient.txt | dos2unix > DK/recipient_bad.txt | |
sed 's/;"";""$/;;/g' DK/recipient_bad.txt > DK/recipient.txt | |
# scheme.txt --- | |
cat DK/scheme.txt | dos2unix > DK/scheme_bad.txt | |
# Removing integer id from beginning of lines | |
sed 's/^[^;]*;\(.*\)/\1/g' DK/scheme_bad.txt > DK/scheme_bad2.txt | |
# Adding missing "DK" at end of lines | |
sed 's/\;$/\;"DK"/g' DK/scheme_bad2.txt > DK/scheme.txt | |
# EE (Estonia) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# No modification needed. | |
# FI (Finland) | |
# 2013 Website : Data missing (only raw.txt recipient file) | |
# 2013 Data : - | |
# Coming with a raw.txt recipient file, not clear yet how to procede... | |
# FR (France) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# DE (Germany) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# GR (Greece) | |
# 2013 Website : Data missing | |
# 2013 Data : - | |
# HU (Hungary) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# IT (Italy) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Manual correction of 12 recipient entries (";" char in name) | |
# IE (Ireland) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# LV (Latvia) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Adds national language name in scheme.txt as english name where entry is missing | |
mv LV/scheme.txt LV/scheme_bad.txt | |
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' LV/scheme_bad.txt > LV/scheme.txt | |
# LT (Lithuania) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# scheme.txt --- | |
# Deletes schemes with no IDs from scheme.txt | |
dos2unix LT/*.txt | |
mv LT/scheme.txt LT/scheme_bad.txt | |
grep "^\"" LT/scheme_bad.txt > LT/scheme.txt | |
# payment.txt --- | |
# Remove payments with no globalSchemeId | |
mv LT/payment.txt LT/payment_bad.txt | |
awk < LT/payment_bad.txt -F \; '($5 != ""){ print $0 }' > LT/payment.txt | |
# LU (Luxembourg) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# MA (Malta) | |
# 2013 Website : Data missing (wrong data files) | |
# 2013 Data : - | |
# Files containing data from Ireland | |
# NL (Netherlands) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Copy national language name to english name for empty fields in scheme.txt | |
mv NL/scheme.txt NL/scheme_bad.txt | |
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' NL/scheme_bad.txt > NL/scheme.txt | |
# Super tricky mix of newlines and carriage returns in recipient.txt | |
# Additionally deleted ; from line 819434 manually | |
cp NL/recipient.txt NL/recipient_bad.txt | |
# DON'T COPY PASTE BUT CREATE ^M CHARACTER WITH CTRL-V AND THEN CTRL-M!!! | |
sed 's/^M/NEWLINEPLACEHOLDER/g' NL/recipient_bad.txt > NL/recipient.txt | |
dos2unix NL/recipient.txt | |
tr -d '\n' < NL/recipient.txt > NL/recipient_tmp.txt | |
sed 's/NEWLINEPLACEHOLDER/\n/g' NL/recipient_tmp.txt > NL/recipient.txt | |
rm NL/recipient_tmp.txt | |
# PL (Poland) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# PT (Portugal) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# RO (Romania) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Went through, but no data, no years are showing, debugging needed! | |
# recipient.txt --- | |
# Many HTML encoded strings, generic fix, some manual fixes afterwards | |
mv RO/recipient.txt RO/recipient_bad.txt | |
# Apply generic HTML encoding fix | |
# => recipient_bad2.txt | |
# Manually edited recipients "RO1340283", "RO1421819" deletion of " inside name | |
awk < RO/recipient_bad2.txt -F \; '{ field=$5;cnt=gsub(/"/,"\"",field) } (cnt>2){ str=""; for(i=1;i<=NF;i++){ if(i>1){ printf ";"; } if(i==5){ gsub(/"/, "", $i); printf "\""$i"\""; } else { printf "%s", $i; } }; printf "\n" } (cnt<=2){ print $0 }' > RO/recipient.txt | |
# SK (Slovakia) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# recipient.txt --- | |
# Manual normalization of one recipient entry (";" char in name) | |
# scheme.txt --- | |
# Copy national language name to english name for empty fields in scheme.txt | |
mv SK/scheme.txt SK/scheme_bad.txt | |
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' SK/scheme_bad.txt > SK/scheme.txt | |
# SI (Slovenia) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
mv SI/scheme.txt SI/scheme_bad.txt | |
sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' SI/scheme_bad.txt > SI/scheme.txt | |
mv SI/payment.txt SI/payment_bad.txt | |
# Somehow the country SL (Sierra Leone) appears in there instead of SI | |
# Fix it by replacing with SI | |
sed 's/^\([^;]*\);;/\1;"SI\1";/g' SI/payment_bad.txt | sed 's/"SL/"SI/g' > SI/payment.txt | |
mv SI/recipient.txt SI/recipient_bad.txt | |
sed 's/"SL/"SI/g' SI/recipient_bad.txt > SI/recipient.txt | |
# ES (Spain) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# scheme.txt --- | |
mv ES/scheme.txt ES/scheme_bad.txt | |
tr -d '\n' < ES/scheme_bad.txt | tr '\r' '\n' | sed 's/^\([^;]*\);\([^;]*\);;/\1;\2;\2;/g' > ES/scheme.txt | |
# recipient.txt --- | |
# Concatenate separated lines | |
mv ES/recipient.txt ES/recipient_bad.txt | |
awk < ES/recipient_bad.txt -F \; 'BEGIN { p=0;cl="";} (NF<21 && p+NF<21){ p=p+NF;cl=cl$0; } (NF<21 && p+NF>=21){ print cl$0;p=0;cl="";} (NF>=21) { print $0;p=0;cl=""; }' > ES/recipient_bad2.txt | |
# Concatenate name entries with one or several unescaped ; | |
awk < ES/recipient_bad2.txt -F \; '( NF==21 ){ print $0 } ( NF==22){ print $1";"$2";"$3";"$4";"$5 $6";"$7";"$8";"$9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==23){ print $1";"$2";"$3";"$4";"$5 $6 $7";"$8";"$9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==24){ print $1";"$2";"$3";"$4";"$5 $6 $7 $8";"$9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==25){ print $1";"$2";"$3";"$4";"$5 $6 $7 $8 $9";"$10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 } ( NF==26){ print $1";"$2";"$3";"$4";"$5 $6 $7 $8 $9 $10";"$11";"$12";"$13";"$14";"$15";"$16";"$17";"$18";"$19";"$20";"$21";"$22 }' > ES/recipient.txt | |
# Test for wrong line counts at the end | |
awk < ES/recipient.txt -F \; '( NF<21 ){ print $0 }' | |
# SE (Sweden) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# recipient.txt --- | |
cat SE/recipient.txt | dos2unix > SE/recipient_bad.txt | |
# Remove empty string on several fields and entries of recipient file | |
sed 's/;""/;/g' SE/recipient_bad.txt > SE/recipient.txt | |
#OLD Remove "" in latitude field of several recipients | |
#OLD sed 's/;;"";$/;;;/g' SE/recipient_bad.txt > SE/recipient_bad2.txt | |
#OLD sed 's/;"";""$/;;/g' SE/recipient_bad2.txt > SE/recipient.txt | |
# scheme.txt --- | |
mv SE/scheme.txt SE/scheme_bad.txt | |
# Use scheme ID as english name if no name is provided | |
sed 's/^\([^;]*\);;;/\1;;\1;/g' SE/scheme_bad.txt > SE/scheme.txt | |
# GB (United Kingdom) | |
# 2013 Website : Done | |
# 2013 Data : Done | |
# Pre-Processing to get validatable files | |
cp GB/payments1.txt GB/payment.txt | |
tail -n +2 GB/payments2.txt >> GB/payment.txt | |
tail -n +2 GB/payment3.txt >> GB/payment.txt | |
mv GB/schemes.txt GB/scheme.txt | |
# payment.txt --- | |
mv GB/payment.txt GB/payment_bad.txt | |
dos2unix GB/payment_bad.txt | |
# Fix doubled country code in payment file | |
sed 's/;"GBGB/;"GB/g' GB/payment_bad.txt > GB/payment.txt | |
# recipient.txt --- | |
# Manually corrected two lines 475111 475112 and 493001 493002 split into two (wrong number of columns) | |
# scheme.txt --- | |
mv GB/scheme.txt GB/scheme_bad.txt | |
# Use scheme ID as english name if no name is provided | |
sed 's/^\([^;]*\);;;/\1;;\1;/g' GB/scheme_bad.txt | dos2unix > GB/scheme.txt | |
printf '"GB200450302032";;"Unknown";;"GB"\n' >> GB/scheme.txt | |
printf '"GB200450302052";;"Unknown";;"GB"\n' >> GB/scheme.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment