Created
February 24, 2021 20:55
-
-
Save hadisfr/5078731c041585df55ea5d41ca86753c to your computer and use it in GitHub Desktop.
Bank Refah Iranian DB Plotter and Aggregator - https://refahdb.mcls.gov.ir/fa/sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import csv | |
from collections import defaultdict | |
from sys import stderr | |
import numpy as np | |
from matplotlib import pyplot as plt | |
from tqdm import tqdm | |
# Per-year column prefixes; each is combined with the years 95, 96 and 97
# below. NOTE(review): the names look like transliterated Persian banking
# terms -- confirm their exact meaning against the dataset documentation.
_YEARLY_PREFIXES = ("Bardasht", "Variz", "MandehAval", "MandehAkhar", "Sood")

# CSV columns whose integer values are summed over all rows that share the
# same AGGREGATE_KEY. The order here is also the column order of the
# aggregated output file.
FIELDS_TO_AGGREGATE = (
    ["%s%d" % (prefix, year) for year in (95, 96, 97) for prefix in _YEARLY_PREFIXES]
    + ["Card98%02d" % month for month in range(1, 7)]
    + ["Trip_AirNonPilgrimageCount_%d" % year for year in range(95, 99)]
    + ["Cars_Count", "CarPrice_Sum", "Daramad_Total_Rials"]
)

# Aggregated column whose distribution is drawn by plot().
FIELD_TO_PLOT = "MandehAval97"

# Step, in percentage points, between consecutive samples on the plot's x axis.
PLOT_PRECISION = 1

# Column that identifies the group rows are merged into.
AGGREGATE_KEY = "ParentId"
def read_db(path="500000FamilySample-990402.csv", expected_rows=1456232):
    """Read the per-person CSV and sum the selected columns per group.

    Rows that share the same value in the ``AGGREGATE_KEY`` column are merged
    into a single record holding the integer sum of every column listed in
    ``FIELDS_TO_AGGREGATE``.

    Args:
        path: CSV file to read. The default sample is available at
            http://mashghema.ir/500000FamilySample-990402.rar
        expected_rows: Number of data rows, used only to size the progress
            bar; a wrong value does not affect the result.

    Returns:
        A ``defaultdict`` mapping each ``AGGREGATE_KEY`` value to a
        ``defaultdict(int)`` that stores the key itself under
        ``AGGREGATE_KEY`` plus one summed entry per aggregated column.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a required column is missing from the CSV header.
        ValueError: If a cell of an aggregated column is not an integer.
    """
    print("reading", file=stderr)
    # Deduplicate (keeping order) so a column that is listed twice is not
    # added twice for the same row.
    fields = tuple(dict.fromkeys(FIELDS_TO_AGGREGATE))
    aggregated = defaultdict(lambda: defaultdict(int))
    # newline="" is what the csv module requires of its input file so that
    # line breaks inside quoted fields are parsed correctly.
    with open(path, newline="") as f:
        for person in tqdm(csv.DictReader(f), total=expected_rows):
            key = person[AGGREGATE_KEY]
            # Look the group record up once per row instead of once per cell.
            family = aggregated[key]
            family[AGGREGATE_KEY] = key
            for field in fields:
                family[field] += int(person[field])
    return aggregated
def _top_share_percentages(capital, percents):
    """Return the share of the total held by the top ``p`` percent of entries.

    For every ``p`` in ``percents`` the ``int(len(capital) / 100 * p)``
    largest values are summed and expressed as a percentage of
    ``sum(capital)``. All shares are 0.0 when the total is zero (which
    includes an empty ``capital``).
    """
    total = sum(capital)
    if total == 0:
        # Nothing to distribute; also avoids dividing by zero.
        return [0.0] * len(percents)

    # top_sums[k] is the sum of the k largest values. Sorting once and
    # keeping running sums replaces a full sort + sum per plotted point.
    top_sums = [0]
    for value in sorted(capital, reverse=True):
        top_sums.append(top_sums[-1] + value)

    size = len(capital)
    shares = []
    for percent in percents:
        # Clamp into [0, size]. Indexing by count (instead of slicing with
        # [-count:]) makes a count of 0 mean "no entries" rather than "all
        # entries", which is what [-0:] would select.
        count = min(max(int(size / 100 * percent), 0), size)
        shares.append(top_sums[count] / total * 100)
    return shares


def plot(capital):
    """Plot how much of the total the top x percent of entries hold.

    The x axis runs over ``np.arange(0, 100, PLOT_PRECISION)``; the y value at
    ``x`` is the percentage of ``sum(capital)`` held by the largest
    ``x`` percent of the values. The figure is shown in a blocking window via
    ``plt.show()``.

    Args:
        capital: Sequence of numbers, one per aggregated record.
    """
    print("plotting", file=stderr)
    percents = np.arange(0, 100, PLOT_PRECISION)
    plt.figure(figsize=(5, 5))
    plt.plot(percents, _top_share_percentages(capital, percents))
    plt.xlabel("families")
    plt.ylabel("capital (%s)" % AGGREGATE_KEY)
    plt.grid()
    plt.show()
def write_aggregated_db(aggregated, path="500000FamilySample-990402-aggregated.csv"):
    """Write the aggregated records to a CSV file.

    The header is ``AGGREGATE_KEY`` followed by ``FIELDS_TO_AGGREGATE``; one
    row is written per value of ``aggregated``.

    Args:
        aggregated: Mapping whose values are dict-like rows keyed by the
            header's column names.
        path: Destination file; it is overwritten if it already exists.

    Raises:
        OSError: If the file cannot be opened for writing.
        ValueError: If a row contains a key that is not a header column.
    """
    print("writing", file=stderr)
    # newline="" stops the text layer from translating the "\r\n" the csv
    # writer emits, which would otherwise become "\r\r\n" on Windows.
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[AGGREGATE_KEY] + FIELDS_TO_AGGREGATE)
        writer.writeheader()
        writer.writerows(tqdm(aggregated.values()))
def main():
    """Aggregate the database, report its size, export it and plot one column."""
    families = read_db()
    print("%d aggregated rows" % len(families))
    write_aggregated_db(families)
    # One value of the plotted column per aggregated record.
    plot([record[FIELD_TO_PLOT] for record in families.values()])
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment