Last active
March 16, 2024 12:45
-
-
Save srikumarks/3266c38490a0932aba4ba7bd841a7c30 to your computer and use it in GitHub Desktop.
Electoral bonds PDF to CSV extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from pypdf import PdfReader | |
import pandas as pd | |
# Input documents, opened once at module load. The hex string recorded for
# each file is presumably its SHA-1 digest, kept to pin the exact PDF that
# was processed — TODO confirm.
#   34ca61ebc3ddbb71a1d16f8f43db38d031872126  encashment-details.pdf
#   11e0aff1007065d9a4de49ac5499fbf4186dcda0  purchaser-details.pdf
encashment, purchaser = (
    PdfReader(pdfName)
    for pdfName in ("encashment-details.pdf", "purchaser-details.pdf")
)
def extractLines(file):
    """Collect the text lines of every page of a PDF, in page order.

    Args:
        file: A reader object exposing ``pages``; each page must provide an
            ``extract_text()`` method returning that page's text as a string
            (the script passes ``pypdf.PdfReader`` instances).

    Returns:
        A flat list with the non-blank lines of all pages, obtained by
        splitting each page's text on newline characters.
    """
    data = []
    for page in file.pages:
        text = page.extract_text()
        # A page without extractable text gives "", and "".split("\n") is
        # [""]; drop blank lines so they never reach the row parser, which
        # rejects anything that is not a full data row.
        data.extend(line for line in text.split("\n") if line.strip())
    return data
# One data row: a digits/letters/digits date token, a free-text entity name,
# and a trailing amount made of digits and commas, separated by single spaces.
# Compiled once here because fields() is called for every extracted line.
_ROW_PATTERN = re.compile(r"^([0-9]+/[A-Za-z]+/[0-9]+) (.+) ([0-9,]+)$")


def fields(line):
    """Split one extracted text line into its three columns.

    Args:
        line: A single line of text in the form
            ``<digits>/<letters>/<digits> <entity> <amount>``.

    Returns:
        A ``(date, entity, amount)`` tuple. ``date`` and ``entity`` are
        strings taken verbatim from the line; ``amount`` is an ``int`` with
        the comma separators removed.

    Raises:
        AssertionError: If the line does not match the expected layout. The
            message is ``'line = '`` followed by the offending line.
    """
    parts = _ROW_PATTERN.match(line)
    if parts is None:
        # Raised explicitly instead of via ``assert`` so the check still runs
        # under ``python -O``; the exception type is kept unchanged.
        raise AssertionError('line = ' + line)
    # The entity group is greedy, so only the last space-separated run of
    # digits/commas is taken as the amount.
    date, entity, amount = parts.groups()
    return (date, entity, int(amount.replace(',', '')))
def extractEncashment(encashment):
    """Parse the encashment PDF into a list of row tuples.

    Args:
        encashment: The reader object for the encashment PDF, passed through
            to ``extractLines``.

    Returns:
        A list with one ``fields(line)`` result per extracted line, after
        dropping the two-line column header when the text starts with it.
    """
    data = extractLines(encashment)
    # The column header comes out of text extraction wrapped over two lines.
    header = ['Date of ', 'Encashment Name of the Political Party Denomination']
    # Compare a slice rather than data[0] / data[1] so a document yielding
    # fewer than two lines is handled instead of raising IndexError.
    if data[:2] == header:
        data = data[2:]
    return [fields(line) for line in data]
def extractPurchaser(purchaser):
    """Parse the purchaser PDF into a list of row tuples.

    Args:
        purchaser: The reader object for the purchaser PDF, passed through
            to ``extractLines``.

    Returns:
        A list with one ``fields(line)`` result per extracted line, after
        dropping the column header line when the text starts with it.
    """
    data = extractLines(purchaser)
    # Compare a slice rather than data[0] so a document yielding no lines
    # is handled instead of raising IndexError.
    if data[:1] == ['Date of Purchase Purchaser Name Denomination']:
        data = data[1:]
    return [fields(line) for line in data]
def todf(arr, headers):
    """Build a DataFrame from a list of row tuples.

    Args:
        arr: A sequence of rows; each row is indexable and has at least
            ``len(headers)`` items.
        headers: Column names; ``headers[c]`` labels item ``c`` of each row.

    Returns:
        A ``pandas.DataFrame`` with one column per header, holding the
        matching item of every row in row order.
    """
    # Transpose row-wise data into the column-wise mapping DataFrame accepts.
    columns = {
        header: [row[position] for row in arr]
        for position, header in enumerate(headers)
    }
    return pd.DataFrame(columns)
# Convert each PDF into a CSV file with one row per parsed line.
# index=False keeps the DataFrame's row index out of the written file.
encashmentHeaders = ['Date of Encashment', 'Name of the Political Party', 'Denomination']
encashmentRows = extractEncashment(encashment)
todf(encashmentRows, encashmentHeaders).to_csv("encashment.csv", index=False)

purchaserHeaders = ['Date of Purchase', 'Purchaser Name', 'Denomination']
purchaserRows = extractPurchaser(purchaser)
todf(purchaserRows, purchaserHeaders).to_csv("purchaser.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment