Skip to content

Instantly share code, notes, and snippets.

@srikumarks
Last active March 16, 2024 12:45
Show Gist options
  • Save srikumarks/3266c38490a0932aba4ba7bd841a7c30 to your computer and use it in GitHub Desktop.
Save srikumarks/3266c38490a0932aba4ba7bd841a7c30 to your computer and use it in GitHub Desktop.
Electoral bonds PDF to CSV extraction
import re
from pypdf import PdfReader
import pandas as pd
# Open the two input PDFs. The paths are relative, so both files must be in
# the current working directory when the script runs.
# NOTE(review): the 40-hex-digit value in front of each file name looks like a
# SHA-1 checksum of the PDF this script was written against — TODO confirm.
# 34ca61ebc3ddbb71a1d16f8f43db38d031872126 encashment-details.pdf
encashment = PdfReader("encashment-details.pdf")
# 11e0aff1007065d9a4de49ac5499fbf4186dcda0 purchaser-details.pdf
purchaser = PdfReader("purchaser-details.pdf")
def extractLines(file):
    """Collect the text lines of every page of a PDF, in page order.

    Args:
        file: Object exposing a ``pages`` iterable whose items provide an
            ``extract_text()`` method (the script passes a ``PdfReader``).

    Returns:
        list[str]: The non-blank lines of all pages, concatenated in page
        order. Lines are returned as extracted (not stripped), because the
        callers compare header lines verbatim, including trailing spaces.
    """
    data = []
    for page in file.pages:
        # Guard against a page that yields no text at all.
        text = page.extract_text() or ""
        # Skip blank lines: they carry no record and would otherwise make the
        # downstream record parser fail on an empty string.
        data.extend(line for line in text.split("\n") if line.strip())
    return data
# Record layout: "<day>/<month name>/<year> <entity name> <amount with commas>".
# Compiled once so it is not looked up again for every line.
_RECORD_RE = re.compile(r"^([0-9]+/[A-Za-z]+/[0-9]+) (.+) ([0-9,]+)$")


def fields(line):
    """Split one extracted record line into its three fields.

    Args:
        line: A text line of the form ``"<date> <entity> <amount>"``, where
            the date looks like ``12/Apr/2019`` and the amount is digits with
            optional comma separators. Surrounding whitespace is ignored.

    Returns:
        tuple[str, str, int]: ``(date, entity, amount)`` with the commas
        removed from the amount before conversion to ``int``.

    Raises:
        AssertionError: If the line does not match the record layout. Raised
            explicitly (rather than via ``assert``) so the check still runs
            under ``python -O``; the exception type and message are the same
            as before.
    """
    parts = _RECORD_RE.match(line.strip())
    if parts is None:
        raise AssertionError('line = ' + line)
    date, entity, amount = parts.groups()
    return (date, entity, int(amount.replace(',', '')))
def extractEncashment(encashment):
    """Parse the encashment PDF into a list of record tuples.

    Args:
        encashment: The PDF object handed to ``extractLines``.

    Returns:
        list[tuple]: One ``fields(line)`` result per extracted line, after
        dropping the two-line table header if it opens the document.
    """
    # The column header is extracted as two separate lines; the first one
    # ends with a space.
    header = ['Date of ', 'Encashment Name of the Political Party Denomination']
    data = extractLines(encashment)
    # Compare a slice instead of indexing data[0]/data[1] so that a document
    # yielding fewer than two lines does not raise IndexError.
    if data[:2] == header:
        data = data[2:]
    return [fields(line) for line in data]
def extractPurchaser(purchaser):
    """Parse the purchaser PDF into a list of record tuples.

    Args:
        purchaser: The PDF object handed to ``extractLines``.

    Returns:
        list[tuple]: One ``fields(line)`` result per extracted line, after
        dropping the one-line table header if it opens the document.
    """
    header = ['Date of Purchase Purchaser Name Denomination']
    data = extractLines(purchaser)
    # Compare a slice instead of indexing data[0] so that a document yielding
    # no lines does not raise IndexError.
    if data[:1] == header:
        data = data[1:]
    return [fields(line) for line in data]
def todf(arr, headers):
    """Build a DataFrame from row tuples and a list of column names.

    Args:
        arr: Sequence of rows; each row is indexable and has at least
            ``len(headers)`` items.
        headers: Column names; ``headers[c]`` labels item ``c`` of each row.

    Returns:
        pandas.DataFrame: One column per header, in header order, holding
        that position's value from every row. An empty ``arr`` gives a frame
        with the named columns and no rows.
    """
    # Transpose row-wise records into one list per column.
    columns = {
        name: [row[pos] for row in arr]
        for pos, name in enumerate(headers)
    }
    return pd.DataFrame(columns)
# Export the encashment records to encashment.csv (no index column).
encashmentHeaders = ['Date of Encashment', 'Name of the Political Party', 'Denomination']
encashmentFrame = todf(extractEncashment(encashment), encashmentHeaders)
encashmentFrame.to_csv("encashment.csv", index=False)

# Export the purchaser records to purchaser.csv (no index column).
purchaserHeaders = ['Date of Purchase', 'Purchaser Name', 'Denomination']
purchaserFrame = todf(extractPurchaser(purchaser), purchaserHeaders)
purchaserFrame.to_csv("purchaser.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment