Created
March 16, 2014 09:48
-
-
Save rmehta/9580863 to your computer and use it in GitHub Desktop.
Extract Voter information from Maharashtra Electoral Rolls (PDF)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PDF to CSV converter for Maharashtra electoral rolls
# Usage:
# 1. Place all PDF folders in "source"
# 2. Set "target" to the folder where you want the output files exported
# Note: the inner directory structure will be maintained.
from pdfminer.pdfinterp import PDFResourceManager, process_pdf | |
from pdfminer.converter import HTMLConverter | |
from pdfminer.layout import LAParams | |
from cStringIO import StringIO | |
import os, re, csv | |
def convert_pdf(path):
    """Convert the PDF at ``path`` to HTML and return the HTML text.

    The file is run through pdfminer's ``process_pdf`` with an
    ``HTMLConverter`` (UTF-8 codec, default ``LAParams``) that writes into an
    in-memory buffer.

    Args:
        path: Filesystem path of the PDF to convert.

    Returns:
        The full contents of the buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            # ``open`` replaces the Python-2-only ``file`` builtin, and the
            # ``with`` block closes the handle even if parsing raises.
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # As in the original flow, the buffer is read only after the device
        # has been closed. The result no longer shadows the ``str`` builtin.
        return retstr.getvalue()
    finally:
        retstr.close()
def get_strings(html):
    """Return the text runs of ``html`` that lie between a ``>`` and a ``<``.

    Each run is at least one character long, contains no ``<``, and is
    returned exactly as found (surrounding whitespace included).
    """
    text_run = re.compile('(?<=>)([^<]+?)(?=<)')
    return [found.group(1) for found in text_run.finditer(html)]
def get_voters(strings):
    """Assemble voter records from an ordered sequence of text fragments.

    The fragments are scanned once with a small state machine. ``prev`` holds
    the kind assigned to the previous non-empty fragment (``"new"``,
    ``"age"``, ``"sex"``, ``"house"``, ``"sr"``, ``"id"`` or ``None``) and
    decides how the current fragment is interpreted.

    Args:
        strings: Iterable of text fragments in document order.

    Returns:
        A list of dicts. A record may carry the keys ``addr``,
        ``father_or_husband``, ``name``, ``age``, ``sex``, ``house_no``,
        ``sr`` and ``id``. Only records that have an ``age`` are returned.
    """
    voters = []
    v = prev = stype = addr = None
    nextsr = 1  # serial number the next record is expected to carry
    for s in strings:
        s = s.strip()
        # Normalised copy (spaces removed, lower-cased) for keyword matching.
        ns = s.replace(" ", "").lower()
        if not ns:
            continue

        # A long fragment mentioning "pincode" is remembered as the address
        # and attached to every record started after it.
        if "pincode" in ns and len(ns) > 20:
            addr = s

        prev = stype
        stype = None

        # Fragments starting with "mt/" or "nct" are stored as the id.
        # NOTE(review): nesting was reconstructed from flattened source; this
        # assumes the state change happens only when a record is open.
        if ns.startswith(("mt/", "nct")):
            if v:
                v["id"] = s
                stype = "id"
            continue

        # A photo marker or a "Part No" fragment closes the current record
        # (kept only if it has an age) and opens a new one.
        if ("photoavailable" in ns) or ("photonotavailable" in ns) \
                or ns.startswith("partno"):
            if v and "age" in v:
                voters.append(v)
            v = {"addr": addr}
            stype = "new"

        if prev == "new":
            v["father_or_husband"] = s
            # A lower-case letter directly followed by an upper-case one is
            # taken as two names run together; split between the two letters.
            joined = re.search("[a-z][A-Z]", s)
            if joined:
                i = joined.start()
                v["father_or_husband"] = s[:i+1]
                v["name"] = s[i+1:]

        if s.startswith("Age :"):
            if not v:
                v = {}
            s = s.replace("Sex", "")
            v["age"] = s.split(":")[1].strip()
            stype = "age"

        if prev == "age":
            if "Male" in s:
                v["sex"] = "Male"
            if "Female" in s:
                v["sex"] = "Female"
            stype = "sex"

        if prev == "sex":
            # Text before the first colon is the name, text after the last
            # colon is the house number; empty parts are ignored.
            p = s.split(":")
            if p[0].strip():
                v["name"] = p[0].strip()
            if p[-1].strip():
                v["house_no"] = p[-1].strip()
            stype = "house"

        if prev == "house":
            if s == str(nextsr):
                v["sr"] = s
                nextsr = int(v["sr"]) + 1
                stype = "sr"
            else:
                # NOTE(review): this tests the raw fragment, not the
                # normalised ``ns``; kept as in the original - confirm intent.
                # Nesting of the state change below was also reconstructed
                # from flattened source.
                if "houseno" not in s:
                    v["house_no"] = s
                stype = "house"

        if prev == "id":
            if not ns.startswith("elec"):
                v["house_no"] = s

    # Flush the record still being assembled when the input ends; previously
    # it was dropped unless another marker fragment followed it.
    if v and "age" in v:
        voters.append(v)
    return voters
def write_csv(voters, filename):
    """Write ``voters`` to ``filename`` as CSV, preceded by a header row.

    Columns are written in a fixed order; the header is each column key with
    underscores turned into spaces and title-cased. A key missing from a
    record is written as an empty field.

    Args:
        voters: Iterable of dicts, one per output row.
        filename: Path of the CSV file to create or overwrite.
    """
    columns = ("sr", "id", "name", "age", "sex", "father_or_husband", "house_no", "addr")
    header = [column.replace("_", " ").title() for column in columns]
    with open(filename, "w") as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(
            [[voter.get(column, "") for column in columns] for voter in voters]
        )
# Root folder scanned for PDFs, and root folder that receives the CSVs. The
# sub-folder layout found under ``source`` is recreated under ``target``.
source = "source"
target = "out"

for basepath, folders, files in os.walk(source):
    for f in files:
        if not f.endswith(".pdf"):
            continue
        outpath = os.path.join(target, os.path.relpath(basepath, source))
        # splitext removes only the final extension, so "a.b.pdf" becomes
        # "a.b.csv"; the old rsplit(".")[0] cut the name at the first dot.
        outfile = os.path.join(outpath, os.path.splitext(f)[0] + ".csv")
        if not os.path.exists(outpath):
            os.makedirs(outpath)
        html = convert_pdf(os.path.join(basepath, f))
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print(outfile)
        write_csv(get_voters(get_strings(html)), outfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://gist.github.com/rmehta/9580863#file-convert-py-L7
Is process_pdf a function that was added to the pdfminer module? If yes, can you share it, please? Also, does this script handle converting Marathi electoral rolls to txt/csv, etc.?
Thanks!