rmehta · March 16, 2014 09:48 · suyashdb · Sep 19, 2018
diff --git a/convert.py b/convert.py
 # PDF to CSV Converter for Mah Electoral rolls
 # Usage:
 # 1. Place all pdf folders in "source"
 # 2. Set "target" to be the folder where you want output files exported
 # Note: inner directory structure will be maintained.

 from pdfminer.pdfinterp import PDFResourceManager, process_pdf
 from pdfminer.converter import HTMLConverter
 from pdfminer.layout import LAParams
 from cStringIO import StringIO
 import os, re, csv

 def convert_pdf(path):
     """Render the PDF at `path` to HTML with pdfminer and return the markup.

     The file is opened in binary mode and fed through an ``HTMLConverter``
     built with default ``LAParams`` and a UTF-8 codec. The converter writes
     into an in-memory ``StringIO`` buffer, whose contents are returned.

     The PDF handle, the converter and the buffer are all closed even when
     ``process_pdf`` raises; the exception itself is propagated unchanged.
     """
     rsrcmgr = PDFResourceManager()
     retstr = StringIO()
     try:
         device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
         try:
             # open() rather than the Python-2-only file() builtin.
             with open(path, 'rb') as fp:
                 process_pdf(rsrcmgr, device, fp)
         finally:
             device.close()
         # Named `html` rather than `str` so the builtin is not shadowed.
         html = retstr.getvalue()
     finally:
         retstr.close()
     return html
 	
 def get_strings(html):
     """Return, in order, each run of text lying between a '>' and the next '<'.

     Runs are returned exactly as they appear in `html` (not stripped); a
     '>' immediately followed by '<' contributes nothing.
     """
     text_run = re.compile('(?<=>)([^<]+?)(?=<)')
     return [match.group(1) for match in text_run.finditer(html)]
 	
 def get_voters(strings):
     """Assemble voter records from the ordered text fragments of a roll.

     Each fragment is stripped and also normalised (spaces removed,
     lower-cased) for matching. Blank fragments are ignored. `stype` holds
     the kind assigned to the current fragment and `prev` the kind assigned
     to the previous non-blank one; most fields are recognised purely by
     what preceded them:

     * a fragment containing "pincode" whose normalised form is longer than
       20 characters becomes the address given to voters started after it;
     * a fragment starting with "mt/" or "nct" is stored as the current
       voter's "id" (if a voter is open) and nothing else is checked;
     * a fragment containing "photoavailable"/"photonotavailable", or
       starting with "partno", closes the current voter (kept only if it has
       an "age") and opens a new one;
     * the fragment after such a marker is "father_or_husband"; if it holds a
       lowercase letter directly followed by an uppercase one it is split
       there into "father_or_husband" and "name";
     * "Age :..." gives "age", the next fragment "sex", the next one
       "name"/"house_no" (split on ":"), and the one after that either the
       expected serial number "sr" or a replacement "house_no";
     * the fragment after an id, unless it starts with "elec", is "house_no".

     Returns a list of dicts; a voter only carries the keys that were found.
     The voter still open when the input ends is kept as well (if it has an
     "age"), so the last record is not lost when no marker follows it.
     """
     voters = []
     v = prev = stype = addr = None
     nextsr = 1
     for s in strings:
         s = s.strip()
         ns = s.replace(" ", "").lower()
         if not ns:
             continue

         if "pincode" in ns and len(ns) > 20:
             addr = s

         prev = stype
         stype = None

         if ns.startswith(("mt/", "nct")):
             # Only consumed as an id when a voter is open; otherwise the
             # fragment falls through to the checks below.
             if v:
                 v["id"] = s
                 stype = "id"
                 continue

         if ("photoavailable" in ns) or ("photonotavailable" in ns) \
                 or ns.startswith("partno"):
             # Marker fragment: flush the finished voter and start a new one.
             if v and "age" in v:
                 voters.append(v)
             v = {"addr": addr}
             stype = "new"

         if prev == "new":
             v["father_or_husband"] = s
             # Two names run together ("...KumarShyam...") are split at the
             # first lowercase-to-uppercase boundary.
             joined = re.findall("[a-z][A-Z]", s)
             if joined:
                 i = s.index(joined[0])
                 v["father_or_husband"] = s[:i+1]
                 v["name"] = s[i+1:]

         if s.startswith("Age :"):
             if not v:
                 v = {}
             s = s.replace("Sex", "")
             v["age"] = s.split(":")[1].strip()
             stype = "age"

         if prev == "age":
             if "Male" in s:
                 v["sex"] = "Male"
             if "Female" in s:
                 v["sex"] = "Female"
             stype = "sex"

         if prev == "sex":
             p = s.split(":")
             if p[0].strip():
                 v["name"] = p[0].strip()
             if p[-1].strip():
                 v["house_no"] = p[-1].strip()
             stype = "house"

         if prev == "house":
             if s == str(nextsr):
                 v["sr"] = s
                 nextsr = int(v["sr"]) + 1
                 stype = "sr"
             else:
                 # NOTE(review): this tests the raw fragment `s`, not the
                 # normalised `ns`, so it only matches a literal lowercase
                 # "houseno" - confirm that is intended.
                 if "houseno" not in s:
                     v["house_no"] = s
                     stype = "house"

         if prev == "id":
             if not ns.startswith("elec"):
                 v["house_no"] = s

     # Flush the voter still open at end of input. A voter flushed by a
     # marker has already been replaced by a fresh dict without "age", so
     # this never appends the same record twice.
     if v and "age" in v:
         voters.append(v)

     return voters
 			
 def write_csv(voters, filename):
     """Write `voters` (a sequence of dicts) to `filename` as CSV.

     Columns are emitted in a fixed order. The header row is the column key
     with underscores turned into spaces and title-cased, and a key missing
     from a voter is written as an empty cell.
     """
     columns = ["sr", "id", "name", "age", "sex", "father_or_husband", "house_no", "addr"]
     header = [column.replace("_", " ").title() for column in columns]
     with open(filename, "w") as out:
         rows = csv.writer(out)
         rows.writerow(header)
         rows.writerows([voter.get(column, "") for column in columns] for voter in voters)
 			

 source = "source"
 target = "out"

 # Mirror the directory layout found under `source` inside `target`, writing
 # one CSV next to the relative position of every PDF.
 for basepath, folders, files in os.walk(source):
     for f in files:
         if not f.endswith(".pdf"):
             continue
         outpath = os.path.join(target, os.path.relpath(basepath, source))
         # splitext removes only the final extension, so "ward.12.pdf" maps
         # to "ward.12.csv". Cutting at the first dot would map every
         # "ward.*.pdf" in a folder onto the same "ward.csv".
         outfile = os.path.join(outpath, os.path.splitext(f)[0] + ".csv")
         if not os.path.exists(outpath):
             os.makedirs(outpath)
         html = convert_pdf(os.path.join(basepath, f))
         # Parenthesised so the line parses under both Python 2 and Python 3.
         print(outfile)
         write_csv(get_voters(get_strings(html)), outfile)
	# PDF to CSV Converter for Mah Electoral rolls
	# Usage:
	# 1. Place all pdf folders in "source"
	# 2. Set "target" to be the folder where you want output files exported
	# Note: inner directory structure will be maintained.

	from pdfminer.pdfinterp import PDFResourceManager, process_pdf
	from pdfminer.converter import HTMLConverter
	from pdfminer.layout import LAParams
	from cStringIO import StringIO
	import os, re, csv

	def convert_pdf(path):
		"""Render the PDF at `path` to HTML with pdfminer and return the markup.

		The file is opened in binary mode and fed through an ``HTMLConverter``
		built with default ``LAParams`` and a UTF-8 codec. The converter writes
		into an in-memory ``StringIO`` buffer, whose contents are returned.

		The PDF handle, the converter and the buffer are all closed even when
		``process_pdf`` raises; the exception itself is propagated unchanged.
		"""
		rsrcmgr = PDFResourceManager()
		retstr = StringIO()
		try:
			device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
			try:
				# open() rather than the Python-2-only file() builtin.
				with open(path, 'rb') as fp:
					process_pdf(rsrcmgr, device, fp)
			finally:
				device.close()
			# Named `html` rather than `str` so the builtin is not shadowed.
			html = retstr.getvalue()
		finally:
			retstr.close()
		return html

	def get_strings(html):
		"""Return, in order, each run of text lying between a '>' and the next '<'.

		Runs are returned exactly as they appear in `html` (not stripped); a
		'>' immediately followed by '<' contributes nothing.
		"""
		return re.findall('(?<=>)([^<]+?)(?=<)', html)

	def get_voters(strings):
		"""Assemble voter records from the ordered text fragments of a roll.

		Each fragment is stripped and also normalised (spaces removed,
		lower-cased) for matching. Blank fragments are ignored. `stype` holds
		the kind assigned to the current fragment and `prev` the kind assigned
		to the previous non-blank one; most fields are recognised purely by
		what preceded them:

		* a fragment containing "pincode" whose normalised form is longer than
		  20 characters becomes the address given to voters started after it;
		* a fragment starting with "mt/" or "nct" is stored as the current
		  voter's "id" (if a voter is open) and nothing else is checked;
		* a fragment containing "photoavailable"/"photonotavailable", or
		  starting with "partno", closes the current voter (kept only if it has
		  an "age") and opens a new one;
		* the fragment after such a marker is "father_or_husband"; if it holds a
		  lowercase letter directly followed by an uppercase one it is split
		  there into "father_or_husband" and "name";
		* "Age :..." gives "age", the next fragment "sex", the next one
		  "name"/"house_no" (split on ":"), and the one after that either the
		  expected serial number "sr" or a replacement "house_no";
		* the fragment after an id, unless it starts with "elec", is "house_no".

		Returns a list of dicts; a voter only carries the keys that were found.
		The voter still open when the input ends is kept as well (if it has an
		"age"), so the last record is not lost when no marker follows it.
		"""
		voters = []
		v = prev = stype = addr = None
		nextsr = 1
		for s in strings:
			s = s.strip()
			ns = s.replace(" ", "").lower()
			if not ns:
				continue

			if "pincode" in ns and len(ns) > 20:
				addr = s

			prev = stype
			stype = None

			if ns.startswith(("mt/", "nct")):
				# Only consumed as an id when a voter is open; otherwise the
				# fragment falls through to the checks below.
				if v:
					v["id"] = s
					stype = "id"
					continue

			if ("photoavailable" in ns) or ("photonotavailable" in ns) \
					or ns.startswith("partno"):
				# Marker fragment: flush the finished voter and start a new one.
				if v and "age" in v:
					voters.append(v)
				v = {"addr": addr}
				stype = "new"

			if prev == "new":
				v["father_or_husband"] = s
				# Two names run together ("...KumarShyam...") are split at the
				# first lowercase-to-uppercase boundary.
				joined = re.findall("[a-z][A-Z]", s)
				if joined:
					i = s.index(joined[0])
					v["father_or_husband"] = s[:i+1]
					v["name"] = s[i+1:]

			if s.startswith("Age :"):
				if not v:
					v = {}
				s = s.replace("Sex", "")
				v["age"] = s.split(":")[1].strip()
				stype = "age"

			if prev == "age":
				if "Male" in s:
					v["sex"] = "Male"
				if "Female" in s:
					v["sex"] = "Female"
				stype = "sex"

			if prev == "sex":
				p = s.split(":")
				if p[0].strip():
					v["name"] = p[0].strip()
				if p[-1].strip():
					v["house_no"] = p[-1].strip()
				stype = "house"

			if prev == "house":
				if s == str(nextsr):
					v["sr"] = s
					nextsr = int(v["sr"]) + 1
					stype = "sr"
				else:
					# NOTE(review): this tests the raw fragment `s`, not the
					# normalised `ns`, so it only matches a literal lowercase
					# "houseno" - confirm that is intended.
					if "houseno" not in s:
						v["house_no"] = s
						stype = "house"

			if prev == "id":
				if not ns.startswith("elec"):
					v["house_no"] = s

		# Flush the voter still open at end of input. A voter flushed by a
		# marker has already been replaced by a fresh dict without "age", so
		# this never appends the same record twice.
		if v and "age" in v:
			voters.append(v)

		return voters

	def write_csv(voters, filename):
		"""Write `voters` (a sequence of dicts) to `filename` as CSV.

		Columns are emitted in a fixed order. The header row is the column key
		with underscores turned into spaces and title-cased, and a key missing
		from a voter is written as an empty cell.
		"""
		keys = ["sr", "id", "name", "age", "sex", "father_or_husband", "house_no", "addr"]
		with open(filename, "w") as f:
			writer = csv.writer(f)
			writer.writerow([k.replace("_", " ").title() for k in keys])
			for v in voters:
				writer.writerow([v.get(k, "") for k in keys])


	source = "source"
	target = "out"

	# Mirror the directory layout found under `source` inside `target`, writing
	# one CSV next to the relative position of every PDF.
	for basepath, folders, files in os.walk(source):
		for f in files:
			if not f.endswith(".pdf"):
				continue
			outpath = os.path.join(target, os.path.relpath(basepath, source))
			# splitext removes only the final extension, so "ward.12.pdf" maps
			# to "ward.12.csv". Cutting at the first dot would map every
			# "ward.*.pdf" in a folder onto the same "ward.csv".
			outfile = os.path.join(outpath, os.path.splitext(f)[0] + ".csv")
			if not os.path.exists(outpath):
				os.makedirs(outpath)
			html = convert_pdf(os.path.join(basepath, f))
			# Parenthesised so the line parses under both Python 2 and Python 3.
			print(outfile)
			write_csv(get_voters(get_strings(html)), outfile)