abehmiel · November 1, 2017 21:12
diff --git a/fix_exhibit_b.py b/fix_exhibit_b.py
 # It's really stupid when the gov't releases pdf's of tabular data. So I made a quick, hacky script to
 # fix their mistakes for them. (I'm referring to https://t.co/oOyhHNVvjS )

 # requirements:
 # pandas
 # tabula-py

 import pandas as pd
 from tabula import read_pdf

 # read the pdf-- it's all messed up and only one space-delimited column.
 # also it defaults to only loading one page unless you specify pages='all' or
 # a different int or list.
 # Ask for single-table (CSV-style) extraction so every page lands in one
 # table with one header row.
 # NOTE(review): newer tabula-py releases return a *list* of DataFrames from
 # read_pdf, older ones a bare DataFrame -- confirm against the installed
 # version. The normalisation below accepts either shape.
 tables = read_pdf("exhibit_b.pdf", pages='all', multiple_tables=False)
 if isinstance(tables, list):
     df = pd.concat(tables, ignore_index=True)
 else:
     df = tables

 # fix the columns: the single 'user id handle' column holds two
 # whitespace-separated tokens per row. Split each row once with the
 # vectorised .str accessor; rows that are missing or have fewer than two
 # tokens become NaN instead of raising AttributeError/IndexError.
 parts = df['user id handle'].str.split()
 df['user id'] = parts.str[0]
 df['handle'] = parts.str[1]
 df = df.drop('user id handle', axis=1)

 # output to csv
 df.to_csv('exhibit_b.csv', index=False)
	# It's really stupid when the gov't releases pdf's of tabular data. So I made a quick, hacky script to
	# fix their mistakes for them. (I'm referring to https://t.co/oOyhHNVvjS )

	# requirements:
	# pandas
	# tabula-py

	import pandas as pd
	from tabula import read_pdf

	# read the pdf-- it's all messed up and only one space-delimited column.
	# also it defaults to only loading one page unless you specify pages='all' or
	# a different int or list.
	# Ask for single-table (CSV-style) extraction so every page lands in one
	# table with one header row.
	# NOTE(review): newer tabula-py releases return a *list* of DataFrames from
	# read_pdf, older ones a bare DataFrame -- confirm against the installed
	# version. The normalisation below accepts either shape.
	tables = read_pdf("exhibit_b.pdf", pages='all', multiple_tables=False)
	if isinstance(tables, list):
	    df = pd.concat(tables, ignore_index=True)
	else:
	    df = tables

	# fix the columns: the single 'user id handle' column holds two
	# whitespace-separated tokens per row. Split each row once with the
	# vectorised .str accessor; rows that are missing or have fewer than two
	# tokens become NaN instead of raising AttributeError/IndexError.
	parts = df['user id handle'].str.split()
	df['user id'] = parts.str[0]
	df['handle'] = parts.str[1]
	df = df.drop('user id handle', axis=1)

	# output to csv
	df.to_csv('exhibit_b.csv', index=False)