Last active
December 21, 2015 00:18
-
-
Save mihi-tr/6218674 to your computer and use it in GitHub Desktop.
Scrape the names from Statistik Austria's naming PDF, obtained here: http://images.derstandard.at/2013/08/12/VN2p_2012.pdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scraperwiki | |
import itertools, re, csv | |
import lxml | |
# Configuration: path of the source PDF and the page ranges to scan.
filename = "/home/mihi/Downloads/VN2p_2012.pdf"

# [lower, upper] page-number bounds for the boys' and girls' name tables.
range_boys = [0, 20]
range_girls = [20, 43]
def split_element(e):
    """Split an element's text into tokens separated by runs of spaces.

    Asterisks are removed from the text before splitting.

    Args:
        e: An object exposing a ``text`` attribute (a string or ``None``).

    Returns:
        The list of tokens. Leading or trailing spaces yield empty-string
        tokens, and an element without text yields ``[""]``, exactly as an
        element with empty text does.
    """
    # An element with no text content reports ``None``; treat it as empty
    # text instead of failing on ``None.replace``.
    text = e.text or ""
    return re.split("[ ]+", text.replace("*", ""))
def take4(x):
    """Cut a token list into one or two rows.

    The first row is always the first four tokens. When the list holds more
    than five tokens, everything from the fifth token onward becomes a second
    row. With exactly five tokens the fifth one is discarded.

    Args:
        x: A sequence of tokens.

    Returns:
        A list containing one or two slices of ``x``.
    """
    rows = [x[:4]]
    if len(x) > 5:
        rows.append(x[4:])
    return rows
def select(r, rng):
    """Run the page-range XPath query against ``r`` and return its result.

    The query matches ``b`` children of ``text`` nodes whose ``left``
    attribute is "64", on ``page`` nodes whose ``number`` attribute is
    greater than ``rng[0]`` and at most ``rng[1]``.

    Args:
        r: An object providing an ``xpath`` method.
        rng: A two-item sequence holding the lower and upper page bounds.

    Returns:
        Whatever ``r.xpath`` returns for the query.
    """
    lower = rng[0]
    upper = rng[1]
    query = '//page[@number>"%s" and @number<="%s"]/text[@left="64"]/b' % (lower, upper)
    return r.xpath(query)
# A bare `import lxml` does not load the etree submodule, so import it
# explicitly instead of relying on another package having loaded it.
import lxml.etree


def _extract_rows(elements, gender):
    """Turn selected elements into name rows, each ending in ``gender``."""
    rows = []
    for element in elements:
        # Splitting on spaces can leave empty tokens; drop them.
        tokens = [token for token in split_element(element) if token != ""]
        # A line may carry two table columns; take4 separates them.
        for row in take4(tokens):
            rows.append(row + [gender])
    return rows


# Read the PDF as raw bytes (text mode may alter binary data on some
# platforms) and convert it to XML; the file is closed even on error.
with open(filename, "rb") as pdf_file:
    xml = scraperwiki.pdftoxml(pdf_file.read())

# Parse the XML.
r = lxml.etree.fromstring(xml)

# Select the boys' and girls' lines and turn them into rows tagged with gender.
boys = _extract_rows(select(r, range_boys), "m")
girls = _extract_rows(select(r, range_girls), "f")

# Put the two lists together.
names = boys + girls

# Filter out extra headers etc.: rows whose first field is upper case or
# "der", and rows whose first field is just the gender marker (rows that had
# no tokens of their own).
names = [
    row for row in names
    if not row[0].isupper() and row[0] not in ("der", "m", "f")
]

# Write the result as CSV. This script targets Python 2, whose csv module
# expects a binary-mode file and byte strings, hence "wb" and the explicit
# UTF-8 encoding of every field.
with open("names.csv", "wb") as csv_file:
    w = csv.writer(csv_file)
    # Header row.
    w.writerow(["Name", "Absolut", "Prozent", "Rank", "Gender"])
    # One row per name.
    for row in names:
        w.writerow([field.encode("utf-8") for field in row])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment