Skip to content

Instantly share code, notes, and snippets.

@do-me
Created May 25, 2019 11:28
Show Gist options
  • Save do-me/b6715dc90fccdb145cc38476de0a0498 to your computer and use it in GitHub Desktop.
Save do-me/b6715dc90fccdb145cc38476de0a0498 to your computer and use it in GitHub Desktop.
# Pdf2wordcloud
# 1) pdf to text object
import os
import PyPDF2
from PyPDF2 import PdfFileWriter
party = "grüne"  # and others: Linke, Grüne, SPD, FDP, CDU/CSU, AfD
# Base folder holding the programme PDFs. It must end with "/" because the
# paths below are built by plain string concatenation.
pa = "C:/Users/Dome/Desktop/nu/Wahlprogramme 2017/"
os.chdir(pa)
pdoc = pa + party + ".pdf"

# Extract the text of every page. The PDF is opened in a `with` block so the
# file handle is closed even if extraction fails part-way through; the pages
# are read while the file is still open.
# NOTE(review): PdfFileReader / numPages / getPage / extractText are the
# legacy PyPDF2 names this script was written against -- confirm they are
# still available in the installed PyPDF2 version.
with open(pdoc, "rb") as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    page_texts = [
        pdfReader.getPage(page_no).extractText()
        for page_no in range(num_pages)
    ]

# Join all pages into one string, each page preceded by a single space so
# the last word of one page does not fuse with the first word of the next.
li = "".join(" " + page_text for page_text in page_texts)
###################################
##### 2) TEXT CLEANING ############
from stop_words import get_stop_words
import re
sw = get_stop_words('de')  # German stopwords. Not as useful as EN version

# Remove numbers. re.sub returns a NEW string, so the result has to be kept;
# calling it without using the return value removes nothing.
text_no_digits = re.sub(r'[0-9]+', '', li)

# replace most common noise, then split into raw tokens on single spaces
liste = (
    text_no_digits
    .replace('\n', ' ')
    .replace('œ', ' ')
    .replace('Œ', ' ')
    .replace("-", "")
    .replace("KAPITEL", "")
    .replace("Kapitel", "")
    .split(" ")
)

# Keep purely alphabetic tokens only; this also drops the empty strings that
# split(" ") produces for runs of consecutive spaces.
stz = [word for word in liste if word.isalpha()]

# extend for most common meaningless words and word fragments
sw.extend(("macht","schen","schon","Kapitel","setzt","weitere","lehnt","viel",
           "stellen","gehen","chen","geht","gilt","lehnen", "viele","gibt",
           "darf","halten","dürfen","neben","gehört","vielen","jedoch",
           "braucht", "nehmen","rung","seit","sollten","deren","etwa","beim",
           "außerdem","stehen", "sitzen", "mehr","sollen","müssen","sowie",
           "deshalb","daher","dafür","dabei","brauchen","zudem","setzen",
           "besser","neue","neu","neuen","immer","gute"))

# Words are compared in lower case, so the stopwords must be lower-cased too
# (otherwise capitalised entries such as "Kapitel" can never match). A set
# gives constant-time membership tests instead of scanning the list per word.
stop_set = {stopword.lower() for stopword in sw}

# keep only words with at least 4 characters that are not stopwords
tes = [word for word in stz
       if len(word) >= 4 and word.lower() not in stop_set]
###################################
##### 3) WORD CLOUD ###############
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
# Collapse the filtered token list into one space-separated string, which is
# what generate_wordcloud() hands to WordCloud.generate().
words = ' '.join(tes)

# Load the flag picture as an array; it is used both as the shape mask of
# the cloud and as the colour source for its words.
flag_image = Image.open(
    "C:/Users/Dome/Desktop/nu/Wahlprogramme 2017/germany-flag-medium.png")
mask = np.array(flag_image)
image_colors = ImageColorGenerator(mask)
# function takes in your text and your mask and generates a wordcloud
def generate_wordcloud(words, mask):
    """Build a word cloud from *words*, save it as a PNG and display it.

    Args:
        words: The text handed to ``WordCloud.generate``.
        mask: Array passed to ``WordCloud`` as ``mask``.

    Besides its arguments the function reads three module-level names:
    ``image_colors`` (colour function used to recolour the cloud) and
    ``pa`` / ``party`` (the image is written to
    ``pa + "Wordclouds/" + party + ".png"``).

    Returns:
        None. The figure is saved to disk and then shown with ``plt.show()``.
    """
    word_cloud = WordCloud(width=512, height=512, max_words=100,
                           background_color='white', stopwords=STOPWORDS,
                           mask=mask).generate(words)

    plt.figure(figsize=(10, 8), facecolor='white', edgecolor='blue')
    plt.imshow(word_cloud.recolor(color_func=image_colors),  # colors
               interpolation="bilinear")
    plt.axis('off')
    plt.tight_layout(pad=0)

    # Make sure the output folder exists before saving; savefig does not
    # create missing directories.
    out_dir = pa + "Wordclouds/"
    os.makedirs(out_dir, exist_ok=True)
    # Save before show(): the figure may no longer be available afterwards.
    plt.savefig(out_dir + party + ".png", format="png")  # save
    plt.show()


# run function to create wordcloud
generate_wordcloud(words, mask)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment