Created
September 10, 2017 06:23
-
-
Save nikhilkumarsingh/f8cc0590b1b7967ee172a5a0e1a8507b to your computer and use it in GitHub Desktop.
Word cloud of Wikipedia articles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from os import path | |
import numpy as np | |
from PIL import Image | |
import wikipedia | |
from wordcloud import WordCloud, STOPWORDS | |
# directory that contains this script; file names used below are joined onto it
currdir = path.split(__file__)[0]
def get_wiki(query):
    """Return the text content of the Wikipedia page best matching *query*.

    Args:
        query: Free-text search string passed to ``wikipedia.search``.

    Returns:
        The ``content`` attribute of the page loaded for the first search
        result.

    Raises:
        IndexError: If the search yields no results for *query*.

    Errors raised by the ``wikipedia`` package itself while searching or
    loading the page are not caught here and propagate to the caller.
    """
    # get best matching title for given query
    results = wikipedia.search(query)
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that tells the caller which query failed.
        raise IndexError(
            "no Wikipedia article found for query {!r}".format(query)
        )
    title = results[0]

    # get wikipedia page for selected title; the title already comes from
    # search(), so disable page()'s own suggestion lookup, which could
    # otherwise resolve to a different article than the one selected.
    page = wikipedia.page(title, auto_suggest=False)
    return page.content
def create_wordcloud(text, mask_file="cloud1.png", output_file="wcloud.png"):
    """Generate a word cloud image from *text* and save it to a file.

    Args:
        text: Text the word cloud is generated from.
        mask_file: File name of the mask image, joined onto ``currdir``.
            Defaults to ``"cloud1.png"``.
        output_file: File name the rendered word cloud is written to,
            joined onto ``currdir``. Defaults to ``"wcloud.png"``.

    Returns:
        None. The result is written to *output_file*.
    """
    # create numpy array for wordcloud mask image; the with-block closes the
    # image file as soon as its pixel data has been copied into the array
    with Image.open(path.join(currdir, mask_file)) as mask_image:
        mask = np.array(mask_image)

    # create set of stopwords
    stopwords = set(STOPWORDS)

    # create wordcloud object
    wc = WordCloud(
        background_color="white",
        max_words=200,
        mask=mask,
        stopwords=stopwords,
    )

    # generate wordcloud
    wc.generate(text)

    # save wordcloud
    wc.to_file(path.join(currdir, output_file))
if __name__ == "__main__":
    # fetch the article text and turn it straight into a word cloud image
    create_wordcloud(get_wiki("python programming language"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment