Last active
July 26, 2024 09:13
-
-
Save sohang3112/8bfca1dc8ac95a67748b90767e2c1a11 to your computer and use it in GitHub Desktop.
Draw wordcloud in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
from io import BytesIO | |
import os | |
# Optional: set this if the NLTK resources were already downloaded to a non-standard location.
# Especially useful on AWS Lambda: ship the pre-downloaded NLTK resources in a Layer and reuse them.
# The example path is /opt because that's where AWS Lambda puts all Layer contents.
# ALREADY_DOWNLOADED_NLTK_PATH = '/opt'
ALREADY_DOWNLOADED_NLTK_PATH = None     # nltk resources not already downloaded, to download
if ALREADY_DOWNLOADED_NLTK_PATH is not None:
    print("Using already downloaded nltk resources...")
    # NOTE(review): this runs before the nltk imports further down — presumably so that
    # nltk sees NLTK_DATA when it is first imported; confirm before reordering.
    os.environ['NLTK_DATA'] = ALREADY_DOWNLOADED_NLTK_PATH
    # The following pre-downloaded NLTK resources are used
    # (paths relative to ALREADY_DOWNLOADED_NLTK_PATH):
    # * corpora/stopwords/
    # * tokenizers/punkt/
from PIL import Image | |
from wordcloud import WordCloud | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
import grapheme | |
if ALREADY_DOWNLOADED_NLTK_PATH is None:
    # Nothing was pre-downloaded, so fetch the required NLTK resources now.
    # `nltk` must be imported explicitly: the `from nltk.tokenize ...` / `from nltk.corpus ...`
    # imports only bind names from its submodules, so the bare name `nltk` would
    # otherwise raise NameError on the download calls below.
    import nltk

    print("Downloading nltk resources...")
    nltk.download('punkt')
    # Recent NLTK releases (3.9 and later) make word_tokenize load 'punkt_tab'
    # instead of 'punkt'; fetching both keeps tokenization working on either version.
    nltk.download('punkt_tab')
    nltk.download('stopwords')

# English stopwords that are filtered out of the text before the word cloud is drawn.
english_stopwords = stopwords.words('english')
# Path to the font file handed to WordCloud for rendering.
# Download the font from https://github.com/stamen/toner-carto/blob/master/fonts/Arial-Unicode-Regular.ttf
# and then set this variable to its location.
# Reason for using the Arial-Unicode-Regular.ttf font: it supports ALL languages' unicode rendering glyphs.
font_path = '/path/to/Arial-Unicode-Regular.ttf'
def pillow_image_to_bytes(image: "Image.Image", extension: str) -> bytes:
    """Encode a Pillow image in memory and return the encoded bytes.

    Args:
        image: The Pillow image to encode.
        extension: Format name handed to ``image.save`` (e.g. ``'JPEG'``).

    Returns:
        Everything ``image.save`` wrote into the in-memory buffer.
    """
    # The annotation names the ``Image.Image`` class; ``Image`` on its own is the
    # PIL module and is not a valid type for the parameter.
    with BytesIO() as buffer:
        image.save(buffer, extension)
        return buffer.getvalue()
def wordcloud(text: str, image_name: str) -> bytes:
    """Draw a word cloud of ``text`` and return the rendered image as JPEG bytes.

    Stopwords and single-character words are removed before drawing the word cloud.
    Stopwords are matched case-insensitively, so capitalised forms such as "The"
    are dropped as well. Word length is measured with ``grapheme.length``, not by
    counting code points.

    Args:
        text: The text to visualise.
        image_name: Currently unused; kept so existing callers keep working.

    Returns:
        The bytes of the rendered 800x400, white-background image, encoded as JPEG.

    Raises:
        ValueError: If no words remain after filtering.
    """
    # Build the lookup set once per call: set membership is O(1), whereas testing
    # against the stopword sequence directly rescans it for every token.
    stopword_set = {stopword.lower() for stopword in english_stopwords}
    words = [
        word
        for word in word_tokenize(text)
        # Lower-case only for the comparison; the token keeps its original casing.
        if word.lower() not in stopword_set and grapheme.length(word) > 1
    ]
    print("Words after removing stopwords:", words)
    if not words:
        # Fail early with a clear message instead of handing an empty
        # frequency table to WordCloud.
        raise ValueError(
            "No words left to draw after removing stopwords and single-character words"
        )
    word_freqs = Counter(words)
    wc = WordCloud(width=800, height=400, background_color='white', font_path=font_path)
    wc.generate_from_frequencies(word_freqs)
    image = wc.to_image()
    return pillow_image_to_bytes(image, 'JPEG')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment