@sohang3112
Last active July 26, 2024 09:13
Draw wordcloud in Python
from collections import Counter
from io import BytesIO
import os
# Optional: if you have already downloaded the NLTK resources to a non-standard location, set that path here.
# Especially useful on AWS Lambda - you can package the pre-downloaded NLTK resources in a Layer and use them from there.
# In that case the path would be /opt because that's where AWS Lambda unpacks all Layer contents.
# ALREADY_DOWNLOADED_NLTK_PATH = '/opt'
ALREADY_DOWNLOADED_NLTK_PATH = None   # None means the NLTK resources are not yet downloaded and will be fetched below

if ALREADY_DOWNLOADED_NLTK_PATH is not None:
    print("Using already downloaded nltk resources...")
    os.environ['NLTK_DATA'] = ALREADY_DOWNLOADED_NLTK_PATH
    # The following already-downloaded NLTK resources are used (paths inside ALREADY_DOWNLOADED_NLTK_PATH):
    # * corpora/stopwords/
    # * tokenizers/punkt/

from PIL import Image
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import grapheme

if ALREADY_DOWNLOADED_NLTK_PATH is None:
    print("Downloading nltk resources...")
    nltk.download('punkt')
    nltk.download('stopwords')

english_stopwords = stopwords.words('english')
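
# Optional sanity check (a minimal sketch, not part of the original gist): nltk.data.find() raises
# LookupError if a resource is missing, so this fails fast when the Layer path or download went wrong.
for resource in ['tokenizers/punkt', 'corpora/stopwords']:
    nltk.data.find(resource)   # raises LookupError if the resource cannot be located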
# Download this font from https://github.com/stamen/toner-carto/blob/master/fonts/Arial-Unicode-Regular.ttf
# and put its location in this variable.
# Arial-Unicode-Regular.ttf is used because it has glyphs for rendering Unicode text across a very wide range of languages.
font_path = '/path/to/Arial-Unicode-Regular.ttf'

def pillow_image_to_bytes(image: Image.Image, extension: str) -> bytes:
    """Serialize a Pillow image to raw bytes in the given format (e.g. 'JPEG', 'PNG')."""
    out = BytesIO()
    image.save(out, extension)
    return out.getvalue()

def wordcloud(text: str) -> bytes:
    """Draw a wordcloud of the text and return the bytes of the rendered wordcloud image (JPEG).

    Stopwords and single-character words are removed before drawing the wordcloud.
    """
    words = [
        word
        for word in word_tokenize(text)
        if word not in english_stopwords and grapheme.length(word) > 1
    ]
    print("Words after removing stopwords:", words)
    word_freqs = Counter(words)
    wc = WordCloud(width=800, height=400, background_color='white', font_path=font_path)
    wc.generate_from_frequencies(word_freqs)
    image = wc.to_image()
    return pillow_image_to_bytes(image, 'JPEG')
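
# Example usage (a minimal sketch, not part of the original gist): the sample text and output
# filename below are arbitrary placeholders - substitute your own.
if __name__ == '__main__':
    sample_text = "Hello world! Word clouds show the most frequent words in bigger fonts."
    image_bytes = wordcloud(sample_text)
    with open('wordcloud.jpg', 'wb') as f:   # write the rendered JPEG bytes to disk
        f.write(image_bytes)
    print("Wordcloud image saved to wordcloud.jpg")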