sohang3112 · July 26, 2024 09:13
diff --git a/wordcloud.py b/wordcloud.py
 from collections import Counter
 from io import BytesIO
 import os

 # Optional: point NLTK at resources that were already downloaded to a
 # non-standard location, instead of downloading them when this file runs.
 # Especially useful on AWS Lambda: put the already downloaded nltk resources
 # in a Layer and use them from the function.
 # Here path = /opt because that's where AWS Lambda puts all Layer contents.
 # ALREADY_DOWNLOADED_NLTK_PATH = '/opt'
 ALREADY_DOWNLOADED_NLTK_PATH = None       # None: nltk resources are not already downloaded, so they get downloaded further down

 if ALREADY_DOWNLOADED_NLTK_PATH is not None:
  print("Using already downloaded nltk resources...")
  # Set before the nltk imports that follow.
  # NOTE(review): nltk is expected to read NLTK_DATA when it is imported - confirm.
  os.environ['NLTK_DATA'] =  ALREADY_DOWNLOADED_NLTK_PATH
  # The following already downloaded nltk resources are used
  # (paths inside the folder ALREADY_DOWNLOADED_NLTK_PATH):
  # * corpora/stopwords/
  # * tokenizers/punkt/

 from PIL import Image
 from wordcloud import WordCloud
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import grapheme

 if ALREADY_DOWNLOADED_NLTK_PATH is None:
     # Only names from nltk submodules are imported at the top of the file;
     # the `nltk` package itself is not, so nltk.download would raise
     # NameError without this import.
     import nltk

     print("Downloading nltk resources...")
     nltk.download('punkt')
     # Newer NLTK releases load the word tokenizer data from 'punkt_tab'
     # rather than 'punkt'; fetch both so word_tokenize works on either.
     nltk.download('punkt_tab')
     nltk.download('stopwords')

 # Loaded once when the file runs; wordcloud() filters tokens against this list.
 english_stopwords = stopwords.words('english')

 # Download this font from https://github.com/stamen/toner-carto/blob/master/fonts/Arial-Unicode-Regular.ttf
 # and then put the location of the downloaded file in this variable.
 # Reason for using Arial-Unicode-Regular.ttf: it is chosen for its broad Unicode
 # glyph coverage, so that words in non-Latin scripts can be rendered.
 # NOTE(review): the original comment claims it covers ALL languages - confirm for the scripts you need.
 font_path = '/path/to/Arial-Unicode-Regular.ttf'   

 def pillow_image_to_bytes(image: "Image.Image", extension: str) -> bytes:
     """Encode a Pillow image into an in-memory byte string.

     Args:
         image: Image to encode. Annotated as ``Image.Image`` (the class),
             because the bare name ``Image`` imported from PIL is the module.
             The annotation is quoted so it is not evaluated at definition time.
         extension: Format name handed to ``image.save`` (e.g. ``'JPEG'``).

     Returns:
         Everything ``image.save`` wrote, as bytes.
     """
     buffer = BytesIO()
     # Pass the format by keyword: the target is a buffer without a file name,
     # so the format has to be stated explicitly.
     image.save(buffer, format=extension)
     return buffer.getvalue()
  
 def wordcloud(text: str, image_name: str) -> bytes:
     """Draw a word cloud of ``text`` and return the rendered image as bytes.

     English stopwords and single-character words are removed before drawing.
     Stopwords are matched case-insensitively, so capitalised forms such as
     "The" are dropped too; the words that remain keep their original casing.

     Args:
         text: Text to tokenize and draw.
         image_name: Not used by this function; kept so existing callers
             keep working.

     Returns:
         Bytes of the rendered 800x400 word cloud, saved as JPEG.
     """
     # A set gives constant-time membership tests. The token is lowercased for
     # the comparison because NLTK's English stopword list is lowercase.
     stopword_set = set(english_stopwords)
     words = [
         word
         for word in word_tokenize(text)
         # grapheme.length measures length in grapheme clusters, so the
         # single-character check also holds for multi-code-point characters.
         if word.lower() not in stopword_set and grapheme.length(word) > 1
     ]
     print("Words after removing stopwords:", words)
     word_freqs = Counter(words)
     wc = WordCloud(width=800, height=400, background_color='white', font_path=font_path)
     wc.generate_from_frequencies(word_freqs)
     image = wc.to_image()
     return pillow_image_to_bytes(image, 'JPEG')
	from collections import Counter
	from io import BytesIO
	import os

	# Optional: point NLTK at resources that were already downloaded to a
	# non-standard location, instead of downloading them when this file runs.
	# Especially useful on AWS Lambda: put the already downloaded nltk resources
	# in a Layer and use them from the function.
	# Here path = /opt because that's where AWS Lambda puts all Layer contents.
	# ALREADY_DOWNLOADED_NLTK_PATH = '/opt'
	ALREADY_DOWNLOADED_NLTK_PATH = None  # None: nltk resources are not already downloaded, so they get downloaded further down

	if ALREADY_DOWNLOADED_NLTK_PATH is not None:
	    print("Using already downloaded nltk resources...")
	    # Set before the nltk imports that follow.
	    # NOTE(review): nltk is expected to read NLTK_DATA when it is imported - confirm.
	    os.environ['NLTK_DATA'] = ALREADY_DOWNLOADED_NLTK_PATH
	    # The following already downloaded nltk resources are used
	    # (paths inside the folder ALREADY_DOWNLOADED_NLTK_PATH):
	    # * corpora/stopwords/
	    # * tokenizers/punkt/

	from PIL import Image
	from wordcloud import WordCloud
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	import grapheme

	if ALREADY_DOWNLOADED_NLTK_PATH is None:
	    # Only names from nltk submodules are imported at the top of the file;
	    # the `nltk` package itself is not, so nltk.download would raise
	    # NameError without this import.
	    import nltk

	    print("Downloading nltk resources...")
	    nltk.download('punkt')
	    # Newer NLTK releases load the word tokenizer data from 'punkt_tab'
	    # rather than 'punkt'; fetch both so word_tokenize works on either.
	    nltk.download('punkt_tab')
	    nltk.download('stopwords')

	# Loaded once when the file runs; wordcloud() filters tokens against this list.
	english_stopwords = stopwords.words('english')

	# Download this font from https://github.com/stamen/toner-carto/blob/master/fonts/Arial-Unicode-Regular.ttf
	# and then put the location of the downloaded file in this variable.
	# Reason for using Arial-Unicode-Regular.ttf: it is chosen for its broad Unicode
	# glyph coverage, so that words in non-Latin scripts can be rendered.
	# NOTE(review): the original comment claims it covers ALL languages - confirm for the scripts you need.
	font_path = '/path/to/Arial-Unicode-Regular.ttf'

	def pillow_image_to_bytes(image: "Image.Image", extension: str) -> bytes:
	    """Encode a Pillow image into an in-memory byte string.

	    Args:
	        image: Image to encode. Annotated as ``Image.Image`` (the class),
	            because the bare name ``Image`` imported from PIL is the module.
	            The annotation is quoted so it is not evaluated at definition time.
	        extension: Format name handed to ``image.save`` (e.g. ``'JPEG'``).

	    Returns:
	        Everything ``image.save`` wrote, as bytes.
	    """
	    buffer = BytesIO()
	    # Pass the format by keyword: the target is a buffer without a file name,
	    # so the format has to be stated explicitly.
	    image.save(buffer, format=extension)
	    return buffer.getvalue()

	def wordcloud(text: str, image_name: str) -> bytes:
	    """Draw a word cloud of ``text`` and return the rendered image as bytes.

	    English stopwords and single-character words are removed before drawing.
	    Stopwords are matched case-insensitively, so capitalised forms such as
	    "The" are dropped too; the words that remain keep their original casing.

	    Args:
	        text: Text to tokenize and draw.
	        image_name: Not used by this function; kept so existing callers
	            keep working.

	    Returns:
	        Bytes of the rendered 800x400 word cloud, saved as JPEG.
	    """
	    # A set gives constant-time membership tests. The token is lowercased for
	    # the comparison because NLTK's English stopword list is lowercase.
	    stopword_set = set(english_stopwords)
	    words = [
	        word
	        for word in word_tokenize(text)
	        # grapheme.length measures length in grapheme clusters, so the
	        # single-character check also holds for multi-code-point characters.
	        if word.lower() not in stopword_set and grapheme.length(word) > 1
	    ]
	    print("Words after removing stopwords:", words)
	    word_freqs = Counter(words)
	    wc = WordCloud(width=800, height=400, background_color='white', font_path=font_path)
	    wc.generate_from_frequencies(word_freqs)
	    image = wc.to_image()
	    return pillow_image_to_bytes(image, 'JPEG')