Last active
December 15, 2015 14:29
-
-
Save singingwolfboy/5275037 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Andreas Christian Mueller <[email protected]>
# (c) 2012
#
# License: MIT
import os | |
import sys | |
import random | |
from PIL import Image | |
from PIL import ImageDraw | |
from PIL import ImageFont | |
import numpy as np | |
from query_integral_image import query_integral_image | |
from sklearn.feature_extraction.text import CountVectorizer | |
# Default TrueType font file, used by make_wordcloud() when the caller does
# not pass ``font_path``.
# Alternative that was tried: "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"
FONT_PATH = "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf"
def _update_integral_image(integral, img_array, x, y):
    """Refresh ``integral[x:, y:]`` in place after drawing at row x, column y.

    ``integral`` holds the two-dimensional cumulative sum of ``img_array``.
    Only the part at or below row ``x`` and at or right of column ``y`` is
    recomputed; the rows above and the columns to the left are reused as-is.
    """
    # the order of the cumsum's is important for speed ?!
    partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
                                 axis=0)
    # Add the contribution of everything above / left of the recomputed
    # region; x == 0 or y == 0 means there is no such strip to add.
    if x > 0:
        if y > 0:
            # The top-left corner would otherwise be counted twice (once
            # here and once in the left-column term below).
            partial_integral += (integral[x - 1, y:]
                                 - integral[x - 1, y - 1])
        else:
            partial_integral += integral[x - 1, y:]
    if y > 0:
        partial_integral += integral[x:, y - 1][:, np.newaxis]
    integral[x:, y:] = partial_integral


def make_wordcloud(words, counts, fname, font_path=None, width=400, height=200,
                   margin=5, ranks_only=False):
    """Build a word cloud from word counts and store it in an image file.

    Parameters
    ----------
    words : numpy array (or sequence) of strings
        Words that will be drawn in the image.
    counts : numpy array (or sequence) of word counts
        Word counts or weighting of words. Determines the order in which
        words are placed (largest first).
        Will be normalized by dividing by the largest count.
    fname : string
        Output filename, passed to the image's ``save`` method.
    font_path : string
        Path to the font that will be used.
        Defaults to the module-level ``FONT_PATH``.
    width : int (default=400)
        Width of the word cloud image.
    height : int (default=200)
        Height of the word cloud image.
    margin : int (default=5)
        Extra space added to each word's box when looking for a free
        position; the word is drawn offset by ``margin // 2``.
    ranks_only : boolean (default=False)
        Only use the rank of the words, not the actual counts. When False,
        the font size of each word is additionally capped at
        ``int(100 * log(count + 100))``.

    Raises
    ------
    ValueError
        If ``counts`` is empty.

    Notes
    -----
    Larger images will make the code significantly slower.
    If you need a large image, you can try running the algorithm at a lower
    resolution and then drawing the result at the desired resolution.

    The font size starts at 1000 and only ever shrinks: each word begins at
    the size the previous word ended up with and is reduced by one until a
    free position is found. Once the size reaches zero no further words are
    drawn.

    Play with setting the font_size in the main loop for different styles.

    Colors are used completely at random. Currently the colors are given as
    HSL strings with a random hue and fixed saturation and lightness.
    Adjusting the percentages at the very end gives different color ranges.

    The finished image is displayed with ``img.show()`` before it is saved.
    """
    # Accept plain sequences as well as arrays; the fancy indexing below
    # needs ndarray semantics.
    words = np.asarray(words)
    counts = np.asarray(counts)

    if len(counts) <= 0:
        # Fail early with a clear message instead of printing and then
        # crashing further down on ``counts.max()``.
        raise ValueError("We need at least 1 word to plot a word cloud, got %d."
                         % len(counts))

    if font_path is None:
        font_path = FONT_PATH

    # normalize counts
    counts = counts / float(counts.max())
    # sort words by counts, largest first
    inds = np.argsort(counts)[::-1]
    counts = counts[inds]
    words = words[inds]

    # Grey-scale canvas used only for occupancy bookkeeping.
    img_grey = Image.new("L", (width, height))
    draw = ImageDraw.Draw(img_grey)
    integral = np.zeros((height, width), dtype=np.uint32)
    font_sizes, positions, orientations = [], [], []

    # initialize font size "large enough"
    font_size = 1000
    # start drawing grey image
    for word, count in zip(words, counts):
        # alternative way to set the font size
        if not ranks_only:
            font_size = min(font_size, int(100 * np.log(count + 100)))

        result = None
        # Shrink the font until the word fits. Stopping at zero *before*
        # loading the font avoids asking for a zero-size font.
        while font_size > 0:
            font = ImageFont.truetype(font_path, font_size)
            # transpose font optionally
            orientation = random.choice([None, Image.ROTATE_90])
            transposed_font = ImageFont.TransposedFont(
                font, orientation=orientation)
            # The font is passed explicitly rather than through the legacy
            # draw.setfont() state.
            # NOTE(review): setfont() appears to be absent from recent Pillow
            # releases -- confirm against the installed version.
            box_size = draw.textsize(word, font=transposed_font)
            # find possible places using integral image:
            result = query_integral_image(integral, box_size[1] + margin,
                                          box_size[0] + margin)
            if result is not None:
                break
            # if we didn't find a place, make font smaller
            font_size -= 1

        if result is None:
            # we were unable to draw any more
            break

        # x is the row and y the column in array terms; drawing calls take
        # (column, row), hence the swapped order below.
        x, y = np.array(result) + margin // 2
        # actually draw the text
        draw.text((y, x), word, fill="white", font=transposed_font)
        positions.append((x, y))
        orientations.append(orientation)
        font_sizes.append(font_size)

        # recompute the bottom-right part of the integral image
        img_array = np.asarray(img_grey)
        _update_integral_image(integral, img_array, x, y)

    # redraw in color
    img = Image.new("RGB", (width, height))
    draw = ImageDraw.Draw(img)
    # zip stops at the shortest list, so words that could not be placed are
    # skipped here as well.
    everything = zip(words, font_sizes, positions, orientations)
    for word, font_size, position, orientation in everything:
        font = ImageFont.truetype(font_path, font_size)
        # transpose font optionally
        transposed_font = ImageFont.TransposedFont(font,
                                                   orientation=orientation)
        draw.text((position[1], position[0]), word,
                  fill="hsl(%d" % random.randint(0, 255) + ", 80%, 50%)",
                  font=transposed_font)
    img.show()
    img.save(fname)
def main(*args):
    """Count the words of the given text sources and render a word cloud.

    Parameters
    ----------
    *args : strings
        If ``"-"`` is among the arguments, the text is read from standard
        input. Otherwise every argument that names an existing path is read;
        if none does, ``"constitution.txt"`` is used.

    Notes
    -----
    At most 200 features are kept, English stop words are removed, and only
    words that occur more than once are passed on. The image is written to
    ``<basename of the first source>_.png`` in the current directory.
    """
    if "-" in args:
        lines = sys.stdin.readlines()
        sources = ['stdin']
    else:
        sources = ([arg for arg in args if os.path.exists(arg)]
                   or ["constitution.txt"])
        lines = []
        for s in sources:
            with open(s) as f:
                lines.extend(f.readlines())
    text = "".join(lines)

    # ``decode_error`` is the current name of the keyword formerly spelled
    # ``charset_error`` in scikit-learn.
    cv = CountVectorizer(min_df=1, decode_error="ignore",
                         stop_words="english", max_features=200)
    counts = cv.fit_transform([text]).toarray().ravel()
    # Prefer the newer accessor when the installed scikit-learn has it and
    # fall back to the older one otherwise.
    get_names = (getattr(cv, "get_feature_names_out", None)
                 or cv.get_feature_names)
    words = np.array(get_names())

    # throw away words that occur only once; compute the mask a single time
    # so words and counts are guaranteed to stay aligned
    keep = counts > 1
    words = words[keep]
    counts = counts[keep]

    output_filename = (os.path.splitext(os.path.basename(sources[0]))[0]
                       + "_.png")
    # make_wordcloud writes the image itself; it has no return value to keep.
    make_wordcloud(words, counts, output_filename)
if __name__ == "__main__":
    # Run as a script: hand every command-line argument after the script
    # name to main().
    cli_args = sys.argv[1:]
    main(*cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment