bostwick · February 11, 2012 04:00 · your2 · Jun 1, 2015
diff --git a/FacebookWordCloud.py b/FacebookWordCloud.py
 #!/usr/bin/env python

 import sys
 import urllib2
 import json
 from collections import defaultdict

 from nltk import word_tokenize

 # Tokens left out of the word cloud: punctuation and symbol tokens, single
 # digits, URL fragments, contraction fragments such as "n't" and "'ll", and
 # common English filler words.  word_count() compares the lower-cased token
 # against this list, so every entry is written in lower case.
 IGNORED_WORDS = [
     "!", ".", ",", "(", ")", "'s", ":", "?", "...", "$",
     "<", ">", "''", "``", "-", "c", "'", "--", "&",
     "and", "the", "or", "not", "i", "you", "to", "this",
     "of", "in", "for", "a", "an", "your", "with",
     "me", "my", "be", "these", "that", "do", "at", "no", "so",
     "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
     "http", "@", "is", "am", "are", "it", "if", "n't", "'em",
     "from", "one", "on", "up", "like", "we", "their", "they",
     "'ll", "'d", "'m", "//www.youtube.com/watch", "by", "have",
     "just", "will", "as",
 ]

 def get_json(url):
    """Fetch *url* and return its body parsed as JSON.

    The request is announced on stderr rather than stdout: the script
    writes the generated HTML page to stdout, and the feed URL embeds the
    access token, so the trace line must not end up in that page.

    The response is closed even when reading or parsing fails.  Errors
    raised by ``urllib2.urlopen`` or ``json.loads`` are not caught here.
    """
    sys.stderr.write("GET %s\n" % url)
    response = urllib2.urlopen(url)
    try:
        return json.loads(response.read())
    finally:
        response.close()


 def fb_feed_url(access_token):
    """Build the Graph API URL of the caller's home feed.

    access_token is interpolated as-is into the ``access_token`` query
    parameter; no URL-encoding is applied to it.
    """
    template = "https://graph.facebook.com/me/home?access_token=%s"
    return template % access_token


 def fb_news_feed(access_token, max_pages=100):
    """Fetch consecutive pages of the news feed as parsed JSON.

    Starts at the URL returned by fb_feed_url(access_token) and follows each
    page's ``paging.next`` link.  Stops after max_pages requests, or earlier
    when a page has no ``next`` link.

    access_token -- passed through to fb_feed_url().
    max_pages    -- upper bound on the number of pages requested
                    (defaults to 100).

    Returns the list of parsed pages in the order they were fetched.
    """
    pages = []
    next_url = fb_feed_url(access_token)

    for _ in range(max_pages):
        # Named "page" so the imported json module is not shadowed.
        page = get_json(next_url)
        pages.append(page)

        # Stop when there is no further page to follow.
        if 'paging' not in page or 'next' not in page['paging']:
            break
        next_url = page['paging']['next']

    return pages


 def word_count(words):
    """Count how often each non-ignored word occurs.

    words -- iterable of strings, e.g. from str.split() or
             nltk.word_tokenize().

    A word is skipped when its lower-cased form is in IGNORED_WORDS.  Counts
    are keyed by the word exactly as it appeared, so "Cat" and "cat" are
    tallied separately.

    Returns a defaultdict mapping word -> count; absent words read as 0.
    """
    # Build the lookup set once per call: testing membership in the
    # IGNORED_WORDS list would rescan the whole list for every token.
    ignored = frozenset(IGNORED_WORDS)
    seen = defaultdict(int)

    for w in words:
        if w.lower() not in ignored:
            seen[w] += 1

    return seen


 def word_cloud_sizes(counts):
    """Map each repeated word to a font-size bucket between 1 and 5.

    counts -- mapping of word -> occurrence count, as returned by
              word_count().

    Only words counted more than once get an entry.  The size grows
    linearly with the count, and a word whose count equals the maximum gets
    size 5.  An empty counts mapping yields an empty result instead of
    letting max() raise ValueError.

    Returns a defaultdict mapping word -> size; words without an entry read
    as size 1.
    """
    min_size, max_size = (1, 5)
    word_sizes = defaultdict(lambda: min_size)

    if not counts:
        return word_sizes

    max_count = max(counts.values())
    span = max_size - min_size

    for word, count in counts.items():
        if count > 1:
            word_sizes[word] = int(float(count) * span / max_count) + min_size

    return word_sizes

 def word_cloud_header():
    """Return the opening markup of the word-cloud page.

    Covers the doctype, the <head> with the .size-1 to .size-5 and
    .word-cloud CSS rules, and the opening <body> and word-cloud <div> tags.
    The tags opened here are closed by word_cloud_footer().
    """
    markup_lines = [
        '',
        ' <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
        '    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        ' <html>',
        ' <head>',
        '    <title>My Facebook Word Cloud</title>',
        '    <style type="text/css">',
        ' .size-1 { font-size: 10pt; }',
        ' .size-2 { font-size: 16pt; }',
        ' .size-3 { font-size: 28pt; }',
        ' .size-4 { font-size: 36pt; font-weight: bold; }',
        ' .size-5 { font-size: 48pt; font-weight: bold; }',
        ' .word-cloud { margin: 0 auto; width: 600px; padding-top: 10px; }',
        '    </style>',
        ' </head>',
        ' <body>',
        ' <div class="word-cloud">',
        ' ',
    ]
    return '\n'.join(markup_lines)

 def word_cloud_footer():
    """Return the closing markup of the word-cloud page.

    Closes the <div>, <body> and <html> elements, in that order.
    """
    closing_lines = ['', ' </div>', ' </body>', ' </html>', ' ']
    return '\n'.join(closing_lines)

 def word_cloud_html(sizes):
    """Render the words as space-separated <span> elements.

    sizes -- mapping of word -> size, as returned by word_cloud_sizes().

    Each word becomes <span class="size-N">word</span>, where N is its size.
    The words are tokens taken from feed text, so &, < and > are escaped
    before the word is placed in the markup.

    Returns the spans joined by single spaces, in the mapping's iteration
    order; an empty mapping gives an empty string.
    """
    # Imported locally; this helper exists in both Python 2 and Python 3.
    from xml.sax.saxutils import escape

    def word_html(word, size):
        return "<span class=\"size-%s\">%s</span>" % (size, escape(word))

    return ' '.join([word_html(w, s) for w, s in sizes.items()])
    

 if __name__ == "__main__":
    access_token = sys.argv[1]
    
    pages = fb_news_feed(access_token)
    messages = [post['message'] for page in pages 
                                    for post in page['data']
                                        if 'message' in post]
    comments = [comment['message'] for page in pages
                                    for post in page['data']
                                        if 'comments' in post and 'data' in post['comments']
                                            for comment in post['comments']['data']
                                                if 'if message' in comment]
    messages.extend(comments)
    
    tokens = [token for msg in messages for token in word_tokenize(msg)]
    counts = word_count(tokens)
    
    sizes = word_cloud_sizes(counts)
    
    print ''.join([
        word_cloud_header(),
        word_cloud_html(sizes),
        word_cloud_footer()])
	#!/usr/bin/env python

	import sys
	import urllib2
	import json
	from collections import defaultdict

	from nltk import word_tokenize

	# Tokens left out of the word cloud: punctuation and symbol tokens, single
	# digits, URL fragments, contraction fragments such as "n't" and "'ll", and
	# common English filler words.  word_count() compares the lower-cased token
	# against this list, so every entry is written in lower case.
	IGNORED_WORDS = [
	    "!", ".", ",", "(", ")", "'s", ":", "?", "...", "$",
	    "<", ">", "''", "``", "-", "c", "'", "--", "&",
	    "and", "the", "or", "not", "i", "you", "to", "this",
	    "of", "in", "for", "a", "an", "your", "with",
	    "me", "my", "be", "these", "that", "do", "at", "no", "so",
	    "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
	    "http", "@", "is", "am", "are", "it", "if", "n't", "'em",
	    "from", "one", "on", "up", "like", "we", "their", "they",
	    "'ll", "'d", "'m", "//www.youtube.com/watch", "by", "have",
	    "just", "will", "as",
	]

	def get_json(url):
	    """Fetch *url* and return its body parsed as JSON.

	    The request is announced on stderr rather than stdout: the script
	    writes the generated HTML page to stdout, and the feed URL embeds the
	    access token, so the trace line must not end up in that page.

	    The response is closed even when reading or parsing fails.  Errors
	    raised by ``urllib2.urlopen`` or ``json.loads`` are not caught here.
	    """
	    sys.stderr.write("GET %s\n" % url)
	    response = urllib2.urlopen(url)
	    try:
	        return json.loads(response.read())
	    finally:
	        response.close()


	def fb_feed_url(access_token):
	    """Build the Graph API URL of the caller's home feed.

	    access_token should be a valid facebook access token as a string.  It
	    is interpolated as-is into the ``access_token`` query parameter; no
	    URL-encoding is applied to it.
	    """
	    return "https://graph.facebook.com/me/home?access_token=%s" % access_token


	def fb_news_feed(access_token, max_pages=100):
	    """Fetch consecutive pages of the news feed as parsed JSON.

	    Starts at the URL returned by fb_feed_url(access_token) and follows
	    each page's ``paging.next`` link.  Stops after max_pages requests, or
	    earlier when a page has no ``next`` link.

	    access_token -- passed through to fb_feed_url().
	    max_pages    -- upper bound on the number of pages requested
	                    (defaults to 100).

	    Returns the list of parsed pages in the order they were fetched.
	    """
	    pages = []
	    next_url = fb_feed_url(access_token)

	    for _ in range(max_pages):
	        # Named "page" so the imported json module is not shadowed.
	        page = get_json(next_url)
	        pages.append(page)

	        # Stop when there is no further page to follow.
	        if 'paging' not in page or 'next' not in page['paging']:
	            break
	        next_url = page['paging']['next']

	    return pages


	def word_count(words):
	    """Count how often each non-ignored word occurs.

	    words -- iterable of strings, e.g. from str.split() or
	             nltk.word_tokenize().

	    A word is skipped when its lower-cased form is in IGNORED_WORDS.
	    Counts are keyed by the word exactly as it appeared, so "Cat" and
	    "cat" are tallied separately.

	    Returns a defaultdict mapping word -> count; absent words read as 0.
	    """
	    # Build the lookup set once per call: testing membership in the
	    # IGNORED_WORDS list would rescan the whole list for every token.
	    ignored = frozenset(IGNORED_WORDS)
	    seen = defaultdict(int)

	    for w in words:
	        if w.lower() not in ignored:
	            seen[w] += 1

	    return seen


	def word_cloud_sizes(counts):
	    """Map each repeated word to a font-size bucket between 1 and 5.

	    counts -- mapping of word -> occurrence count, as returned by
	              word_count().

	    Only words counted more than once get an entry.  The size grows
	    linearly with the count, and a word whose count equals the maximum
	    gets size 5.  An empty counts mapping yields an empty result instead
	    of letting max() raise ValueError.

	    Returns a defaultdict mapping word -> size; words without an entry
	    read as size 1.
	    """
	    min_size, max_size = (1, 5)
	    word_sizes = defaultdict(lambda: min_size)

	    if not counts:
	        return word_sizes

	    max_count = max(counts.values())
	    span = max_size - min_size

	    for word, count in counts.items():
	        if count > 1:
	            word_sizes[word] = int(float(count) * span / max_count) + min_size

	    return word_sizes

	def word_cloud_header():
	    """Return the opening markup of the word-cloud page.

	    Covers the doctype, the <head> with the .size-1 to .size-5 and
	    .word-cloud CSS rules, and the opening <body> and word-cloud <div>
	    tags.  The tags opened here are closed by word_cloud_footer().
	    """
	    return """
	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
	<html>
	<head>
	<title>My Facebook Word Cloud</title>
	<style type=\"text/css\">
	.size-1 { font-size: 10pt; }
	.size-2 { font-size: 16pt; }
	.size-3 { font-size: 28pt; }
	.size-4 { font-size: 36pt; font-weight: bold; }
	.size-5 { font-size: 48pt; font-weight: bold; }
	.word-cloud { margin: 0 auto; width: 600px; padding-top: 10px; }
	</style>
	</head>
	<body>
	<div class="word-cloud">
	"""

	def word_cloud_footer():
	    """Return the closing markup of the word-cloud page.

	    Closes the <div>, <body> and <html> elements, in that order.
	    """
	    return """
	</div>
	</body>
	</html>
	"""

	def word_cloud_html(sizes):
	    """Render the words as space-separated <span> elements.

	    sizes -- mapping of word -> size, as returned by word_cloud_sizes().

	    Each word becomes <span class="size-N">word</span>, where N is its
	    size.  The words are tokens taken from feed text, so &, < and > are
	    escaped before the word is placed in the markup.

	    Returns the spans joined by single spaces, in the mapping's iteration
	    order; an empty mapping gives an empty string.
	    """
	    # Imported locally; this helper exists in both Python 2 and Python 3.
	    from xml.sax.saxutils import escape

	    def word_html(word, size):
	        return "<span class=\"size-%s\">%s</span>" % (size, escape(word))

	    return ' '.join([word_html(w, s) for w, s in sizes.items()])


	if __name__ == "__main__":
	access_token = sys.argv[1]

	pages = fb_news_feed(access_token)
	messages = [post['message'] for page in pages
	for post in page['data']
	if 'message' in post]
	comments = [comment['message'] for page in pages
	for post in page['data']
	if 'comments' in post and 'data' in post['comments']
	for comment in post['comments']['data']
	if 'if message' in comment]
	messages.extend(comments)

	tokens = [token for msg in messages for token in word_tokenize(msg)]
	counts = word_count(tokens)

	sizes = word_cloud_sizes(counts)

	print ''.join([
	word_cloud_header(),
	word_cloud_html(sizes),
	word_cloud_footer()])
No results found