Last active
January 3, 2016 00:53
-
-
Save wiso/fa16452e8c12540e3663 to your computer and use it in GitHub Desktop.
Facebook word cloud
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "<h1>Facebook tag cloud with BeautifulSoup</h1>" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "These few lines list all the words on your Facebook wall. You first have to download your data from Facebook." | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from bs4 import BeautifulSoup\nfrom collections import Counter", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"collapsed": true, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Absolute path to the wall.htm page to analyse; the value is specific to one machine, so edit it before running.\nfilename = \"/home/turra/Scaricati/html/wall.htm\"", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Parse the downloaded wall page and keep every <div class='comment'> element.\n# The with-block closes the file handle, and naming the parser keeps the result\n# independent of which optional parsers happen to be installed.\nwith open(filename, 'rb') as wall_file:\n    comment_divs = BeautifulSoup(wall_file, 'html.parser').find_all('div', {\"class\" : \"comment\"})\n# set() drops comment texts that are repeated verbatim before they are joined.\ncomments = ' '.join(set([div.text for div in comment_divs]))\n\n# Words and symbols that must not be counted.\nto_remove = ('di', 'no', 'all', 'proprio', 'Un', 'b', 'from', 'che', 'd', 'v', 'Le', 'La', 'a', u'è', 'And', 'non', 'il', 'la', 'un', 'e', 'in', 'per', 'the', 'Non', 'dell', '-', u'–', 'get', 'una', u'più', 'sta', 'Per', 'ha', 'sono', 'i', 'le', 'Ruggero', 'Turra', 'mi', 'is', 'su', 'si', 'ci', 'to', 'al', 'ma', 'del', 'da', 'se', 'of', 'con', 'fa', 'gli', 'o', 'anche', 'and', 'ho', 'lo', 'dei', 'questo', 'della', 'for', 'questa', 'po\\'', 'come', u'perché', 'uno', 'mai', 'ti', 'Mi', 'va', 'about', u'c\\'è', 'mia', 'hanno', 'cosa', 'at', 'Tanti', 'ne', 'Il', 'ce', 'ad', 'by', 'nel', 'it', 'Ma', 'not', 'this', 'that', 'sul', 'La', 'E', 's', 'c', 'l', 't', 'or', 'tra', 'A', 'era', 'on', 'as', u'può')\nstop_words = set(to_remove)\n\n# Drop stop words token by token: replacing ' word ' inside the joined string\n# misses a stop word that directly follows another match or that sits at the\n# very start or end of the text.\ncomments = ' '.join(word for word in comments.split() if word not in stop_words)\n# Turn punctuation into spaces so that 'ciao,' and 'ciao' count as the same word.\nfor mark in (u\"'\", ',', u\"’\", '?', '!', '\"', '.'):\n    comments = comments.replace(mark, ' ')\n# Second pass: stripping punctuation can expose more stop words ('che,' becomes 'che').\ncounter = Counter(word for word in comments.split() if word not in stop_words)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"scrolled": false, | |
"collapsed": false, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Print every counted word with its frequency, most frequent first.\nfor word, count in counter.most_common():\n    # Skip stop words that may still be present in the counter.\n    if word in to_remove:\n        continue\n    # A parenthesised single-argument print is valid in both Python 2 and Python 3.\n    print(\"%s: %d\" % (word, count))", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python2", | |
"display_name": "Python 2", | |
"language": "python" | |
}, | |
"language_info": { | |
"mimetype": "text/x-python", | |
"nbconvert_exporter": "python", | |
"name": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.5", | |
"file_extension": ".py", | |
"codemirror_mode": { | |
"version": 2, | |
"name": "ipython" | |
} | |
}, | |
"gist_id": "fa16452e8c12540e3663" | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment