Skip to content

Instantly share code, notes, and snippets.

@ricalanis
Created April 21, 2016 01:53
Show Gist options
  • Save ricalanis/12fe91e83e868672b65c21d0e7fb8f1c to your computer and use it in GitHub Desktop.
Save ricalanis/12fe91e83e868672b65c21d0e7fb8f1c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pymongo import MongoClient, ASCENDING, DESCENDING\n",
"from nltk import FreqDist\n",
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"MONGO_URL = os.environ.get('mongo_url')\n",
"client = MongoClient(MONGO_URL)\n",
"db = client.comments"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"candidate_id = \"fgcabezadevaca\"\n",
"page_id = \"1037553166317439\""
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Collect every stored n-gram for this page/candidate, grouped by the keys of each comment's `ngrams` field.\n",
"# ngrams_cabeza holds the n-grams as hashable tuples (so they can be counted);\n",
"# ngrams_lists keeps them in the form returned by the query.\n",
"comment_filter = {\"page_id\": page_id, \"candidate_id\": candidate_id, \"post_id\": {\"$exists\": True}}\n",
"query = db.comments.find(comment_filter).batch_size(1000)\n",
"ngrams_cabeza = {}\n",
"ngrams_lists = {}\n",
"for comment in query:\n",
"    for key, ngrams in comment[\"ngrams\"].items():\n",
"        # extend() appends in place; the previous `a = a + b` re-copied the whole\n",
"        # accumulated list for every comment.\n",
"        ngrams_cabeza.setdefault(key, []).extend(tuple(ngram) for ngram in ngrams)\n",
"        ngrams_lists.setdefault(key, []).extend(ngrams)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"ngrams_lists[\"1\"][0:70]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def ngram_histogram(ngrams_input):\n",
" output = {}\n",
" for idx in ngrams_input:\n",
" FreqDist(ngrams_input[idx])\n",
" freq_list = list(FreqDist(ngrams_input[idx]).items())\n",
" output[idx] = sorted(freq_list, key=lambda pair: pair[1], reverse=True)\n",
" return output"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ngram_histogram_page = ngram_histogram(ngrams_cabeza)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"231\n"
]
}
],
"source": [
"# Number of comments matching the same filter used to collect the n-grams.\n",
"# count_documents() replaces Cursor.count(), which is deprecated and removed in PyMongo 4;\n",
"# batch_size() is dropped because it has no effect on a count.\n",
"query_count = db.comments.count_documents({\"page_id\": page_id, \"candidate_id\": candidate_id, \"post_id\": {\"$exists\": True}})\n",
"print(query_count)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(('duarte',), 27),\n",
" (('yunes',), 23),\n",
" (('pueblo',), 22),\n",
" (('voto',), 21),\n",
" (('miguel',), 19),\n",
" (('veracruz',), 16),\n",
" (('morena',), 16),\n",
" (('linares',), 12),\n",
" (('ratas',), 12),\n",
" (('madre',), 11),\n",
" (('estado',), 10),\n",
" (('partido',), 10),\n",
" (('angel',), 10),\n",
" (('gente',), 10),\n",
" (('dinero',), 10),\n",
" (('poder',), 10),\n",
" (('basta',), 9),\n",
" (('gobierno',), 8),\n",
" (('cambio',), 8),\n",
" (('arriba',), 8),\n",
" (('quieren',), 7),\n",
" (('gobernador',), 7),\n",
" (('encuestas',), 6),\n",
" (('ganar',), 6),\n",
" (('pinche',), 6),\n",
" (('chingar',), 6),\n",
" (('marrana',), 6),\n",
" (('patadas',), 6),\n",
" (('robo',), 6),\n",
" (('votar',), 6),\n",
" (('mierda',), 6),\n",
" (('rata',), 5),\n",
" (('favor',), 5),\n",
" (('engañar',), 5),\n",
" (('despensa',), 5),\n",
" (('ángel',), 5),\n",
" (('dios',), 4),\n",
" (('ahogado',), 4),\n",
" (('inseguridad',), 4),\n",
" (('sabe',), 4),\n",
" (('fidel',), 4),\n",
" (('priistas',), 4),\n",
" (('políticos',), 4),\n",
" (('robado',), 4),\n",
" (('rateros',), 4),\n",
" (('historia',), 4),\n",
" (('miedo',), 4),\n",
" (('único',), 4),\n",
" (('javier',), 4),\n",
" (('limpio',), 4),\n",
" (('puta',), 4),\n",
" (('elecciones',), 4),\n",
" (('gane',), 4),\n",
" (('hagamos',), 4),\n",
" (('perder',), 4),\n",
" (('diferencia',), 3),\n",
" (('nieto',), 3),\n",
" (('señores',), 3),\n",
" (('años',), 3),\n",
" (('mala',), 3),\n",
" (('pendejo',), 3),\n",
" (('roban',), 3),\n",
" (('hagan',), 3),\n",
" (('afuera',), 3),\n",
" (('delincuencia',), 3),\n",
" (('tiempo',), 3),\n",
" (('puto',), 3),\n",
" (('corrupto',), 3),\n",
" (('candidato',), 3),\n",
" (('robaron',), 3),\n",
" (('mamada',), 3),\n",
" (('pendeja',), 3),\n",
" (('vida',), 3),\n",
" (('chinge',), 3),\n",
" (('pura',), 3),\n",
" (('pobres',), 3),\n",
" (('verdad',), 3),\n",
" (('pinches',), 3),\n",
" (('votando',), 3),\n",
" (('landa',), 3),\n",
" (('sacar',), 3),\n",
" (('sucia',), 3),\n",
" (('política',), 3),\n",
" (('gracias',), 3),\n",
" (('enjuicien',), 3),\n",
" (('pesos',), 3),\n",
" (('pobre',), 3),\n",
" (('p.r.i',), 3),\n",
" (('robar',), 3),\n",
" (('miyuli',), 3),\n",
" (('cuitlahuac',), 3),\n",
" (('hector',), 3),\n",
" (('carcel',), 3),\n",
" (('vaya',), 3),\n",
" (('verga',), 3),\n",
" (('bola',), 3),\n",
" (('perro',), 3),\n",
" (('sale',), 2),\n",
" (('abren',), 2),\n",
" (('tambo',), 2),\n",
" (('sector',), 2),\n",
" (('disen',), 2),\n",
" (('dejemos',), 2),\n",
" (('oportunidades',), 2),\n",
" (('vote',), 2),\n",
" (('promesas',), 2),\n",
" (('miserias',), 2),\n",
" (('peña',), 2),\n",
" (('boto',), 2),\n",
" (('panistas',), 2),\n",
" (('ladrones',), 2),\n",
" (('secuestros',), 2),\n",
" (('caos',), 2),\n",
" (('andan',), 2),\n",
" (('debería',), 2),\n",
" (('destruir',), 2),\n",
" (('puro',), 2),\n",
" (('guerra',), 2),\n",
" (('dando',), 2),\n",
" (('recursos',), 2),\n",
" (('tapar',), 2),\n",
" (('ojos',), 2),\n",
" (('aganar',), 2),\n",
" (('sigan',), 2),\n",
" (('jamas',), 2),\n",
" (('nose',), 2),\n",
" (('gobernatura',), 2),\n",
" (('crimen',), 2),\n",
" (('jajajaja',), 2),\n",
" (('viene',), 2),\n",
" (('deja',), 2),\n",
" (('apoyo',), 2),\n",
" (('tapó',), 2),\n",
" (('dejen',), 2),\n",
" (('azul',), 2),\n",
" (('riko',), 2),\n",
" (('esperaban',), 2),\n",
" (('contaminación',), 2),\n",
" (('jente',), 2),\n",
" (('objetivo',), 2),\n",
" (('somos',), 2),\n",
" (('kien',), 2),\n",
" (('corrupcion',), 2),\n",
" (('idiota',), 2),\n",
" (('radio',), 2),\n",
" (('porkeria',), 2),\n",
" (('cambiar',), 2),\n",
" (('saca',), 2),\n",
" (('huevos',), 2),\n",
" (('gordo',), 2),\n",
" (('buscar',), 2),\n",
" (('chinguen',), 2),\n",
" (('pendejos',), 2),\n",
" (('sera',), 2),\n",
" (('humo',), 2),\n",
" (('culpa',), 2),\n",
" (('claro',), 2),\n",
" (('ustedes',), 2),\n",
" (('priístas',), 2),\n",
" (('apoyando',), 2),\n",
" (('verde',), 2),\n",
" (('malos',), 2),\n",
" (('peor',), 2),\n",
" (('fuerte',), 2),\n",
" (('pederasta',), 2),\n",
" (('politica',), 2),\n",
" (('estarán',), 2),\n",
" (('carreteras',), 2),\n",
" (('queda',), 2),\n",
" (('creer',), 2),\n",
" (('hagas',), 2),\n",
" (('boca',), 2),\n",
" (('pristas',), 2),\n",
" (('santo',), 2),\n",
" (('siquiera',), 2),\n",
" (('mamadas',), 2),\n",
" (('yuni',), 2),\n",
" (('pasa',), 2),\n",
" (('.por',), 2),\n",
" (('adios',), 2),\n",
" (('patria',), 2),\n",
" (('pensionados',), 2),\n",
" (('credibilidad',), 2),\n",
" (('gobernar',), 2),\n",
" (('pelea',), 2),\n",
" (('gana',), 2),\n",
" (('alos',), 2),\n",
" (('persecucion',), 2),\n",
" (('lucha',), 2),\n",
" (('foto',), 2),\n",
" (('carros',), 2),\n",
" (('escuchados',), 2),\n",
" (('maquinaria',), 2),\n",
" (('partidos',), 2),\n",
" (('ñaka',), 2),\n",
" (('votemos',), 2),\n",
" (('veracruzano',), 2),\n",
" (('opcion',), 2),\n",
" (('españa',), 2),\n",
" (('acuerdo',), 2),\n",
" (('hueso',), 2),\n",
" (('rostros',), 2),\n",
" (('suficiente',), 2),\n",
" (('basura',), 2),\n",
" (('seguir',), 2),\n",
" (('kieren',), 2),\n",
" (('conforman',), 2),\n",
" (('veracruzanos',), 2),\n",
" (('mayoría',), 2),\n",
" (('porquería',), 2),\n",
" (('meter',), 2),\n",
" (('destrozadas',), 1),\n",
" (('apasiones',), 1),\n",
" (('criticas',), 1),\n",
" (('regla',), 1),\n",
" (('nexos',), 1),\n",
" (('desesperada',), 1),\n",
" (('durmiendo',), 1),\n",
" (('maricelita',), 1),\n",
" (('salinas',), 1),\n",
" (('mauser',), 1),\n",
" (('negra',), 1),\n",
" (('fantan',), 1),\n",
" (('vigilaremos',), 1),\n",
" (('oigan',), 1),\n",
" (('gobernando',), 1),\n",
" (('corruptos',), 1),\n",
" (('maje',), 1),\n",
" (('intentan',), 1),\n",
" (('nomas',), 1),\n",
" (('titulares',), 1),\n",
" (('boleta',), 1),\n",
" (('marrano',), 1),\n",
" (('ventajistas',), 1),\n",
" (('cantar',), 1),\n",
" (('apollo..',), 1),\n",
" (('haga',), 1),\n",
" (('hipocrita',), 1),\n",
" (('pensaron',), 1),\n",
" (('tapadera',), 1),\n",
" (('vota',), 1),\n",
" (('prii',), 1),\n",
" (('derechos',), 1),\n",
" (('obras',), 1),\n",
" (('llene',), 1),\n",
" (('regalando',), 1),\n",
" (('sudor',), 1),\n",
" (('morales,',), 1),\n",
" (('ahutoridad',), 1),\n",
" (('crean',), 1),\n",
" (('definido',), 1),\n",
" (('chiflen',), 1),\n",
" (('erradicarla',), 1),\n",
" (('linda',), 1),\n",
" (('escribio',), 1),\n",
" (('garras',), 1),\n",
" (('encarcele',), 1),\n",
" (('caca',), 1),\n",
" (('quiero',), 1),\n",
" (('plena',), 1),\n",
" (('organisado',), 1),\n",
" (('namas',), 1),\n",
" (('cabecilla',), 1),\n",
" (('mantenerse',), 1),\n",
" (('priiu',), 1),\n",
" (('espero',), 1),\n",
" (('tambien',), 1),\n",
" (('saco',), 1),\n",
" (('ahogados',), 1),\n",
" (('nota..',), 1),\n",
" (('tinoco',), 1),\n",
" (('nunka',), 1),\n",
" (('responsable',), 1),\n",
" (('digas',), 1),\n",
" (('migel',), 1),\n",
" (('noticias',), 1),\n",
" (('complice',), 1),\n",
" (('gorda',), 1),\n",
" (('lenis',), 1),\n",
" (('logotipo',), 1),\n",
" (('muestran',), 1),\n",
" (('martirizar',), 1),\n",
" (('pri.',), 1),\n",
" (('repentina',), 1),\n",
" (('expertos',), 1),\n",
" (('spots',), 1),\n",
" (('comprar',), 1),\n",
" (('murió',), 1),\n",
" (('manejos',), 1),\n",
" (('arreglos',), 1),\n",
" (('barrio',), 1),\n",
" (('nación',), 1),\n",
" (('sige',), 1),\n",
" (('tijera',), 1),\n",
" (('sufrir',), 1),\n",
" (('tricolor',), 1),\n",
" (('familias',), 1),\n",
" (('espanten',), 1),\n",
" (('cárcel',), 1),\n",
" (('ambiente',), 1),\n",
" (('impunidad',), 1),\n",
" (('perderá',), 1),\n",
" (('viejos',), 1),\n",
" (('primierda',), 1),\n",
" (('botar',), 1),\n",
" (('pierdan',), 1),\n",
" (('tiempo.que',), 1),\n",
" (('acabar',), 1),\n",
" (('hogado',), 1),\n",
" (('conozco',), 1),\n",
" (('golpeado',), 1),\n",
" (('quita',), 1),\n",
" (('pagaremos',), 1),\n",
" (('ayúdame',), 1),\n",
" (('complices',), 1),\n",
" (('parenle',), 1),\n",
" (('quiebra',), 1),\n",
" (('trabajen',), 1),\n",
" (('ulibarri',), 1),\n",
" (('arribistas',), 1),\n",
" (('salud',), 1),\n",
" (('cara',), 1),\n",
" (('opine',), 1),\n",
" (('santanas',), 1),\n",
" (('beltran',), 1),\n",
" (('condenan',), 1),\n",
" (('..están',), 1),\n",
" (('preocupados',), 1),\n",
" (('pesimas',), 1),\n",
" (('aplicar',), 1),\n",
" (('información',), 1),\n",
" (('chingarse',), 1),\n",
" (('aser',), 1),\n",
" (('dejamos',), 1),\n",
" (('blanquiazules',), 1),\n",
" (('giselle',), 1),\n",
" (('sinonimos',), 1),\n",
" (('encarcel',), 1),\n",
" (('politicos',), 1),\n",
" (('flota',), 1),\n",
" (('kuleros',), 1),\n",
" (('andrés',), 1),\n",
" (('totalmente',), 1),\n",
" (('alimentado',), 1),\n",
" (('rival',), 1),\n",
" (('creíble',), 1),\n",
" (('buscan',), 1),\n",
" (('rstado',), 1),\n",
" (('esfuerzo',), 1),\n",
" (('tlacuache',), 1),\n",
" (('amparo',), 1),\n",
" (('trataran',), 1),\n",
" (('asistir',), 1),\n",
" (('sabido',), 1),\n",
" (('militantes',), 1),\n",
" (('verán',), 1),\n",
" (('perdido',), 1),\n",
" (('encarcelo',), 1),\n",
" (('sptm',), 1),\n",
" (('linare',), 1),\n",
" (('falsedades',), 1),\n",
" (('cambiemos',), 1),\n",
" (('violencia',), 1),\n",
" (('..no',), 1),\n",
" (('administrado',), 1),\n",
" (('engañando',), 1),\n",
" (('próspera',), 1),\n",
" (('mantecoso',), 1),\n",
" (('hasen',), 1),\n",
" (('continúen',), 1),\n",
" (('terminar',), 1),\n",
" (('respeto',), 1),\n",
" (('méxico',), 1),\n",
" (('levantones',), 1),\n",
" (('opinar',), 1),\n",
" (('evidencias',), 1),\n",
" (('hospitales',), 1),\n",
" (('querían',), 1),\n",
" (('empezando',), 1),\n",
" (('votaremod',), 1),\n",
" (('morral,',), 1),\n",
" (('mayl',), 1),\n",
" (('represores',), 1),\n",
" (('ganarån',), 1),\n",
" (('sienten',), 1),\n",
" (('transas',), 1),\n",
" (('choro',), 1),\n",
" (('amarillistas',), 1),\n",
" (('asquerosas',), 1),\n",
" (('alianzas',), 1),\n",
" (('gandero',), 1),\n",
" (('promete',), 1),\n",
" (('carajo',), 1),\n",
" (('paralisado',), 1),\n",
" (('aguas',), 1),\n",
" (('tachando',), 1),\n",
" (('acresentan',), 1),\n",
" (('converse',), 1),\n",
" (('gandállas',), 1),\n",
" (('vete',), 1),\n",
" (('verdad..',), 1),\n",
" (('taxis',), 1),\n",
" (('wevos',), 1),\n",
" (('mamen',), 1),\n",
" (('viviendo',), 1),\n",
" (('tricolor.porque',), 1),\n",
" (('creatividad',), 1),\n",
" (('cosa',), 1),\n",
" (('hartazgo',), 1),\n",
" (('visual',), 1),\n",
" (('todavia',), 1),\n",
" (('habla',), 1),\n",
" (('hdtsrp',), 1),\n",
" (('saquear',), 1),\n",
" (('regalar',), 1),\n",
" (('perjudicar',), 1),\n",
" (('ratassssssss',), 1),\n",
" (('cumpla',), 1),\n",
" (('vendrá',), 1),\n",
" (('maldita',), 1),\n",
" (('razonemos',), 1),\n",
" (('común',), 1),\n",
" (('llevó',), 1),\n",
" (('desgracia',), 1),\n",
" (('dictaduria',), 1),\n",
" (('.como',), 1),\n",
" (('vuelva',), 1),\n",
" (('circo',), 1),\n",
" (('ganando',), 1),\n",
" (('medios',), 1),\n",
" (('todologos',), 1),\n",
" (('pensar',), 1),\n",
" (('llegan',), 1),\n",
" (('farmacias',), 1),\n",
" (('metio',), 1),\n",
" (('isidro',), 1),\n",
" (('extorsiones',), 1),\n",
" (('albarado',), 1),\n",
" (('duro',), 1),\n",
" (('tonteras',), 1),\n",
" (('dejense',), 1),\n",
" (('personas',), 1),\n",
" (('esperarse',), 1),\n",
" (('pistas',), 1),\n",
" (('tiempos',), 1),\n",
" (('recuerdos',), 1),\n",
" (('pagar',), 1),\n",
" (('podridas',), 1),\n",
" (('tamaulipas',), 1),\n",
" (('licenciada',), 1),\n",
" (('juntos',), 1),\n",
" (('junio',), 1),\n",
" (('derecho',), 1),\n",
" (('cierren',), 1),\n",
" (('tiren',), 1),\n",
" (('horror',), 1),\n",
" (('mdre',), 1),\n",
" (('misma.mierda',), 1),\n",
" (('unico',), 1),\n",
" (('nacer',), 1),\n",
" (('lolo',), 1),\n",
" (('lotería',), 1),\n",
" (('odjetivo',), 1),\n",
" (('maldito',), 1),\n",
" (('dirigencia',), 1),\n",
" (('enamorado',), 1),\n",
" (('jajajajjaja',), 1),\n",
" (('traficante',), 1),\n",
" (('avento',), 1),\n",
" (('votan',), 1),\n",
" (('ande',), 1),\n",
" (('engañen',), 1),\n",
" (('pollito',), 1),\n",
" (('bonitos',), 1),\n",
" (('fiesta',), 1),\n",
" (('herrera',), 1),\n",
" (('parada',), 1),\n",
" (('algun',), 1),\n",
" (('calzones',), 1),\n",
" (('prineros',), 1),\n",
" (('mera',), 1),\n",
" (('encerrar',), 1),\n",
" (('etc.etc.etc',), 1),\n",
" (('ofrecer',), 1),\n",
" (('cono',), 1),\n",
" (('coructos',), 1),\n",
" (('neta',), 1),\n",
" (('electorado..para',), 1),\n",
" (('ilicitos',), 1),\n",
" (('dense😁',), 1),\n",
" (('convirtió',), 1),\n",
" (('ecologista',), 1),\n",
" (('eres',), 1),\n",
" (('cabeza',), 1),\n",
" (('ningun',), 1),\n",
" (('bullanguero',), 1),\n",
" (('next',), 1),\n",
" (('tomaré',), 1),\n",
" (('puras',), 1),\n",
" (('gorra',), 1),\n",
" (('cree',), 1),\n",
" (('engaña',), 1),\n",
" (('demasiado',), 1),\n",
" (('amenasan',), 1),\n",
" (('kambiar',), 1),\n",
" (('ilicito',), 1),\n",
" (('juicio',), 1),\n",
" (('saben',), 1),\n",
" (('informarse',), 1),\n",
" (('aleman',), 1),\n",
" (('seran',), 1),\n",
" (('regresa',), 1),\n",
" (('//m.facebook.com/story.php',), 1),\n",
" (('pri..',), 1),\n",
" (('bolsa',), 1),\n",
" (('depende',), 1),\n",
" (('yeno',), 1),\n",
" (('duela',), 1),\n",
" (('vecinos',), 1),\n",
" (('alegria',), 1),\n",
" (('mediocres',), 1),\n",
" (('escena',), 1),\n",
" (('tevoy',), 1),\n",
" (('odio',), 1),\n",
" (('marcada',), 1),\n",
" (('juegen',), 1),\n",
" (('voluntad',), 1),\n",
" (('quejan',), 1),\n",
" (('herramienta',), 1),\n",
" (('color',), 1),\n",
" (('hacemos',), 1),\n",
" (('aceptan',), 1),\n",
" (('defiendo',), 1),\n",
" (('espaeramos',), 1),\n",
" (('dejaron',), 1),\n",
" (('dudo.somos',), 1),\n",
" (('investiguen',), 1),\n",
" (('pri-meramente',), 1),\n",
" (('ablen',), 1),\n",
" (('conociera',), 1),\n",
" (('cantidad',), 1),\n",
" (('qerramos',), 1),\n",
" (('junta',), 1),\n",
" (('salpicar',), 1),\n",
" (('prometio',), 1),\n",
" (('chicharrones',), 1),\n",
" (('saludos',), 1),\n",
" (('libertad',), 1),\n",
" (('aparezca',), 1),\n",
" (('seeeeeeeee',), 1),\n",
" (('complises',), 1),\n",
" (('cambie',), 1),\n",
" (('rios',), 1),\n",
" (('justicia',), 1),\n",
" (('jajaja',), 1),\n",
" (('robaran',), 1),\n",
" (('pena',), 1),\n",
" (('rancho',), 1),\n",
" (('iran',), 1),\n",
" (('sucios',), 1),\n",
" (('precion',), 1),\n",
" (('asesinatos',), 1),\n",
" (('.ban',), 1),\n",
" (('niño',), 1),\n",
" (('ángeles',), 1),\n",
" (('señoras',), 1),\n",
" (('federal',), 1),\n",
" (('encuesta',), 1),\n",
" (('resultado',), 1),\n",
" (('error',), 1),\n",
" (('blusas',), 1),\n",
" (('señor',), 1),\n",
" (('correspondientes',), 1),\n",
" (('quisiera',), 1),\n",
" (('doble',), 1),\n",
" (('alegre',), 1),\n",
" (('asta',), 1),\n",
" (('acorralado',), 1),\n",
" (('eras',), 1),\n",
" (('darle',), 1),\n",
" (('ganaba',), 1),\n",
" (('pri..jajajajajajjaja',), 1),\n",
" (('quiera',), 1),\n",
" (('alguien',), 1),\n",
" (('apoya',), 1),\n",
" (('llunes',), 1),\n",
" (('ganso',), 1),\n",
" (('novenario',), 1),\n",
" (('dijieron',), 1),\n",
" (('tamaño',), 1),\n",
" (('quedase',), 1),\n",
" (('aviones',), 1),\n",
" (('priista',), 1),\n",
" (('perdemos',), 1),\n",
" (('diputados',), 1),\n",
" (('servirá',), 1),\n",
" (('capaz',), 1),\n",
" (('alianzas,',), 1),\n",
" (('hablar',), 1),\n",
" (('hermano',), 1),\n",
" (('cristo',), 1),\n",
" (('apliquen',), 1),\n",
" (('jjjjjj',), 1),\n",
" (('.sumado',), 1),\n",
" (('independiente',), 1),\n",
" (('sinvergüenzadas',), 1),\n",
" (('sacan',), 1),\n",
" (('mañana',), 1),\n",
" (('oran',), 1),\n",
" (('descubran',), 1),\n",
" (('estrategia',), 1),\n",
" (('juego',), 1),\n",
" (('apartidista',), 1),\n",
" (('cuál',), 1),\n",
" (('politico',), 1),\n",
" (('jugar',), 1),\n",
" (('publicando',), 1),\n",
" (('ofender',), 1),\n",
" (('perdió',), 1),\n",
" (('ptro',), 1),\n",
" (('ciegos',), 1),\n",
" (('gato',), 1),\n",
" (('impuso',), 1),\n",
" (('hundir',), 1),\n",
" (('crédito',), 1),\n",
" (('denuncias',), 1),\n",
" (('lodo',), 1),\n",
" (('mellooooo',), 1),\n",
" (('pantalla',), 1),\n",
" (('desaparecidos',), 1),\n",
" (('mėxico',), 1),\n",
" (('planeo',), 1),\n",
" (('chingue',), 1),\n",
" (('delicuantazo',), 1),\n",
" (('apoyandolos',), 1),\n",
" (('derecha',), 1),\n",
" (('haces',), 1),\n",
" (('manipulación',), 1),\n",
" (('ganado',), 1),\n",
" (('tlacuaches',), 1),\n",
" (('jajajajajajajajajajajajajaja',), 1),\n",
" (('olvidare',), 1),\n",
" (('fantasias',), 1),\n",
" (('jajaj',), 1),\n",
" (('asesinos',), 1),\n",
" (('arrastrado',), 1),\n",
" (('mataron',), 1),\n",
" (('sorprende',), 1),\n",
" (('porq',), 1),\n",
" (('qieren',), 1),\n",
" (('madres',), 1),\n",
" (('convencer',), 1),\n",
" (('ver´',), 1),\n",
" (('trepan',), 1),\n",
" (('útil',), 1),\n",
" (('mueran',), 1),\n",
" (('tirar',), 1),\n",
" (('robarle',), 1),\n",
" (('ayudando',), 1),\n",
" (('campañas',), 1),\n",
" (('recurrir',), 1),\n",
" (('rependejos',), 1),\n",
" (('boten',), 1),\n",
" (('beltrones',), 1),\n",
" (('amlo',), 1),\n",
" (('sociales',), 1),\n",
" (('llamada',), 1),\n",
" (('abrazo',), 1),\n",
" (('contiendas',), 1),\n",
" (('maestros',), 1),\n",
" (('manipular',), 1),\n",
" (('pan*',), 1),\n",
" (('dejara',), 1),\n",
" (('ofreciendo',), 1),\n",
" (('helizabeth',), 1),\n",
" (('sacer',), 1),\n",
" (('orejas',), 1),\n",
" (('morenaveracruz',), 1),\n",
" (('instituto',), 1),\n",
" (('mentira',), 1),\n",
" (('primaria',), 1),\n",
" (('story_fbid=10207810556173776',), 1),\n",
" (('contingencia',), 1),\n",
" (('dure',), 1),\n",
" (('lacras',), 1),\n",
" (('frutas',), 1),\n",
" (('quieres..',), 1),\n",
" (('llamadas',), 1),\n",
" (('pidamosle',), 1),\n",
" (('coatzacoalcos',), 1),\n",
" (('pertenecen',), 1),\n",
" (('cinismo',), 1),\n",
" (('tipico',), 1),\n",
" (('despedida',), 1),\n",
" (('realidad',), 1),\n",
" (('ignorantes',), 1),\n",
" (('drenajes',), 1),\n",
" (('daran',), 1),\n",
" (('utilizando',), 1),\n",
" (('papapa',), 1),\n",
" (('estan',), 1),\n",
" (('losque',), 1),\n",
" (('pobresito',), 1),\n",
" (('triquiñuelas',), 1),\n",
" (('llamo',), 1),\n",
" (('cansa',), 1),\n",
" (('televisión',), 1),\n",
" (('axiones',), 1),\n",
" (('carvallo',), 1),\n",
" (('sintiendo',), 1),\n",
" (('comentarios',), 1),\n",
" (('memoria',), 1),\n",
" (('delgado',), 1),\n",
" (('impartición',), 1),\n",
" (('esquinas',), 1),\n",
" (('meyoooo',), 1),\n",
" (('tantito',), 1),\n",
" (('manuel',), 1),\n",
" (('estupido',), 1),\n",
" (('serian',), 1),\n",
" (('salir',), 1),\n",
" (('monos',), 1),\n",
" (('inútiles',), 1),\n",
" (('tercer',), 1),\n",
" (('.rostros',), 1),\n",
" (('votarcpor',), 1),\n",
" (('estafadores',), 1),\n",
" (('hagase',), 1),\n",
" (('mendingando',), 1),\n",
" (('robarnos',), 1),\n",
" (('dejando',), 1),\n",
" (('gobierna',), 1),\n",
" (('wallace',), 1),\n",
" (('directo',), 1),\n",
" (('cansare',), 1),\n",
" (('mostrar',), 1),\n",
" (('presidente',), 1),\n",
" (('habian',), 1),\n",
" (('pelar',), 1),\n",
" (('abusos',), 1),\n",
" (('trato',), 1),\n",
" (('saluditos',), 1),\n",
" (('permitamos',), 1),\n",
" (('fuerza',), 1),\n",
" (('soltar',), 1),\n",
" (('abrir',), 1),\n",
" (('berduras',), 1),\n",
" (('le.va',), 1),\n",
" (('malo',), 1),\n",
" (('empleos',), 1),\n",
" (('.vamos',), 1),\n",
" (('incumplidas',), 1),\n",
" (('nacional',), 1),\n",
" (('desempleo',), 1),\n",
" (('amañadas',), 1),\n",
" (('bardas',), 1),\n",
" (('extorsión',), 1),\n",
" (('carsel',), 1),\n",
" (('diciendo',), 1),\n",
" (('cochino',), 1),\n",
" (('grande',), 1),\n",
" (('estuvieron',), 1),\n",
" (('politiko',), 1),\n",
" (('enriqui',), 1),\n",
" (('cometan',), 1),\n",
" (('perteneser',), 1),\n",
" (('pusieron',), 1),\n",
" (('ojetes',), 1),\n",
" (('traía',), 1),\n",
" (('puñal',), 1),\n",
" (('id=1590836771',), 1),\n",
" (('llama',), 1),\n",
" (('quieras',), 1),\n",
" (('payasos',), 1),\n",
" (('abajo',), 1),\n",
" (('obscurito',), 1),\n",
" (('piensalo',), 1),\n",
" (('gestiones',), 1),\n",
" (('ceron',), 1),\n",
" (('come',), 1),\n",
" (('lujosos',), 1),\n",
" (('zent',), 1),\n",
" (('mmmm',), 1),\n",
" (('ignorante',), 1),\n",
" (('seas',), 1),\n",
" (('desperto',), 1),\n",
" (('pulso',), 1),\n",
" (('bienestar',), 1),\n",
" (('durar',), 1),\n",
" (('chile',), 1),\n",
" (('personajes',), 1),\n",
" (('dejarlo',), 1),\n",
" (('robos',), 1),\n",
" (('infladas',), 1),\n",
" (('representa',), 1),\n",
" (('obrador',), 1),\n",
" (('culeros',), 1),\n",
" (('intervenir',), 1),\n",
" (('conseguir',), 1),\n",
" (('avían',), 1),\n",
" (('oficialmente',), 1),\n",
" (('golpe',), 1),\n",
" (('pensé',), 1),\n",
" (('floristeros',), 1),\n",
" (('igualitos',), 1),\n",
" (('siga',), 1),\n",
" (('fotos..',), 1),\n",
" (('confirma',), 1),\n",
" (('desastre',), 1),\n",
" (('hágnme',), 1),\n",
" (('tontos',), 1),\n",
" (('otaolaurruchi',), 1),\n",
" (('mexicana',), 1),\n",
" (('https',), 1),\n",
" (('correrlos',), 1),\n",
" (('mando',), 1),\n",
" (('bonito',), 1),\n",
" (('comprando',), 1),\n",
" (('trniendo',), 1),\n",
" (('mexicanos',), 1),\n",
" (('occico',), 1),\n",
" (('coruccion',), 1),\n",
" (('ciega',), 1),\n",
" (('http',), 1),\n",
" (('amigos',), 1),\n",
" (('comer',), 1),\n",
" (('apoyará',), 1),\n",
" (('secuaces',), 1),\n",
" (('urbanos',), 1),\n",
" (('rateria',), 1),\n",
" (('increíblemente',), 1),\n",
" (('generaciones',), 1),\n",
" (('ciudad',), 1),\n",
" (('despierten',), 1),\n",
" (('medicamentos',), 1),\n",
" (('¡trepadores',), 1),\n",
" (('veracruz..',), 1),\n",
" (('loque',), 1),\n",
" (('enel',), 1),\n",
" (('dispuesto',), 1),\n",
" (('al.pri',), 1),\n",
" (('caseta',), 1),\n",
" (('vergüenza',), 1),\n",
" (('tortas',), 1),\n",
" (('silencio',), 1),\n",
" (('estimado',), 1),\n",
" (('ganan',), 1),\n",
" (('seguro',), 1),\n",
" (('sustento',), 1),\n",
" (('cárnicas',), 1),\n",
" (('trabaja',), 1),\n",
" (('recuperar',), 1),\n",
" (('opciones',), 1),\n",
" (('suya',), 1),\n",
" (('debate',), 1),\n",
" (('podrida',), 1),\n",
" (('repudio',), 1),\n",
" (('goto',), 1),\n",
" (('sentido',), 1),\n",
" (('tubo',), 1),\n",
" (('taller',), 1),\n",
" (('pagina',), 1),\n",
" (('dante',), 1),\n",
" (('quieran',), 1),\n",
" (('cruel',), 1),\n",
" (('esperando',), 1),\n",
" (('raro',), 1),\n",
" (('dieran',), 1),\n",
" (('iguales',), 1),\n",
" (('representan',), 1),\n",
" (('agarraron',), 1),\n",
" (('periodistas',), 1),\n",
" (('principal',), 1),\n",
" (('difaman',), 1),\n",
" (('presentó',), 1),\n",
" (('venganza',), 1),\n",
" (('resagos',), 1),\n",
" (('combenencira',), 1),\n",
" (('deberle',), 1),\n",
" (('manos',), 1),\n",
" (('empiezan',), 1),\n",
" (('tragar',), 1),\n",
" (('vacías',), 1),\n",
" (('vieja',), 1),\n",
" (('precidente',), 1),\n",
" (('agarran',), 1),\n",
" (('votación',), 1),\n",
" (('cuánto',), 1),\n",
" (('demora',), 1),\n",
" (('creen',), 1),\n",
" (('matar.porque',), 1),\n",
" (('creible',), 1),\n",
" (('invertir',), 1),\n",
" (('juegan',), 1),\n",
" (('siguen',), 1),\n",
" (('noble',), 1),\n",
" (('roja',), 1),\n",
" (('salen',), 1),\n",
" (('siega',), 1),\n",
" (('pelan',), 1),\n",
" (('prinche',), 1),\n",
" (('conformó',), 1),\n",
" (('negocios',), 1),\n",
" (('montoya',), 1),\n",
" (('vinieron',), 1),\n",
" (('sionismo',), 1),\n",
" (('acusen',), 1),\n",
" (('calladitos',), 1),\n",
" (('permitiremos',), 1),\n",
" (('mansiones',), 1),\n",
" (('obra',), 1),\n",
" (('cortados',), 1),\n",
" (('vivido',), 1),\n",
" (('cura',), 1),\n",
" (('poquito',), 1),\n",
" (('acabado',), 1),\n",
" (('entró',), 1),\n",
" (('maldigamos',), 1),\n",
" (('llegaste',), 1),\n",
" (('ciudadano',), 1),\n",
" (('calles',), 1),\n",
" (('ademas',), 1),\n",
" (('paga',), 1),\n",
" (('muerto',), 1),\n",
" (('descanse',), 1),\n",
" (('asesino',), 1),\n",
" (('porquerias',), 1),\n",
" (('ladrón',), 1),\n",
" (('cobardes',), 1),\n",
" (('diario',), 1),\n",
" (('gaudy',), 1),\n",
" (('perdedor',), 1),\n",
" (('distractor',), 1),\n",
" (('hipocresía',), 1),\n",
" (('creyendo',), 1),\n",
" (('lambe',), 1),\n",
" (('traidor',), 1),\n",
" (('ayuda',), 1),\n",
" (('manitas',), 1),\n",
" (('redes',), 1),\n",
" (('efervescencia',), 1),\n",
" (('sres',), 1),\n",
" (('jinetear',), 1),\n",
" (('asquean',), 1),\n",
" (('malentes',), 1),\n",
" (('corajes',), 1),\n",
" (('operacion',), 1),\n",
" (('gubernatura',), 1),\n",
" (('finanzas',), 1),\n",
" (('gerardo',), 1),\n",
" (('sabemos',), 1),\n",
" (('aliado',), 1),\n",
" (('acabaron',), 1),\n",
" (('pregunto',), 1),\n",
" (('obvio',), 1),\n",
" (('ganara',), 1),\n",
" (('menciona',), 1),\n",
" (('cambiando',), 1),\n",
" (('segun',), 1),\n",
" (('espantan',), 1),\n",
" (('ganará',), 1),\n",
" (('falsas',), 1),\n",
" (('nota',), 1),\n",
" (('esencia',), 1),\n",
" (('hacerle',), 1),\n",
" (('jarochos',), 1),\n",
" (('café',), 1),\n",
" (('espectaculares',), 1),\n",
" (('ganó',), 1),\n",
" (('carlos',), 1),\n",
" (('toma',), 1),\n",
" (('incluyendo',), 1),\n",
" (('estúpida',), 1),\n",
" (('tocará',), 1),\n",
" (('trabajador',), 1),\n",
" (('encoleriza',), 1),\n",
" (('embarrados',), 1),\n",
" (('andas',), 1),\n",
" (('cuentos',), 1),\n",
" (('horas',), 1),\n",
" (('correcta',), 1),\n",
" (('vale',), 1),\n",
" (('sakiado',), 1),\n",
" (('universidad',), 1),\n",
" (('insulto',), 1),\n",
" (('verdaderos',), 1),\n",
" (('dejarnos',), 1),\n",
" (('miyules',), 1),\n",
" (('metan',), 1),\n",
" (('soborno',), 1),\n",
" (('coman',), 1),\n",
" (('paginas',), 1),\n",
" (('esperaba',), 1),\n",
" (('doctor',), 1),\n",
" (('apoco',), 1),\n",
" (('protegen',), 1),\n",
" (('casas',), 1),\n",
" (('quede',), 1),\n",
" (('sienpre',), 1),\n",
" (('hecto',), 1),\n",
" ...]"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ngram_histogram_page[\"1\"]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"729"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(ngram_histogram_page[\"1\"])"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Get the top 10% of 1-grams (rounded down), derived from the ranking length\n",
"# rather than a hard-coded slice size.\n",
"onegram_ranking = ngram_histogram_page[\"1\"]\n",
"top_onegrams = [ngram[0] for ngram, _count in onegram_ranking[:len(onegram_ranking) // 10]]"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# For each top 1-gram, gather the other words of every 6-gram that contains it\n",
"# into one merged list per 1-gram.\n",
"word_reference = {}\n",
"for word in top_onegrams:\n",
"    for sixgram in ngrams_lists[\"6\"]:\n",
"        if word in sixgram:\n",
"            # Work on a copy: calling remove() on the stored 6-gram would strip this word from\n",
"            # ngrams_lists for every later lookup and make the cell give different results on re-run.\n",
"            neighbours = list(sixgram)\n",
"            neighbours.remove(word)  # drops one occurrence of the target word, as before\n",
"            word_reference.setdefault(word, []).extend(neighbours)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Tally, for each top 1-gram, how many times every neighbouring word was collected.\n",
"freqdist_reference = {word: Counter(neighbours) for word, neighbours in word_reference.items()}"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"morena\n",
"Counter({'cuitlahuac': 6, 'verdad': 5, 'arriba': 5, 'ofender': 5, 'mera': 4, 'cambio': 4, 'suficiente': 4, 'gane': 4, 'manitas': 4, 'rateria': 4, 'oran': 4, 'apoya': 4, 'quieren': 4, 'floristeros': 3, 'hagamos': 3, 'meyoooo': 3, 'votar': 3, 'fuerza': 3, 'operacion': 3, 'chinge': 3, 'gobernatura': 3, 'deberle': 3, 'salen': 3, 'realmente': 3, 'embarrados': 3, 'cobardes': 3, 'cambie': 3, 'realidad': 3, 'estado': 2, 'duela': 2, 'pelea': 2, 'disen': 2, 'votando': 2, 'pagina': 2, 'dudo.somos': 2, 'confirma': 2, 'helizabeth': 2, 'engañar': 2, 'amlo': 2, 'útil': 2, 'aliado': 2, 'juegen': 2, 'amañadas': 2, 'vendrá': 2, 'puta': 2, 'dejamos': 2, 'alianzas': 2, 'universidad': 2, 'puro': 1, 'veracruzanos': 1, 'presidente': 1, 'opcion': 1, 'morales,': 1, 'maricelita': 1, 'next': 1, 'http': 1, 'pusieron': 1, 'rival': 1, 'ratas': 1, 'ganando': 1, 'pollito': 1, 'linares': 1, 'despierten': 1, 'madre': 1, 'presentó': 1, 'adios': 1, 'jajaj': 1, 'pelan': 1, 'aguas': 1, 'encuestas': 1, 'comentarios': 1, 'creer': 1, 'andrés': 1, 'ganar': 1, 'votemos': 1, 'debate': 1, 'distractor': 1, 'limpio': 1, 'porkeria': 1, 'rateros': 1, 'robarle': 1, 'sale': 1})\n"
]
}
],
"source": [
"# Example of the count vector formed.\n",
"# The word is named explicitly: the old positional lookup (index 8 of the ranking) was never\n",
"# used by the prints, which were hard-coded to \"morena\".\n",
"palabra = \"morena\"\n",
"print(palabra)\n",
"print(freqdist_reference[palabra])"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Load the SDAL dictionary used for pleasantness scores.\n",
"import json\n",
"# Explicit UTF-8: without it open() uses the platform default encoding, which can\n",
"# mis-decode any non-ASCII entries in the file.\n",
"with open('sdal.json', encoding='utf-8') as data_file:\n",
"    data = json.load(data_file)\n",
"# Candidate words for matching are the dictionary's top-level keys.\n",
"sdal_words = list(data)"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Library to match words in comments with words in the sdal dictionary \n",
"from fuzzywuzzy import process"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"basta\n",
"perder\n",
"dios\n",
"marrana\n",
"roban\n",
"ángel\n",
"arriba\n",
"hagan\n",
"pinche\n",
"javier\n",
"gobierno\n",
"priistas\n",
"partido\n",
"sabe\n",
"puto\n",
"votar\n",
"angel\n",
"rata\n",
"inseguridad\n",
"tiempo\n",
"pendejo\n",
"nieto\n",
"cambio\n",
"linares\n",
"robaron\n",
"diferencia\n",
"mamada\n",
"despensa\n",
"madre\n",
"engañar\n",
"yunes\n",
"ganar\n",
"veracruz\n",
"ratas\n",
"estado\n",
"encuestas\n",
"corrupto\n",
"robo\n",
"dinero\n",
"poder\n",
"robado\n",
"pendeja\n",
"políticos\n",
"mala\n",
"voto\n",
"ahogado\n",
"limpio\n",
"miguel\n",
"gobernador\n",
"duarte\n",
"hagamos\n",
"morena\n",
"años\n",
"mierda\n",
"puta\n",
"elecciones\n",
"pueblo\n",
"miedo\n",
"chingar\n",
"quieren\n",
"gane\n",
"candidato\n",
"gente\n",
"delincuencia\n",
"rateros\n",
"patadas\n",
"señores\n",
"favor\n",
"fidel\n",
"historia\n",
"único\n"
]
}
],
"source": [
"# Get overall pleasantness as the sum of the contributions of each word occurrence for each top 1-gram found:\n",
"# every neighbouring word contributes its matched SDAL pleasantness, weighted by its share of\n",
"# all neighbour occurrences collected for that 1-gram.\n",
"word_feeling = {}\n",
"sdal_match_cache = {}  # neighbour -> extractOne result, so each distinct word is fuzzy-matched only once\n",
"for word in freqdist_reference:\n",
"    print(word)  # prints each 1-gram as it is processed\n",
"    total_occurrences = len(word_reference[word])\n",
"    current_grade = 0.0\n",
"    for subword, occurrences in freqdist_reference[word].items():\n",
"        if subword not in sdal_match_cache:\n",
"            sdal_match_cache[subword] = process.extractOne(subword, sdal_words)\n",
"        match = sdal_match_cache[subword]\n",
"        if match is not None:\n",
"            # match[0] is the matched SDAL word.\n",
"            current_grade += float(data[match[0]]['pleasantness']) * (float(occurrences) / total_occurrences)\n",
"    word_feeling[word] = current_grade"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.32982105263157896"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_feeling[\"mala\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment