Created
April 21, 2016 01:53
-
-
Save ricalanis/12fe91e83e868672b65c21d0e7fb8f1c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from pymongo import MongoClient, ASCENDING, DESCENDING\n", | |
"from nltk import FreqDist\n", | |
"import os\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from collections import Counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read the MongoDB connection string from the 'mongo_url' environment variable.\n", | |
"# NOTE(review): os.getenv returns None when the variable is unset; confirm which\n", | |
"# server MongoClient(None) should be allowed to reach in that case.\n", | |
"MONGO_URL = os.getenv('mongo_url')\n", | |
"client = MongoClient(MONGO_URL)\n", | |
"# Handle to the database named \"comments\".\n", | |
"db = client[\"comments\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Page and candidate whose comments are analysed; both values are used as\n", | |
"# equality filters in the comments query.\n", | |
"page_id = \"1037553166317439\"\n", | |
"candidate_id = \"fgcabezadevaca\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Select every comment for this page/candidate that carries a post_id.\n", | |
"query = db.comments.find({\n", | |
"    \"page_id\": page_id,\n", | |
"    \"candidate_id\": candidate_id,\n", | |
"    \"post_id\": {\"$exists\": True},\n", | |
"}).batch_size(1000)\n", | |
"output = []  # not read in this cell; kept in case another cell relies on it\n", | |
"ngrams_cabeza = {}  # key -> n-grams converted to tuples (hashable, so they can be counted)\n", | |
"ngrams_lists = {}   # key -> the same n-grams, left as stored on the comment\n", | |
"for comment in query:\n", | |
"    for key, ngrams in comment[\"ngrams\"].items():\n", | |
"        # extend() appends in place; the previous `a = a + [...]` form recopied\n", | |
"        # the whole accumulated list for every comment.\n", | |
"        ngrams_cabeza.setdefault(key, []).extend(tuple(ngram_single) for ngram_single in ngrams)\n", | |
"        ngrams_lists.setdefault(key, []).extend(ngrams)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Peek at the first 70 entries collected under key \"1\".\n", | |
"ngrams_lists[\"1\"][:70]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def ngram_histogram(ngrams_input):\n", | |
"    \"\"\"Rank the n-grams in each group by how often they occur.\n", | |
"\n", | |
"    Args:\n", | |
"        ngrams_input: mapping of group key -> iterable of hashable n-grams\n", | |
"            (this notebook passes lists of tuples).\n", | |
"\n", | |
"    Returns:\n", | |
"        dict with the same keys; each value is a list of (ngram, count)\n", | |
"        pairs sorted by descending count. Ties keep the counter's own\n", | |
"        iteration order.\n", | |
"    \"\"\"\n", | |
"    output = {}\n", | |
"    for idx in ngrams_input:\n", | |
"        # most_common() with no argument is a stable sort on count, descending,\n", | |
"        # i.e. the ordering the earlier explicit sorted(..., reverse=True) gave.\n", | |
"        # Each group is now counted once instead of twice.\n", | |
"        output[idx] = Counter(ngrams_input[idx]).most_common()\n", | |
"    return output" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Frequency-rank the tuple-form n-grams collected for this page.\n", | |
"ngram_histogram_page = ngram_histogram(ngrams_input=ngrams_cabeza)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"231\n" | |
] | |
} | |
], | |
"source": [ | |
"# Count the matching comments on the server.\n", | |
"# Cursor.count() is deprecated and no longer exists in PyMongo 4;\n", | |
"# count_documents() is its replacement (requires PyMongo >= 3.7).\n", | |
"# batch_size is dropped because it has no effect on a count.\n", | |
"query_count = db.comments.count_documents({\n", | |
"    \"page_id\": page_id,\n", | |
"    \"candidate_id\": candidate_id,\n", | |
"    \"post_id\": {\"$exists\": True},\n", | |
"})\n", | |
"print(query_count)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(('duarte',), 27),\n", | |
" (('yunes',), 23),\n", | |
" (('pueblo',), 22),\n", | |
" (('voto',), 21),\n", | |
" (('miguel',), 19),\n", | |
" (('veracruz',), 16),\n", | |
" (('morena',), 16),\n", | |
" (('linares',), 12),\n", | |
" (('ratas',), 12),\n", | |
" (('madre',), 11),\n", | |
" (('estado',), 10),\n", | |
" (('partido',), 10),\n", | |
" (('angel',), 10),\n", | |
" (('gente',), 10),\n", | |
" (('dinero',), 10),\n", | |
" (('poder',), 10),\n", | |
" (('basta',), 9),\n", | |
" (('gobierno',), 8),\n", | |
" (('cambio',), 8),\n", | |
" (('arriba',), 8),\n", | |
" (('quieren',), 7),\n", | |
" (('gobernador',), 7),\n", | |
" (('encuestas',), 6),\n", | |
" (('ganar',), 6),\n", | |
" (('pinche',), 6),\n", | |
" (('chingar',), 6),\n", | |
" (('marrana',), 6),\n", | |
" (('patadas',), 6),\n", | |
" (('robo',), 6),\n", | |
" (('votar',), 6),\n", | |
" (('mierda',), 6),\n", | |
" (('rata',), 5),\n", | |
" (('favor',), 5),\n", | |
" (('engañar',), 5),\n", | |
" (('despensa',), 5),\n", | |
" (('ángel',), 5),\n", | |
" (('dios',), 4),\n", | |
" (('ahogado',), 4),\n", | |
" (('inseguridad',), 4),\n", | |
" (('sabe',), 4),\n", | |
" (('fidel',), 4),\n", | |
" (('priistas',), 4),\n", | |
" (('políticos',), 4),\n", | |
" (('robado',), 4),\n", | |
" (('rateros',), 4),\n", | |
" (('historia',), 4),\n", | |
" (('miedo',), 4),\n", | |
" (('único',), 4),\n", | |
" (('javier',), 4),\n", | |
" (('limpio',), 4),\n", | |
" (('puta',), 4),\n", | |
" (('elecciones',), 4),\n", | |
" (('gane',), 4),\n", | |
" (('hagamos',), 4),\n", | |
" (('perder',), 4),\n", | |
" (('diferencia',), 3),\n", | |
" (('nieto',), 3),\n", | |
" (('señores',), 3),\n", | |
" (('años',), 3),\n", | |
" (('mala',), 3),\n", | |
" (('pendejo',), 3),\n", | |
" (('roban',), 3),\n", | |
" (('hagan',), 3),\n", | |
" (('afuera',), 3),\n", | |
" (('delincuencia',), 3),\n", | |
" (('tiempo',), 3),\n", | |
" (('puto',), 3),\n", | |
" (('corrupto',), 3),\n", | |
" (('candidato',), 3),\n", | |
" (('robaron',), 3),\n", | |
" (('mamada',), 3),\n", | |
" (('pendeja',), 3),\n", | |
" (('vida',), 3),\n", | |
" (('chinge',), 3),\n", | |
" (('pura',), 3),\n", | |
" (('pobres',), 3),\n", | |
" (('verdad',), 3),\n", | |
" (('pinches',), 3),\n", | |
" (('votando',), 3),\n", | |
" (('landa',), 3),\n", | |
" (('sacar',), 3),\n", | |
" (('sucia',), 3),\n", | |
" (('política',), 3),\n", | |
" (('gracias',), 3),\n", | |
" (('enjuicien',), 3),\n", | |
" (('pesos',), 3),\n", | |
" (('pobre',), 3),\n", | |
" (('p.r.i',), 3),\n", | |
" (('robar',), 3),\n", | |
" (('miyuli',), 3),\n", | |
" (('cuitlahuac',), 3),\n", | |
" (('hector',), 3),\n", | |
" (('carcel',), 3),\n", | |
" (('vaya',), 3),\n", | |
" (('verga',), 3),\n", | |
" (('bola',), 3),\n", | |
" (('perro',), 3),\n", | |
" (('sale',), 2),\n", | |
" (('abren',), 2),\n", | |
" (('tambo',), 2),\n", | |
" (('sector',), 2),\n", | |
" (('disen',), 2),\n", | |
" (('dejemos',), 2),\n", | |
" (('oportunidades',), 2),\n", | |
" (('vote',), 2),\n", | |
" (('promesas',), 2),\n", | |
" (('miserias',), 2),\n", | |
" (('peña',), 2),\n", | |
" (('boto',), 2),\n", | |
" (('panistas',), 2),\n", | |
" (('ladrones',), 2),\n", | |
" (('secuestros',), 2),\n", | |
" (('caos',), 2),\n", | |
" (('andan',), 2),\n", | |
" (('debería',), 2),\n", | |
" (('destruir',), 2),\n", | |
" (('puro',), 2),\n", | |
" (('guerra',), 2),\n", | |
" (('dando',), 2),\n", | |
" (('recursos',), 2),\n", | |
" (('tapar',), 2),\n", | |
" (('ojos',), 2),\n", | |
" (('aganar',), 2),\n", | |
" (('sigan',), 2),\n", | |
" (('jamas',), 2),\n", | |
" (('nose',), 2),\n", | |
" (('gobernatura',), 2),\n", | |
" (('crimen',), 2),\n", | |
" (('jajajaja',), 2),\n", | |
" (('viene',), 2),\n", | |
" (('deja',), 2),\n", | |
" (('apoyo',), 2),\n", | |
" (('tapó',), 2),\n", | |
" (('dejen',), 2),\n", | |
" (('azul',), 2),\n", | |
" (('riko',), 2),\n", | |
" (('esperaban',), 2),\n", | |
" (('contaminación',), 2),\n", | |
" (('jente',), 2),\n", | |
" (('objetivo',), 2),\n", | |
" (('somos',), 2),\n", | |
" (('kien',), 2),\n", | |
" (('corrupcion',), 2),\n", | |
" (('idiota',), 2),\n", | |
" (('radio',), 2),\n", | |
" (('porkeria',), 2),\n", | |
" (('cambiar',), 2),\n", | |
" (('saca',), 2),\n", | |
" (('huevos',), 2),\n", | |
" (('gordo',), 2),\n", | |
" (('buscar',), 2),\n", | |
" (('chinguen',), 2),\n", | |
" (('pendejos',), 2),\n", | |
" (('sera',), 2),\n", | |
" (('humo',), 2),\n", | |
" (('culpa',), 2),\n", | |
" (('claro',), 2),\n", | |
" (('ustedes',), 2),\n", | |
" (('priístas',), 2),\n", | |
" (('apoyando',), 2),\n", | |
" (('verde',), 2),\n", | |
" (('malos',), 2),\n", | |
" (('peor',), 2),\n", | |
" (('fuerte',), 2),\n", | |
" (('pederasta',), 2),\n", | |
" (('politica',), 2),\n", | |
" (('estarán',), 2),\n", | |
" (('carreteras',), 2),\n", | |
" (('queda',), 2),\n", | |
" (('creer',), 2),\n", | |
" (('hagas',), 2),\n", | |
" (('boca',), 2),\n", | |
" (('pristas',), 2),\n", | |
" (('santo',), 2),\n", | |
" (('siquiera',), 2),\n", | |
" (('mamadas',), 2),\n", | |
" (('yuni',), 2),\n", | |
" (('pasa',), 2),\n", | |
" (('.por',), 2),\n", | |
" (('adios',), 2),\n", | |
" (('patria',), 2),\n", | |
" (('pensionados',), 2),\n", | |
" (('credibilidad',), 2),\n", | |
" (('gobernar',), 2),\n", | |
" (('pelea',), 2),\n", | |
" (('gana',), 2),\n", | |
" (('alos',), 2),\n", | |
" (('persecucion',), 2),\n", | |
" (('lucha',), 2),\n", | |
" (('foto',), 2),\n", | |
" (('carros',), 2),\n", | |
" (('escuchados',), 2),\n", | |
" (('maquinaria',), 2),\n", | |
" (('partidos',), 2),\n", | |
" (('ñaka',), 2),\n", | |
" (('votemos',), 2),\n", | |
" (('veracruzano',), 2),\n", | |
" (('opcion',), 2),\n", | |
" (('españa',), 2),\n", | |
" (('acuerdo',), 2),\n", | |
" (('hueso',), 2),\n", | |
" (('rostros',), 2),\n", | |
" (('suficiente',), 2),\n", | |
" (('basura',), 2),\n", | |
" (('seguir',), 2),\n", | |
" (('kieren',), 2),\n", | |
" (('conforman',), 2),\n", | |
" (('veracruzanos',), 2),\n", | |
" (('mayoría',), 2),\n", | |
" (('porquería',), 2),\n", | |
" (('meter',), 2),\n", | |
" (('destrozadas',), 1),\n", | |
" (('apasiones',), 1),\n", | |
" (('criticas',), 1),\n", | |
" (('regla',), 1),\n", | |
" (('nexos',), 1),\n", | |
" (('desesperada',), 1),\n", | |
" (('durmiendo',), 1),\n", | |
" (('maricelita',), 1),\n", | |
" (('salinas',), 1),\n", | |
" (('mauser',), 1),\n", | |
" (('negra',), 1),\n", | |
" (('fantan',), 1),\n", | |
" (('vigilaremos',), 1),\n", | |
" (('oigan',), 1),\n", | |
" (('gobernando',), 1),\n", | |
" (('corruptos',), 1),\n", | |
" (('maje',), 1),\n", | |
" (('intentan',), 1),\n", | |
" (('nomas',), 1),\n", | |
" (('titulares',), 1),\n", | |
" (('boleta',), 1),\n", | |
" (('marrano',), 1),\n", | |
" (('ventajistas',), 1),\n", | |
" (('cantar',), 1),\n", | |
" (('apollo..',), 1),\n", | |
" (('haga',), 1),\n", | |
" (('hipocrita',), 1),\n", | |
" (('pensaron',), 1),\n", | |
" (('tapadera',), 1),\n", | |
" (('vota',), 1),\n", | |
" (('prii',), 1),\n", | |
" (('derechos',), 1),\n", | |
" (('obras',), 1),\n", | |
" (('llene',), 1),\n", | |
" (('regalando',), 1),\n", | |
" (('sudor',), 1),\n", | |
" (('morales,',), 1),\n", | |
" (('ahutoridad',), 1),\n", | |
" (('crean',), 1),\n", | |
" (('definido',), 1),\n", | |
" (('chiflen',), 1),\n", | |
" (('erradicarla',), 1),\n", | |
" (('linda',), 1),\n", | |
" (('escribio',), 1),\n", | |
" (('garras',), 1),\n", | |
" (('encarcele',), 1),\n", | |
" (('caca',), 1),\n", | |
" (('quiero',), 1),\n", | |
" (('plena',), 1),\n", | |
" (('organisado',), 1),\n", | |
" (('namas',), 1),\n", | |
" (('cabecilla',), 1),\n", | |
" (('mantenerse',), 1),\n", | |
" (('priiu',), 1),\n", | |
" (('espero',), 1),\n", | |
" (('tambien',), 1),\n", | |
" (('saco',), 1),\n", | |
" (('ahogados',), 1),\n", | |
" (('nota..',), 1),\n", | |
" (('tinoco',), 1),\n", | |
" (('nunka',), 1),\n", | |
" (('responsable',), 1),\n", | |
" (('digas',), 1),\n", | |
" (('migel',), 1),\n", | |
" (('noticias',), 1),\n", | |
" (('complice',), 1),\n", | |
" (('gorda',), 1),\n", | |
" (('lenis',), 1),\n", | |
" (('logotipo',), 1),\n", | |
" (('muestran',), 1),\n", | |
" (('martirizar',), 1),\n", | |
" (('pri.',), 1),\n", | |
" (('repentina',), 1),\n", | |
" (('expertos',), 1),\n", | |
" (('spots',), 1),\n", | |
" (('comprar',), 1),\n", | |
" (('murió',), 1),\n", | |
" (('manejos',), 1),\n", | |
" (('arreglos',), 1),\n", | |
" (('barrio',), 1),\n", | |
" (('nación',), 1),\n", | |
" (('sige',), 1),\n", | |
" (('tijera',), 1),\n", | |
" (('sufrir',), 1),\n", | |
" (('tricolor',), 1),\n", | |
" (('familias',), 1),\n", | |
" (('espanten',), 1),\n", | |
" (('cárcel',), 1),\n", | |
" (('ambiente',), 1),\n", | |
" (('impunidad',), 1),\n", | |
" (('perderá',), 1),\n", | |
" (('viejos',), 1),\n", | |
" (('primierda',), 1),\n", | |
" (('botar',), 1),\n", | |
" (('pierdan',), 1),\n", | |
" (('tiempo.que',), 1),\n", | |
" (('acabar',), 1),\n", | |
" (('hogado',), 1),\n", | |
" (('conozco',), 1),\n", | |
" (('golpeado',), 1),\n", | |
" (('quita',), 1),\n", | |
" (('pagaremos',), 1),\n", | |
" (('ayúdame',), 1),\n", | |
" (('complices',), 1),\n", | |
" (('parenle',), 1),\n", | |
" (('quiebra',), 1),\n", | |
" (('trabajen',), 1),\n", | |
" (('ulibarri',), 1),\n", | |
" (('arribistas',), 1),\n", | |
" (('salud',), 1),\n", | |
" (('cara',), 1),\n", | |
" (('opine',), 1),\n", | |
" (('santanas',), 1),\n", | |
" (('beltran',), 1),\n", | |
" (('condenan',), 1),\n", | |
" (('..están',), 1),\n", | |
" (('preocupados',), 1),\n", | |
" (('pesimas',), 1),\n", | |
" (('aplicar',), 1),\n", | |
" (('información',), 1),\n", | |
" (('chingarse',), 1),\n", | |
" (('aser',), 1),\n", | |
" (('dejamos',), 1),\n", | |
" (('blanquiazules',), 1),\n", | |
" (('giselle',), 1),\n", | |
" (('sinonimos',), 1),\n", | |
" (('encarcel',), 1),\n", | |
" (('politicos',), 1),\n", | |
" (('flota',), 1),\n", | |
" (('kuleros',), 1),\n", | |
" (('andrés',), 1),\n", | |
" (('totalmente',), 1),\n", | |
" (('alimentado',), 1),\n", | |
" (('rival',), 1),\n", | |
" (('creíble',), 1),\n", | |
" (('buscan',), 1),\n", | |
" (('rstado',), 1),\n", | |
" (('esfuerzo',), 1),\n", | |
" (('tlacuache',), 1),\n", | |
" (('amparo',), 1),\n", | |
" (('trataran',), 1),\n", | |
" (('asistir',), 1),\n", | |
" (('sabido',), 1),\n", | |
" (('militantes',), 1),\n", | |
" (('verán',), 1),\n", | |
" (('perdido',), 1),\n", | |
" (('encarcelo',), 1),\n", | |
" (('sptm',), 1),\n", | |
" (('linare',), 1),\n", | |
" (('falsedades',), 1),\n", | |
" (('cambiemos',), 1),\n", | |
" (('violencia',), 1),\n", | |
" (('..no',), 1),\n", | |
" (('administrado',), 1),\n", | |
" (('engañando',), 1),\n", | |
" (('próspera',), 1),\n", | |
" (('mantecoso',), 1),\n", | |
" (('hasen',), 1),\n", | |
" (('continúen',), 1),\n", | |
" (('terminar',), 1),\n", | |
" (('respeto',), 1),\n", | |
" (('méxico',), 1),\n", | |
" (('levantones',), 1),\n", | |
" (('opinar',), 1),\n", | |
" (('evidencias',), 1),\n", | |
" (('hospitales',), 1),\n", | |
" (('querían',), 1),\n", | |
" (('empezando',), 1),\n", | |
" (('votaremod',), 1),\n", | |
" (('morral,',), 1),\n", | |
" (('mayl',), 1),\n", | |
" (('represores',), 1),\n", | |
" (('ganarån',), 1),\n", | |
" (('sienten',), 1),\n", | |
" (('transas',), 1),\n", | |
" (('choro',), 1),\n", | |
" (('amarillistas',), 1),\n", | |
" (('asquerosas',), 1),\n", | |
" (('alianzas',), 1),\n", | |
" (('gandero',), 1),\n", | |
" (('promete',), 1),\n", | |
" (('carajo',), 1),\n", | |
" (('paralisado',), 1),\n", | |
" (('aguas',), 1),\n", | |
" (('tachando',), 1),\n", | |
" (('acresentan',), 1),\n", | |
" (('converse',), 1),\n", | |
" (('gandállas',), 1),\n", | |
" (('vete',), 1),\n", | |
" (('verdad..',), 1),\n", | |
" (('taxis',), 1),\n", | |
" (('wevos',), 1),\n", | |
" (('mamen',), 1),\n", | |
" (('viviendo',), 1),\n", | |
" (('tricolor.porque',), 1),\n", | |
" (('creatividad',), 1),\n", | |
" (('cosa',), 1),\n", | |
" (('hartazgo',), 1),\n", | |
" (('visual',), 1),\n", | |
" (('todavia',), 1),\n", | |
" (('habla',), 1),\n", | |
" (('hdtsrp',), 1),\n", | |
" (('saquear',), 1),\n", | |
" (('regalar',), 1),\n", | |
" (('perjudicar',), 1),\n", | |
" (('ratassssssss',), 1),\n", | |
" (('cumpla',), 1),\n", | |
" (('vendrá',), 1),\n", | |
" (('maldita',), 1),\n", | |
" (('razonemos',), 1),\n", | |
" (('común',), 1),\n", | |
" (('llevó',), 1),\n", | |
" (('desgracia',), 1),\n", | |
" (('dictaduria',), 1),\n", | |
" (('.como',), 1),\n", | |
" (('vuelva',), 1),\n", | |
" (('circo',), 1),\n", | |
" (('ganando',), 1),\n", | |
" (('medios',), 1),\n", | |
" (('todologos',), 1),\n", | |
" (('pensar',), 1),\n", | |
" (('llegan',), 1),\n", | |
" (('farmacias',), 1),\n", | |
" (('metio',), 1),\n", | |
" (('isidro',), 1),\n", | |
" (('extorsiones',), 1),\n", | |
" (('albarado',), 1),\n", | |
" (('duro',), 1),\n", | |
" (('tonteras',), 1),\n", | |
" (('dejense',), 1),\n", | |
" (('personas',), 1),\n", | |
" (('esperarse',), 1),\n", | |
" (('pistas',), 1),\n", | |
" (('tiempos',), 1),\n", | |
" (('recuerdos',), 1),\n", | |
" (('pagar',), 1),\n", | |
" (('podridas',), 1),\n", | |
" (('tamaulipas',), 1),\n", | |
" (('licenciada',), 1),\n", | |
" (('juntos',), 1),\n", | |
" (('junio',), 1),\n", | |
" (('derecho',), 1),\n", | |
" (('cierren',), 1),\n", | |
" (('tiren',), 1),\n", | |
" (('horror',), 1),\n", | |
" (('mdre',), 1),\n", | |
" (('misma.mierda',), 1),\n", | |
" (('unico',), 1),\n", | |
" (('nacer',), 1),\n", | |
" (('lolo',), 1),\n", | |
" (('lotería',), 1),\n", | |
" (('odjetivo',), 1),\n", | |
" (('maldito',), 1),\n", | |
" (('dirigencia',), 1),\n", | |
" (('enamorado',), 1),\n", | |
" (('jajajajjaja',), 1),\n", | |
" (('traficante',), 1),\n", | |
" (('avento',), 1),\n", | |
" (('votan',), 1),\n", | |
" (('ande',), 1),\n", | |
" (('engañen',), 1),\n", | |
" (('pollito',), 1),\n", | |
" (('bonitos',), 1),\n", | |
" (('fiesta',), 1),\n", | |
" (('herrera',), 1),\n", | |
" (('parada',), 1),\n", | |
" (('algun',), 1),\n", | |
" (('calzones',), 1),\n", | |
" (('prineros',), 1),\n", | |
" (('mera',), 1),\n", | |
" (('encerrar',), 1),\n", | |
" (('etc.etc.etc',), 1),\n", | |
" (('ofrecer',), 1),\n", | |
" (('cono',), 1),\n", | |
" (('coructos',), 1),\n", | |
" (('neta',), 1),\n", | |
" (('electorado..para',), 1),\n", | |
" (('ilicitos',), 1),\n", | |
" (('dense😁',), 1),\n", | |
" (('convirtió',), 1),\n", | |
" (('ecologista',), 1),\n", | |
" (('eres',), 1),\n", | |
" (('cabeza',), 1),\n", | |
" (('ningun',), 1),\n", | |
" (('bullanguero',), 1),\n", | |
" (('next',), 1),\n", | |
" (('tomaré',), 1),\n", | |
" (('puras',), 1),\n", | |
" (('gorra',), 1),\n", | |
" (('cree',), 1),\n", | |
" (('engaña',), 1),\n", | |
" (('demasiado',), 1),\n", | |
" (('amenasan',), 1),\n", | |
" (('kambiar',), 1),\n", | |
" (('ilicito',), 1),\n", | |
" (('juicio',), 1),\n", | |
" (('saben',), 1),\n", | |
" (('informarse',), 1),\n", | |
" (('aleman',), 1),\n", | |
" (('seran',), 1),\n", | |
" (('regresa',), 1),\n", | |
" (('//m.facebook.com/story.php',), 1),\n", | |
" (('pri..',), 1),\n", | |
" (('bolsa',), 1),\n", | |
" (('depende',), 1),\n", | |
" (('yeno',), 1),\n", | |
" (('duela',), 1),\n", | |
" (('vecinos',), 1),\n", | |
" (('alegria',), 1),\n", | |
" (('mediocres',), 1),\n", | |
" (('escena',), 1),\n", | |
" (('tevoy',), 1),\n", | |
" (('odio',), 1),\n", | |
" (('marcada',), 1),\n", | |
" (('juegen',), 1),\n", | |
" (('voluntad',), 1),\n", | |
" (('quejan',), 1),\n", | |
" (('herramienta',), 1),\n", | |
" (('color',), 1),\n", | |
" (('hacemos',), 1),\n", | |
" (('aceptan',), 1),\n", | |
" (('defiendo',), 1),\n", | |
" (('espaeramos',), 1),\n", | |
" (('dejaron',), 1),\n", | |
" (('dudo.somos',), 1),\n", | |
" (('investiguen',), 1),\n", | |
" (('pri-meramente',), 1),\n", | |
" (('ablen',), 1),\n", | |
" (('conociera',), 1),\n", | |
" (('cantidad',), 1),\n", | |
" (('qerramos',), 1),\n", | |
" (('junta',), 1),\n", | |
" (('salpicar',), 1),\n", | |
" (('prometio',), 1),\n", | |
" (('chicharrones',), 1),\n", | |
" (('saludos',), 1),\n", | |
" (('libertad',), 1),\n", | |
" (('aparezca',), 1),\n", | |
" (('seeeeeeeee',), 1),\n", | |
" (('complises',), 1),\n", | |
" (('cambie',), 1),\n", | |
" (('rios',), 1),\n", | |
" (('justicia',), 1),\n", | |
" (('jajaja',), 1),\n", | |
" (('robaran',), 1),\n", | |
" (('pena',), 1),\n", | |
" (('rancho',), 1),\n", | |
" (('iran',), 1),\n", | |
" (('sucios',), 1),\n", | |
" (('precion',), 1),\n", | |
" (('asesinatos',), 1),\n", | |
" (('.ban',), 1),\n", | |
" (('niño',), 1),\n", | |
" (('ángeles',), 1),\n", | |
" (('señoras',), 1),\n", | |
" (('federal',), 1),\n", | |
" (('encuesta',), 1),\n", | |
" (('resultado',), 1),\n", | |
" (('error',), 1),\n", | |
" (('blusas',), 1),\n", | |
" (('señor',), 1),\n", | |
" (('correspondientes',), 1),\n", | |
" (('quisiera',), 1),\n", | |
" (('doble',), 1),\n", | |
" (('alegre',), 1),\n", | |
" (('asta',), 1),\n", | |
" (('acorralado',), 1),\n", | |
" (('eras',), 1),\n", | |
" (('darle',), 1),\n", | |
" (('ganaba',), 1),\n", | |
" (('pri..jajajajajajjaja',), 1),\n", | |
" (('quiera',), 1),\n", | |
" (('alguien',), 1),\n", | |
" (('apoya',), 1),\n", | |
" (('llunes',), 1),\n", | |
" (('ganso',), 1),\n", | |
" (('novenario',), 1),\n", | |
" (('dijieron',), 1),\n", | |
" (('tamaño',), 1),\n", | |
" (('quedase',), 1),\n", | |
" (('aviones',), 1),\n", | |
" (('priista',), 1),\n", | |
" (('perdemos',), 1),\n", | |
" (('diputados',), 1),\n", | |
" (('servirá',), 1),\n", | |
" (('capaz',), 1),\n", | |
" (('alianzas,',), 1),\n", | |
" (('hablar',), 1),\n", | |
" (('hermano',), 1),\n", | |
" (('cristo',), 1),\n", | |
" (('apliquen',), 1),\n", | |
" (('jjjjjj',), 1),\n", | |
" (('.sumado',), 1),\n", | |
" (('independiente',), 1),\n", | |
" (('sinvergüenzadas',), 1),\n", | |
" (('sacan',), 1),\n", | |
" (('mañana',), 1),\n", | |
" (('oran',), 1),\n", | |
" (('descubran',), 1),\n", | |
" (('estrategia',), 1),\n", | |
" (('juego',), 1),\n", | |
" (('apartidista',), 1),\n", | |
" (('cuál',), 1),\n", | |
" (('politico',), 1),\n", | |
" (('jugar',), 1),\n", | |
" (('publicando',), 1),\n", | |
" (('ofender',), 1),\n", | |
" (('perdió',), 1),\n", | |
" (('ptro',), 1),\n", | |
" (('ciegos',), 1),\n", | |
" (('gato',), 1),\n", | |
" (('impuso',), 1),\n", | |
" (('hundir',), 1),\n", | |
" (('crédito',), 1),\n", | |
" (('denuncias',), 1),\n", | |
" (('lodo',), 1),\n", | |
" (('mellooooo',), 1),\n", | |
" (('pantalla',), 1),\n", | |
" (('desaparecidos',), 1),\n", | |
" (('mėxico',), 1),\n", | |
" (('planeo',), 1),\n", | |
" (('chingue',), 1),\n", | |
" (('delicuantazo',), 1),\n", | |
" (('apoyandolos',), 1),\n", | |
" (('derecha',), 1),\n", | |
" (('haces',), 1),\n", | |
" (('manipulación',), 1),\n", | |
" (('ganado',), 1),\n", | |
" (('tlacuaches',), 1),\n", | |
" (('jajajajajajajajajajajajajaja',), 1),\n", | |
" (('olvidare',), 1),\n", | |
" (('fantasias',), 1),\n", | |
" (('jajaj',), 1),\n", | |
" (('asesinos',), 1),\n", | |
" (('arrastrado',), 1),\n", | |
" (('mataron',), 1),\n", | |
" (('sorprende',), 1),\n", | |
" (('porq',), 1),\n", | |
" (('qieren',), 1),\n", | |
" (('madres',), 1),\n", | |
" (('convencer',), 1),\n", | |
" (('ver´',), 1),\n", | |
" (('trepan',), 1),\n", | |
" (('útil',), 1),\n", | |
" (('mueran',), 1),\n", | |
" (('tirar',), 1),\n", | |
" (('robarle',), 1),\n", | |
" (('ayudando',), 1),\n", | |
" (('campañas',), 1),\n", | |
" (('recurrir',), 1),\n", | |
" (('rependejos',), 1),\n", | |
" (('boten',), 1),\n", | |
" (('beltrones',), 1),\n", | |
" (('amlo',), 1),\n", | |
" (('sociales',), 1),\n", | |
" (('llamada',), 1),\n", | |
" (('abrazo',), 1),\n", | |
" (('contiendas',), 1),\n", | |
" (('maestros',), 1),\n", | |
" (('manipular',), 1),\n", | |
" (('pan*',), 1),\n", | |
" (('dejara',), 1),\n", | |
" (('ofreciendo',), 1),\n", | |
" (('helizabeth',), 1),\n", | |
" (('sacer',), 1),\n", | |
" (('orejas',), 1),\n", | |
" (('morenaveracruz',), 1),\n", | |
" (('instituto',), 1),\n", | |
" (('mentira',), 1),\n", | |
" (('primaria',), 1),\n", | |
" (('story_fbid=10207810556173776',), 1),\n", | |
" (('contingencia',), 1),\n", | |
" (('dure',), 1),\n", | |
" (('lacras',), 1),\n", | |
" (('frutas',), 1),\n", | |
" (('quieres..',), 1),\n", | |
" (('llamadas',), 1),\n", | |
" (('pidamosle',), 1),\n", | |
" (('coatzacoalcos',), 1),\n", | |
" (('pertenecen',), 1),\n", | |
" (('cinismo',), 1),\n", | |
" (('tipico',), 1),\n", | |
" (('despedida',), 1),\n", | |
" (('realidad',), 1),\n", | |
" (('ignorantes',), 1),\n", | |
" (('drenajes',), 1),\n", | |
" (('daran',), 1),\n", | |
" (('utilizando',), 1),\n", | |
" (('papapa',), 1),\n", | |
" (('estan',), 1),\n", | |
" (('losque',), 1),\n", | |
" (('pobresito',), 1),\n", | |
" (('triquiñuelas',), 1),\n", | |
" (('llamo',), 1),\n", | |
" (('cansa',), 1),\n", | |
" (('televisión',), 1),\n", | |
" (('axiones',), 1),\n", | |
" (('carvallo',), 1),\n", | |
" (('sintiendo',), 1),\n", | |
" (('comentarios',), 1),\n", | |
" (('memoria',), 1),\n", | |
" (('delgado',), 1),\n", | |
" (('impartición',), 1),\n", | |
" (('esquinas',), 1),\n", | |
" (('meyoooo',), 1),\n", | |
" (('tantito',), 1),\n", | |
" (('manuel',), 1),\n", | |
" (('estupido',), 1),\n", | |
" (('serian',), 1),\n", | |
" (('salir',), 1),\n", | |
" (('monos',), 1),\n", | |
" (('inútiles',), 1),\n", | |
" (('tercer',), 1),\n", | |
" (('.rostros',), 1),\n", | |
" (('votarcpor',), 1),\n", | |
" (('estafadores',), 1),\n", | |
" (('hagase',), 1),\n", | |
" (('mendingando',), 1),\n", | |
" (('robarnos',), 1),\n", | |
" (('dejando',), 1),\n", | |
" (('gobierna',), 1),\n", | |
" (('wallace',), 1),\n", | |
" (('directo',), 1),\n", | |
" (('cansare',), 1),\n", | |
" (('mostrar',), 1),\n", | |
" (('presidente',), 1),\n", | |
" (('habian',), 1),\n", | |
" (('pelar',), 1),\n", | |
" (('abusos',), 1),\n", | |
" (('trato',), 1),\n", | |
" (('saluditos',), 1),\n", | |
" (('permitamos',), 1),\n", | |
" (('fuerza',), 1),\n", | |
" (('soltar',), 1),\n", | |
" (('abrir',), 1),\n", | |
" (('berduras',), 1),\n", | |
" (('le.va',), 1),\n", | |
" (('malo',), 1),\n", | |
" (('empleos',), 1),\n", | |
" (('.vamos',), 1),\n", | |
" (('incumplidas',), 1),\n", | |
" (('nacional',), 1),\n", | |
" (('desempleo',), 1),\n", | |
" (('amañadas',), 1),\n", | |
" (('bardas',), 1),\n", | |
" (('extorsión',), 1),\n", | |
" (('carsel',), 1),\n", | |
" (('diciendo',), 1),\n", | |
" (('cochino',), 1),\n", | |
" (('grande',), 1),\n", | |
" (('estuvieron',), 1),\n", | |
" (('politiko',), 1),\n", | |
" (('enriqui',), 1),\n", | |
" (('cometan',), 1),\n", | |
" (('perteneser',), 1),\n", | |
" (('pusieron',), 1),\n", | |
" (('ojetes',), 1),\n", | |
" (('traía',), 1),\n", | |
" (('puñal',), 1),\n", | |
" (('id=1590836771',), 1),\n", | |
" (('llama',), 1),\n", | |
" (('quieras',), 1),\n", | |
" (('payasos',), 1),\n", | |
" (('abajo',), 1),\n", | |
" (('obscurito',), 1),\n", | |
" (('piensalo',), 1),\n", | |
" (('gestiones',), 1),\n", | |
" (('ceron',), 1),\n", | |
" (('come',), 1),\n", | |
" (('lujosos',), 1),\n", | |
" (('zent',), 1),\n", | |
" (('mmmm',), 1),\n", | |
" (('ignorante',), 1),\n", | |
" (('seas',), 1),\n", | |
" (('desperto',), 1),\n", | |
" (('pulso',), 1),\n", | |
" (('bienestar',), 1),\n", | |
" (('durar',), 1),\n", | |
" (('chile',), 1),\n", | |
" (('personajes',), 1),\n", | |
" (('dejarlo',), 1),\n", | |
" (('robos',), 1),\n", | |
" (('infladas',), 1),\n", | |
" (('representa',), 1),\n", | |
" (('obrador',), 1),\n", | |
" (('culeros',), 1),\n", | |
" (('intervenir',), 1),\n", | |
" (('conseguir',), 1),\n", | |
" (('avían',), 1),\n", | |
" (('oficialmente',), 1),\n", | |
" (('golpe',), 1),\n", | |
" (('pensé',), 1),\n", | |
" (('floristeros',), 1),\n", | |
" (('igualitos',), 1),\n", | |
" (('siga',), 1),\n", | |
" (('fotos..',), 1),\n", | |
" (('confirma',), 1),\n", | |
" (('desastre',), 1),\n", | |
" (('hágnme',), 1),\n", | |
" (('tontos',), 1),\n", | |
" (('otaolaurruchi',), 1),\n", | |
" (('mexicana',), 1),\n", | |
" (('https',), 1),\n", | |
" (('correrlos',), 1),\n", | |
" (('mando',), 1),\n", | |
" (('bonito',), 1),\n", | |
" (('comprando',), 1),\n", | |
" (('trniendo',), 1),\n", | |
" (('mexicanos',), 1),\n", | |
" (('occico',), 1),\n", | |
" (('coruccion',), 1),\n", | |
" (('ciega',), 1),\n", | |
" (('http',), 1),\n", | |
" (('amigos',), 1),\n", | |
" (('comer',), 1),\n", | |
" (('apoyará',), 1),\n", | |
" (('secuaces',), 1),\n", | |
" (('urbanos',), 1),\n", | |
" (('rateria',), 1),\n", | |
" (('increíblemente',), 1),\n", | |
" (('generaciones',), 1),\n", | |
" (('ciudad',), 1),\n", | |
" (('despierten',), 1),\n", | |
" (('medicamentos',), 1),\n", | |
" (('¡trepadores',), 1),\n", | |
" (('veracruz..',), 1),\n", | |
" (('loque',), 1),\n", | |
" (('enel',), 1),\n", | |
" (('dispuesto',), 1),\n", | |
" (('al.pri',), 1),\n", | |
" (('caseta',), 1),\n", | |
" (('vergüenza',), 1),\n", | |
" (('tortas',), 1),\n", | |
" (('silencio',), 1),\n", | |
" (('estimado',), 1),\n", | |
" (('ganan',), 1),\n", | |
" (('seguro',), 1),\n", | |
" (('sustento',), 1),\n", | |
" (('cárnicas',), 1),\n", | |
" (('trabaja',), 1),\n", | |
" (('recuperar',), 1),\n", | |
" (('opciones',), 1),\n", | |
" (('suya',), 1),\n", | |
" (('debate',), 1),\n", | |
" (('podrida',), 1),\n", | |
" (('repudio',), 1),\n", | |
" (('goto',), 1),\n", | |
" (('sentido',), 1),\n", | |
" (('tubo',), 1),\n", | |
" (('taller',), 1),\n", | |
" (('pagina',), 1),\n", | |
" (('dante',), 1),\n", | |
" (('quieran',), 1),\n", | |
" (('cruel',), 1),\n", | |
" (('esperando',), 1),\n", | |
" (('raro',), 1),\n", | |
" (('dieran',), 1),\n", | |
" (('iguales',), 1),\n", | |
" (('representan',), 1),\n", | |
" (('agarraron',), 1),\n", | |
" (('periodistas',), 1),\n", | |
" (('principal',), 1),\n", | |
" (('difaman',), 1),\n", | |
" (('presentó',), 1),\n", | |
" (('venganza',), 1),\n", | |
" (('resagos',), 1),\n", | |
" (('combenencira',), 1),\n", | |
" (('deberle',), 1),\n", | |
" (('manos',), 1),\n", | |
" (('empiezan',), 1),\n", | |
" (('tragar',), 1),\n", | |
" (('vacías',), 1),\n", | |
" (('vieja',), 1),\n", | |
" (('precidente',), 1),\n", | |
" (('agarran',), 1),\n", | |
" (('votación',), 1),\n", | |
" (('cuánto',), 1),\n", | |
" (('demora',), 1),\n", | |
" (('creen',), 1),\n", | |
" (('matar.porque',), 1),\n", | |
" (('creible',), 1),\n", | |
" (('invertir',), 1),\n", | |
" (('juegan',), 1),\n", | |
" (('siguen',), 1),\n", | |
" (('noble',), 1),\n", | |
" (('roja',), 1),\n", | |
" (('salen',), 1),\n", | |
" (('siega',), 1),\n", | |
" (('pelan',), 1),\n", | |
" (('prinche',), 1),\n", | |
" (('conformó',), 1),\n", | |
" (('negocios',), 1),\n", | |
" (('montoya',), 1),\n", | |
" (('vinieron',), 1),\n", | |
" (('sionismo',), 1),\n", | |
" (('acusen',), 1),\n", | |
" (('calladitos',), 1),\n", | |
" (('permitiremos',), 1),\n", | |
" (('mansiones',), 1),\n", | |
" (('obra',), 1),\n", | |
" (('cortados',), 1),\n", | |
" (('vivido',), 1),\n", | |
" (('cura',), 1),\n", | |
" (('poquito',), 1),\n", | |
" (('acabado',), 1),\n", | |
" (('entró',), 1),\n", | |
" (('maldigamos',), 1),\n", | |
" (('llegaste',), 1),\n", | |
" (('ciudadano',), 1),\n", | |
" (('calles',), 1),\n", | |
" (('ademas',), 1),\n", | |
" (('paga',), 1),\n", | |
" (('muerto',), 1),\n", | |
" (('descanse',), 1),\n", | |
" (('asesino',), 1),\n", | |
" (('porquerias',), 1),\n", | |
" (('ladrón',), 1),\n", | |
" (('cobardes',), 1),\n", | |
" (('diario',), 1),\n", | |
" (('gaudy',), 1),\n", | |
" (('perdedor',), 1),\n", | |
" (('distractor',), 1),\n", | |
" (('hipocresía',), 1),\n", | |
" (('creyendo',), 1),\n", | |
" (('lambe',), 1),\n", | |
" (('traidor',), 1),\n", | |
" (('ayuda',), 1),\n", | |
" (('manitas',), 1),\n", | |
" (('redes',), 1),\n", | |
" (('efervescencia',), 1),\n", | |
" (('sres',), 1),\n", | |
" (('jinetear',), 1),\n", | |
" (('asquean',), 1),\n", | |
" (('malentes',), 1),\n", | |
" (('corajes',), 1),\n", | |
" (('operacion',), 1),\n", | |
" (('gubernatura',), 1),\n", | |
" (('finanzas',), 1),\n", | |
" (('gerardo',), 1),\n", | |
" (('sabemos',), 1),\n", | |
" (('aliado',), 1),\n", | |
" (('acabaron',), 1),\n", | |
" (('pregunto',), 1),\n", | |
" (('obvio',), 1),\n", | |
" (('ganara',), 1),\n", | |
" (('menciona',), 1),\n", | |
" (('cambiando',), 1),\n", | |
" (('segun',), 1),\n", | |
" (('espantan',), 1),\n", | |
" (('ganará',), 1),\n", | |
" (('falsas',), 1),\n", | |
" (('nota',), 1),\n", | |
" (('esencia',), 1),\n", | |
" (('hacerle',), 1),\n", | |
" (('jarochos',), 1),\n", | |
" (('café',), 1),\n", | |
" (('espectaculares',), 1),\n", | |
" (('ganó',), 1),\n", | |
" (('carlos',), 1),\n", | |
" (('toma',), 1),\n", | |
" (('incluyendo',), 1),\n", | |
" (('estúpida',), 1),\n", | |
" (('tocará',), 1),\n", | |
" (('trabajador',), 1),\n", | |
" (('encoleriza',), 1),\n", | |
" (('embarrados',), 1),\n", | |
" (('andas',), 1),\n", | |
" (('cuentos',), 1),\n", | |
" (('horas',), 1),\n", | |
" (('correcta',), 1),\n", | |
" (('vale',), 1),\n", | |
" (('sakiado',), 1),\n", | |
" (('universidad',), 1),\n", | |
" (('insulto',), 1),\n", | |
" (('verdaderos',), 1),\n", | |
" (('dejarnos',), 1),\n", | |
" (('miyules',), 1),\n", | |
" (('metan',), 1),\n", | |
" (('soborno',), 1),\n", | |
" (('coman',), 1),\n", | |
" (('paginas',), 1),\n", | |
" (('esperaba',), 1),\n", | |
" (('doctor',), 1),\n", | |
" (('apoco',), 1),\n", | |
" (('protegen',), 1),\n", | |
" (('casas',), 1),\n", | |
" (('quede',), 1),\n", | |
" (('sienpre',), 1),\n", | |
" (('hecto',), 1),\n", | |
" ...]" | |
] | |
}, | |
"execution_count": 70, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ngram_histogram_page[\"1\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"729" | |
] | |
}, | |
"execution_count": 61, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Number of entries in ngram_histogram_page[\"1\"] (the 1-gram histogram).\n", | |
"len(ngram_histogram_page[\"1\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Get top 10% of 1-grams\n", | |
"top_onegrams = [word[0][0] for word in ngram_histogram_page[\"1\"][0:72]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Find top 1-grams in 6-grams, and form a list of merged words\n", | |
"word_reference = {}\n", | |
"for word in top_onegrams:\n", | |
" for sixgram in ngrams_lists[\"6\"]:\n", | |
" if word in sixgram:\n", | |
" if word in word_reference:\n", | |
" sixgram.remove(word)\n", | |
" word_reference[word] = word_reference[word] + sixgram\n", | |
" else:\n", | |
" sixgram.remove(word)\n", | |
" word_reference[word] = sixgram" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Convert to counters the merged lists to get counts.\n", | |
"freqdist_reference = {}\n", | |
"for word in word_reference:\n", | |
" freqdist_reference[word] = Counter(word_reference[word])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 156, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"morena\n", | |
"Counter({'cuitlahuac': 6, 'verdad': 5, 'arriba': 5, 'ofender': 5, 'mera': 4, 'cambio': 4, 'suficiente': 4, 'gane': 4, 'manitas': 4, 'rateria': 4, 'oran': 4, 'apoya': 4, 'quieren': 4, 'floristeros': 3, 'hagamos': 3, 'meyoooo': 3, 'votar': 3, 'fuerza': 3, 'operacion': 3, 'chinge': 3, 'gobernatura': 3, 'deberle': 3, 'salen': 3, 'realmente': 3, 'embarrados': 3, 'cobardes': 3, 'cambie': 3, 'realidad': 3, 'estado': 2, 'duela': 2, 'pelea': 2, 'disen': 2, 'votando': 2, 'pagina': 2, 'dudo.somos': 2, 'confirma': 2, 'helizabeth': 2, 'engañar': 2, 'amlo': 2, 'útil': 2, 'aliado': 2, 'juegen': 2, 'amañadas': 2, 'vendrá': 2, 'puta': 2, 'dejamos': 2, 'alianzas': 2, 'universidad': 2, 'puro': 1, 'veracruzanos': 1, 'presidente': 1, 'opcion': 1, 'morales,': 1, 'maricelita': 1, 'next': 1, 'http': 1, 'pusieron': 1, 'rival': 1, 'ratas': 1, 'ganando': 1, 'pollito': 1, 'linares': 1, 'despierten': 1, 'madre': 1, 'presentó': 1, 'adios': 1, 'jajaj': 1, 'pelan': 1, 'aguas': 1, 'encuestas': 1, 'comentarios': 1, 'creer': 1, 'andrés': 1, 'ganar': 1, 'votemos': 1, 'debate': 1, 'distractor': 1, 'limpio': 1, 'porkeria': 1, 'rateros': 1, 'robarle': 1, 'sale': 1})\n" | |
] | |
} | |
], | |
"source": [ | |
"#Example of the count vector formed.\n", | |
"palabra = ngram_histogram_page[\"1\"][8][0][0]\n", | |
"print(\"morena\")\n", | |
"print(freqdist_reference[\"morena\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 139, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Load Sdal dictionary for pleasantness: \n", | |
"import json\n", | |
"with open('sdal.json') as data_file: \n", | |
" data = json.load(data_file)\n", | |
"sdal_words = list(data.keys())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 140, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#Library to match words in comments with words in the sdal dictionary \n", | |
"from fuzzywuzzy import process" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 161, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"basta\n", | |
"perder\n", | |
"dios\n", | |
"marrana\n", | |
"roban\n", | |
"ángel\n", | |
"arriba\n", | |
"hagan\n", | |
"pinche\n", | |
"javier\n", | |
"gobierno\n", | |
"priistas\n", | |
"partido\n", | |
"sabe\n", | |
"puto\n", | |
"votar\n", | |
"angel\n", | |
"rata\n", | |
"inseguridad\n", | |
"tiempo\n", | |
"pendejo\n", | |
"nieto\n", | |
"cambio\n", | |
"linares\n", | |
"robaron\n", | |
"diferencia\n", | |
"mamada\n", | |
"despensa\n", | |
"madre\n", | |
"engañar\n", | |
"yunes\n", | |
"ganar\n", | |
"veracruz\n", | |
"ratas\n", | |
"estado\n", | |
"encuestas\n", | |
"corrupto\n", | |
"robo\n", | |
"dinero\n", | |
"poder\n", | |
"robado\n", | |
"pendeja\n", | |
"políticos\n", | |
"mala\n", | |
"voto\n", | |
"ahogado\n", | |
"limpio\n", | |
"miguel\n", | |
"gobernador\n", | |
"duarte\n", | |
"hagamos\n", | |
"morena\n", | |
"años\n", | |
"mierda\n", | |
"puta\n", | |
"elecciones\n", | |
"pueblo\n", | |
"miedo\n", | |
"chingar\n", | |
"quieren\n", | |
"gane\n", | |
"candidato\n", | |
"gente\n", | |
"delincuencia\n", | |
"rateros\n", | |
"patadas\n", | |
"señores\n", | |
"favor\n", | |
"fidel\n", | |
"historia\n", | |
"único\n" | |
] | |
} | |
], | |
"source": [ | |
"#Get overall pleasantness as the sum of the contributions of each word ocurrence for each top 1-gram found.\n", | |
"word_feeling = {}\n", | |
"for word in freqdist_reference:\n", | |
" current_grade = 0.0\n", | |
" print(word)\n", | |
" for subword in freqdist_reference[word]:\n", | |
" match = process.extractOne(subword,sdal_words)\n", | |
" if match is not None:\n", | |
" current_grade = current_grade + float(data[match[0]]['pleasantness'])*(float(freqdist_reference[word][subword])/len( word_reference[word]))\n", | |
" word_feeling[word] =current_grade" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 170, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.32982105263157896" | |
] | |
}, | |
"execution_count": 170, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Inspect the aggregated pleasantness score stored for the 1-gram \"mala\".\n", | |
"word_feeling[\"mala\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment