Created
January 20, 2022 09:55
-
-
Save ryanbateman/cae795c3d4ee4d7aa7d2cbc4109af0aa to your computer and use it in GitHub Desktop.
DDG Jupyter notebook for exploration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "235a092e", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#importing required libraries (duplicate 'import spacy' and 'pyplot as plt' lines removed)\n", | |
"from google_play_scraper import app, Sort, reviews_all\n", | |
"from os.path import exists\n", | |
"import json\n", | |
"import re\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"from pprint import pprint\n", | |
"import gensim\n", | |
"import gensim.corpora as corpora\n", | |
"from gensim.utils import simple_preprocess\n", | |
"from gensim.models import CoherenceModel\n", | |
"import nltk\n", | |
"from nltk import FreqDist\n", | |
"from nltk.corpus import stopwords\n", | |
"import spacy\n", | |
"import pyLDAvis\n", | |
"import pyLDAvis.gensim_models\n", | |
"import matplotlib.pyplot as plt\n", | |
"import matplotlib.colors as mcolors\n", | |
"from wordcloud import WordCloud, STOPWORDS\n", | |
"\n", | |
"#only lemmas/POS tags are needed, so the parser and NER pipes are disabled for speed\n", | |
"nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n", | |
"\n", | |
"#importing the Stopwords to use them, tidying a little and including some domain/superfluous stuff\n", | |
"stop_words = stopwords.words('english')\n", | |
"stop_words.extend(['duckduckgo', 'duck', 'go', 'duckduck', 'browser', 'ddg', 'app', 'good', 'great', \n", | |
"                   'search', 'engine', 'use', 'nice', 'chrome', ''])\n", | |
"\n", | |
"#downloading the data, storing it in text file (also used by the R script)\n", | |
"if not exists('reviews.txt'):\n", | |
"    result = reviews_all('com.duckduckgo.mobile.android',\n", | |
"                         sleep_milliseconds=0,\n", | |
"                         lang='en', \n", | |
"                         country='us'\n", | |
"                        )\n", | |
"    #context manager guarantees the file is closed even if json.dump raises;\n", | |
"    #explicit utf-8 so emoji/non-latin review text survives on any platform\n", | |
"    #(json.dump returns None, so the old 'hopeful_json = ...' assignment was dropped)\n", | |
"    with open('reviews.txt', 'w', encoding='utf-8') as file:\n", | |
"        json.dump(result, file, indent=4, sort_keys=True, default=str)\n", | |
"\n", | |
"with open(\"reviews.txt\", \"r\", encoding=\"utf-8\") as read_file:\n", | |
"    review_json = json.load(read_file)\n", | |
"    #keep only the review text, skipping entries whose content is null/non-string\n", | |
"    reviews_content = [ review['content'] for review in review_json if type(review['content']) == str ] \n", | |
"\n", | |
"data = [r.lower() for r in reviews_content]\n", | |
"print(\"Approximate number of reviews: \", len(data))\n", | |
" \n", | |
"#cleaning the text \n", | |
"def tokeniz(sentences):\n", | |
"    #generator yielding one token list per review; deacc=True strips accents\n", | |
"    for sentence in sentences:\n", | |
"        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n", | |
"processed_data = list(tokeniz(data))\n", | |
"\n", | |
"#Building bigram & trigram Models\n", | |
"bigram = gensim.models.Phrases(processed_data, min_count=5, threshold=50)\n", | |
"trigram = gensim.models.Phrases(bigram[processed_data], threshold=50)\n", | |
"#Phraser freezes the phrase models into a lighter, faster lookup form\n", | |
"bigram_mod = gensim.models.phrases.Phraser(bigram)\n", | |
"trigram_mod = gensim.models.phrases.Phraser(trigram)\n", | |
"\n", | |
"#function to filter out stopwords\n", | |
"def remove_stopwords(texts):\n", | |
"    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]\n", | |
"\n", | |
"#function to create bigrams\n", | |
"def create_bigrams(texts):\n", | |
"    return [bigram_mod[doc] for doc in texts]\n", | |
"\n", | |
"#function to create trigrams\n", | |
"def create_trigrams(texts):\n", | |
"    #bug fix: the original body was missing 'return', so it always returned None\n", | |
"    return [trigram_mod[bigram_mod[doc]] for doc in texts]\n", | |
"\n", | |
"#lemmatize each token list with spaCy, keeping only the requested POS tags\n", | |
"def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):\n", | |
"    lemmatized = []\n", | |
"    for words in texts:\n", | |
"        parsed = nlp(\" \".join(words))\n", | |
"        lemmatized.append([token.lemma_ for token in parsed if token.pos_ in allowed_postags])\n", | |
"    return lemmatized\n", | |
"\n", | |
"#removing stopwords, creating bigrams and lemmatizing the text\n", | |
"data_wo_stopwords = remove_stopwords(processed_data)\n", | |
"data_bigrams = create_bigrams(data_wo_stopwords)\n", | |
"data_lemmatized = lemmatize(data_bigrams, allowed_postags=[ 'NOUN', 'ADJ'])\n", | |
"#(removed a stray debug print that referenced 'texts' before it was defined,\n", | |
"# which crashed the cell with a NameError on a fresh kernel)\n", | |
"\n", | |
"#printing the lemmatized data\n", | |
"print(data_lemmatized[:3])\n", | |
"\n", | |
"#creating a dictionary\n", | |
"gensim_dictionary = corpora.Dictionary(data_lemmatized)\n", | |
"\n", | |
"texts = data_lemmatized\n", | |
"\n", | |
"#building a corpus for the topic model\n", | |
"gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]\n", | |
"\n", | |
"#printing the corpus we created above.\n", | |
"print(gensim_corpus[:3]) \n", | |
"\n", | |
"#we can print the words with their frequencies.\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "fe541fbf", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"#creating the LDA model (100 passes produced the cleanest result but took forever)\n", | |
"lda_model = gensim.models.ldamodel.LdaModel(\n", | |
"    corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=10, random_state=100, chunksize=100, passes=10)\n", | |
"\n", | |
"#calculating the coherence\n", | |
"#c_v coherence scores the topics against the lemmatized texts; closer to 1 is better\n", | |
"coherence_model_lda = CoherenceModel(\n", | |
"    model=lda_model, texts=data_lemmatized, dictionary=gensim_dictionary, coherence='c_v')\n", | |
"coherence_lda = coherence_model_lda.get_coherence()\n", | |
"\n", | |
"# Display model fit and coherence (high coherence is good).\n", | |
"# NOTE(review): log_perplexity returns a per-word log-likelihood bound, where\n", | |
"# higher (less negative) is better — the 'low is good' rule applies to raw\n", | |
"# perplexity, not to this printed value.\n", | |
"print('\\nPerplexity: ', lda_model.log_perplexity(gensim_corpus))\n", | |
"print('\\nCoherence Score: ', coherence_lda)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "b55aaf5f", | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Show the interactive pyLDAvis topic visualization inline in the notebook\n", | |
"pyLDAvis.enable_notebook()\n", | |
"#mds='mmds' selects metric MDS for projecting the topics into 2-D\n", | |
"visualization = pyLDAvis.gensim_models.prepare(lda_model, gensim_corpus, gensim_dictionary, mds='mmds')\n", | |
"visualization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "82b4ee84", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# For fun/easier understanding, print out some wordclouds\n", | |
"cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS'\n", | |
"#color_func closes over the loop variable 'i' below, so each topic is drawn in one colour\n", | |
"cloud = WordCloud(stopwords=stop_words,\n", | |
"                  background_color='white',\n", | |
"                  width=2500,\n", | |
"                  height=1800,\n", | |
"                  max_words=10,\n", | |
"                  colormap='tab10',\n", | |
"                  color_func=lambda *args, **kwargs: cols[i],\n", | |
"                  prefer_horizontal=1.0)\n", | |
"\n", | |
"topics = lda_model.show_topics(formatted=False)\n", | |
"print(len(topics))\n", | |
"\n", | |
"#10 topics -> a 5x2 grid, one axes per topic\n", | |
"fig, axes = plt.subplots(5, 2, figsize=(10,10), sharex=True, sharey=True)\n", | |
"\n", | |
"for i, ax in enumerate(axes.flatten()):\n", | |
"    #draw on each axes handle directly instead of re-adding it via the\n", | |
"    #deprecated fig.add_subplot(ax) and reading it back through plt.gca()\n", | |
"    topic_words = dict(topics[i][1])\n", | |
"    cloud.generate_from_frequencies(topic_words, max_font_size=300)\n", | |
"    ax.imshow(cloud)\n", | |
"    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))\n", | |
"    ax.axis('off')\n", | |
"\n", | |
"plt.subplots_adjust(wspace=0, hspace=0)\n", | |
"plt.margins(x=0, y=0)\n", | |
"plt.tight_layout()\n", | |
"plt.show()\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment