@ryanbateman
Created January 20, 2022 09:55
DDG Jupyter notebook for exploration
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "235a092e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#importing required libraries\n",
"from google_play_scraper import app, Sort, reviews_all\n",
"from os.path import exists\n",
"import json\n",
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"from pprint import pprint\n",
"import gensim\n",
"import gensim.corpora as corpora\n",
"from gensim.utils import simple_preprocess\n",
"from nltk.corpus import stopwords\n",
"from gensim.models import CoherenceModel\n",
"import spacy\n",
"import pyLDAvis\n",
"import pyLDAvis.gensim_models\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"from nltk import FreqDist\n",
"from matplotlib import pyplot as plt\n",
"from wordcloud import WordCloud, STOPWORDS\n",
"import matplotlib.colors as mcolors\n",
"import spacy\n",
"\n",
"nlp=spacy.load('en_core_web_sm',disable=['parser', 'ner'])\n",
"\n",
"#importing the Stopwords to use them, tidying a little and including some domain/superfluous stuff\n",
"stop_words = stopwords.words('english')\n",
"stop_words.extend(['duckduckgo', 'duck', 'go', 'duckduck', 'browser', 'ddg', 'app', 'good', 'great', \n",
" 'search', 'engine', 'use', 'nice', 'chrome', ''])\n",
"\n",
"#downloading the data, storing it in text file (also used by the R script)\n",
"if not exists('reviews.txt'):\n",
" result = reviews_all('com.duckduckgo.mobile.android',\n",
" sleep_milliseconds=0,\n",
" lang='en', \n",
" country='us'\n",
" )\n",
" file = open('reviews.txt', 'w')\n",
" hopeful_json = json.dump(result, file, indent=4, sort_keys=True, default=str)\n",
" file.close()\n",
" \n",
"with open(\"reviews.txt\", \"r\") as read_file:\n",
" review_json = json.load(read_file)\n",
" # Concatenate reviews, strip some punctuation \n",
" reviews_content = [ review['content'] for review in review_json if type(review['content']) == str ] \n",
"\n",
"data = [r.lower() for r in reviews_content]\n",
"print(\"Approximate number of reviews: \", len(data))\n",
" \n",
"#cleaning the text \n",
"def tokeniz(sentences):\n",
" for sentence in sentences:\n",
" yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n",
"processed_data = list(tokeniz(data))\n",
"\n",
"#Building bigram & trigram Models\n",
"bigram = gensim.models.Phrases(processed_data, min_count=5, threshold=50)\n",
"trigram = gensim.models.Phrases(bigram[processed_data], threshold=50)\n",
"bigram_mod = gensim.models.phrases.Phraser(bigram)\n",
"trigram_mod = gensim.models.phrases.Phraser(trigram)\n",
"\n",
"#function to filter out stopwords\n",
"def remove_stopwords(texts):\n",
" return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]\n",
"\n",
"#function to create bigrams\n",
"def create_bigrams(texts):\n",
" return [bigram_mod[doc] for doc in texts]\n",
"\n",
"#function to create trigrams\n",
"def create_trigrams(texts):\n",
" [trigram_mod[bigram_mod[doc]] for doc in texts]\n",
"\n",
"#function for lemmatization\n",
"def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):\n",
" texts_op = []\n",
" for sent in texts:\n",
" doc = nlp(\" \".join(sent))\n",
" texts_op.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
" return texts_op\n",
"\n",
"#removing stopwords, creating bigrams and lemmatizing the text\n",
"data_wo_stopwords = remove_stopwords(processed_data)\n",
"data_bigrams = create_bigrams(data_wo_stopwords)\n",
"data_lemmatized = lemmatize(data_bigrams, allowed_postags=[ 'NOUN', 'ADJ'])\n",
"print(trigram_mod[bigram_mod[texts]])\n",
"\n",
"#printing the lemmatized data\n",
"print(data_lemmatized[:3])\n",
"\n",
"#creating a dictionary\n",
"gensim_dictionary = corpora.Dictionary(data_lemmatized)\n",
"\n",
"texts = data_lemmatized\n",
"\n",
"#building a corpus for the topic model\n",
"gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]\n",
"\n",
"#printing the corpus we created above.\n",
"print(gensim_corpus[:3]) \n",
"\n",
"#we can print the words with their frequencies.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe541fbf",
"metadata": {},
"outputs": [],
"source": [
"\n",
"#creating the LDA model (100 passes produced the cleanest result but took forever)\n",
"lda_model = gensim.models.ldamodel.LdaModel(\n",
" corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=10, random_state=100, chunksize=100, passes=10)\n",
"\n",
"#calculating the coherence\n",
"coherence_model_lda = CoherenceModel(\n",
" model=lda_model, texts=data_lemmatized, dictionary=gensim_dictionary, coherence='c_v')\n",
"coherence_lda = coherence_model_lda.get_coherence()\n",
"\n",
"# Display Perplexity (low is good) and coherence (high is good)\n",
"print('\\nPerplexity: ', lda_model.log_perplexity(gensim_corpus))\n",
"print('\\nCoherence Score: ', coherence_lda)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b55aaf5f",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Show the notebook\n",
"pyLDAvis.enable_notebook()\n",
"visualization = pyLDAvis.gensim_models.prepare(lda_model, gensim_corpus, gensim_dictionary, mds='mmds')\n",
"visualization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82b4ee84",
"metadata": {},
"outputs": [],
"source": [
"# For fun/easier understanding, print out some wordclouds\n",
"cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS'\n",
"cloud = WordCloud(stopwords=stop_words,\n",
" background_color='white',\n",
" width=2500,\n",
" height=1800,\n",
" max_words=10,\n",
" colormap='tab10',\n",
" color_func=lambda *args, **kwargs: cols[i],\n",
" prefer_horizontal=1.0)\n",
"\n",
"topics = lda_model.show_topics(formatted=False)\n",
"print(len(topics))\n",
"\n",
"fig, axes = plt.subplots(5, 2, figsize=(10,10), sharex=True, sharey=True)\n",
"\n",
"for i, ax in enumerate(axes.flatten()):\n",
" fig.add_subplot(ax)\n",
" topic_words = dict(topics[i][1])\n",
" cloud.generate_from_frequencies(topic_words, max_font_size=300)\n",
" plt.gca().imshow(cloud)\n",
" plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))\n",
" plt.gca().axis('off')\n",
"\n",
"\n",
"plt.subplots_adjust(wspace=0, hspace=0)\n",
"plt.axis('off')\n",
"plt.margins(x=0, y=0)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}