# import libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess

# cast tweets to a numpy array; df is assumed to hold the preprocessed tweets,
# with each tweet_text entry already tokenized into a list of words
docs = df.tweet_text.to_numpy()

# create dictionary of all words in all documents
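# A minimal sketch of the steps the comment above sets up, assuming docs holds
# token lists: build the gensim dictionary, derive a bag-of-words corpus for the
# coherence scoring further down, and fit GSDMM. K, alpha, beta and n_iters are
# illustrative values, not tuned ones.
dictionary = gensim.corpora.Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
vocab_length = len(dictionary)

# fit GSDMM (Gibbs Sampling Dirichlet Multinomial Mixture) to the tweets
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.1, n_iters=30)
y = gsdmm.fit(docs, vocab_length)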
# print the number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic:', doc_count)

# topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
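    '''
    Hedged completion of the truncated body: print the `values` highest-weight
    words for each cluster index in `top_cluster`.
    '''
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(),
                            key=lambda k: k[1], reverse=True)[:values]
        print(f'\nCluster {cluster}: {sort_dicts}')

# show the top 20 words of the 15 largest clusters
top_words(gsdmm.cluster_word_distribution, top_index, 20)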
# import coherence model from gensim
from gensim.models import CoherenceModel

# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance
    top_clusters: numpy array containing indices of the top clusters
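    n_words: number of top words per topic to keep
    '''
    # Hedged completion of the truncated body: collect the n_words
    # highest-weight words of each selected cluster.
    topics = []
    for cluster in top_clusters:
        sorted_words = sorted(model.cluster_word_distribution[cluster].items(),
                              key=lambda k: k[1], reverse=True)[:n_words]
        topics.append([word for word, _ in sorted_words])
    return topics

# Sketch of scoring GSDMM with CoherenceModel via its topics= argument;
# dictionary, bow_corpus and docs are assumed from the earlier steps.
topics = get_topics_lists(gsdmm, top_index, 20)
cm_gsdmm = CoherenceModel(topics=topics, dictionary=dictionary, corpus=bow_corpus,
                          texts=docs, coherence='c_v')
print('GSDMM coherence:', cm_gsdmm.get_coherence())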
# import wordcloud library
from wordcloud import WordCloud

# get topic word distributions from the fitted gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution

# select the topic to plot (topic_number) and how many words to keep (values)
topic_dict = sorted(cluster_word_distribution[topic_number].items(),
                    key=lambda k: k[1], reverse=True)[:values]

# generate a word cloud image
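# Hedged sketch of the plotting step: WordCloud.generate_from_frequencies takes
# a word -> weight mapping, so the sorted (word, count) pairs are cast to a dict.
# The styling options and output filename are illustrative.
wordcloud = WordCloud(background_color='white', width=800, height=400)
wordcloud.generate_from_frequencies(dict(topic_dict))
wordcloud.to_file(f'topic_{topic_number}.png')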
# import coherence model from gensim
from gensim.models import CoherenceModel

# instantiate topic coherence model for the LDA baseline
# (lda_model_15 is assumed to be a 15-topic LdaModel trained earlier)
cm = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v')

# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)
# ...spin up cluster...connect Dask...etc.
# use client.submit() to write large parquet files to S3 (to avoid blosc issues on M1)
def submit_jobs():
    import dask.datasets  # imported inside the task so the worker has it
    from distributed import get_client

    with get_client() as client:
        large = dask.datasets.timeseries(start="2000", end="2015", freq="10s", partition_freq="1M")
        large.to_parquet(
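            # Hedged completion: the destination URI and options below are
            # placeholders, not the original values.
            "s3://my-bucket/large-timeseries/",
            engine="pyarrow",
            compression="snappy",
        )

# run the write on the cluster instead of the local M1 machine; `client` is
# assumed to be the Client created in the elided setup above
future = client.submit(submit_jobs)
future.result()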
import pandas as pd
import geopandas as gpd
from geopandas import points_from_xy
from shapely.geometry import Point

# read in raw taxi data (reading s3:// paths with pandas requires s3fs)
df = pd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2012-01.csv",
    nrows=100_000,
)
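# Hedged sketch of the geospatial step the imports above set up: build a
# GeoDataFrame from the pickup coordinates. The column names follow the 2012
# yellow-taxi schema, and the WGS84 CRS is an assumption.
gdf = gpd.GeoDataFrame(
    df,
    geometry=points_from_xy(df.pickup_longitude, df.pickup_latitude),
    crs="EPSG:4326",
)
print(gdf.geometry.head())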
import json
from pathlib import Path

import duckdb
import kaggle
from loguru import logger
from tqdm import tqdm

# In order to access this data, you must create a Kaggle account and obtain an API key.
# You can obtain a key by clicking on your icon on the upper right of the homepage,
# opening your account settings, and clicking "Create New API Token"; the kaggle
# package then reads the downloaded kaggle.json from ~/.kaggle/.
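# Hedged sketch of the download-and-load flow these imports suggest; the dataset
# slug, local directory, and table layout are hypothetical placeholders.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

kaggle.api.authenticate()  # reads the API key from ~/.kaggle/kaggle.json
kaggle.api.dataset_download_files("owner/dataset-slug", path=data_dir, unzip=True)

con = duckdb.connect("local.duckdb")
for csv_path in tqdm(sorted(data_dir.glob("*.csv"))):
    table = csv_path.stem
    con.execute(f"CREATE OR REPLACE TABLE {table} AS SELECT * FROM read_csv_auto('{csv_path}')")
    logger.info(f"loaded {table} from {csv_path}")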