# import libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess

# cast tweets to a numpy array; df is assumed to hold the preprocessed tweets,
# with each tweet_text entry already tokenized into a list of words
docs = df.tweet_text.to_numpy()

# create dictionary of all words in all documents
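# A minimal sketch of the steps the comment above sets up, assuming docs holds
# token lists: build the gensim dictionary, derive a bag-of-words corpus for the
# coherence scoring further down, and fit GSDMM. K, alpha, beta and n_iters are
# illustrative values, not tuned ones.
dictionary = gensim.corpora.Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
vocab_length = len(dictionary)

# fit GSDMM (Gibbs Sampling Dirichlet Multinomial Mixture) to the tweets
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.1, n_iters=30)
y = gsdmm.fit(docs, vocab_length)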
# print the number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic:', doc_count)

# topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
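    '''
    Hedged completion of the truncated body: print the `values` highest-weight
    words for each cluster index in `top_cluster`.
    '''
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(),
                            key=lambda k: k[1], reverse=True)[:values]
        print(f'\nCluster {cluster}: {sort_dicts}')

# show the top 20 words of the 15 largest clusters
top_words(gsdmm.cluster_word_distribution, top_index, 20)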
# import coherence model from gensim
from gensim.models import CoherenceModel

# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance
    top_clusters: numpy array containing indices of the top clusters
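    n_words: number of top words per topic to keep
    '''
    # Hedged completion of the truncated body: collect the n_words
    # highest-weight words of each selected cluster.
    topics = []
    for cluster in top_clusters:
        sorted_words = sorted(model.cluster_word_distribution[cluster].items(),
                              key=lambda k: k[1], reverse=True)[:n_words]
        topics.append([word for word, _ in sorted_words])
    return topics

# Sketch of scoring GSDMM with CoherenceModel via its topics= argument;
# dictionary, bow_corpus and docs are assumed from the earlier steps.
topics = get_topics_lists(gsdmm, top_index, 20)
cm_gsdmm = CoherenceModel(topics=topics, dictionary=dictionary, corpus=bow_corpus,
                          texts=docs, coherence='c_v')
print('GSDMM coherence:', cm_gsdmm.get_coherence())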
# import wordcloud library
from wordcloud import WordCloud

# get topic word distributions from the fitted gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution

# select the topic to plot (topic_number) and how many words to keep (values)
topic_dict = sorted(cluster_word_distribution[topic_number].items(),
                    key=lambda k: k[1], reverse=True)[:values]

# generate a word cloud image
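# Hedged sketch of the plotting step: WordCloud.generate_from_frequencies takes
# a word -> weight mapping, so the sorted (word, count) pairs are cast to a dict.
# The styling options and output filename are illustrative.
wordcloud = WordCloud(background_color='white', width=800, height=400)
wordcloud.generate_from_frequencies(dict(topic_dict))
wordcloud.to_file(f'topic_{topic_number}.png')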
# import coherence model from gensim
from gensim.models import CoherenceModel

# instantiate topic coherence model for the LDA baseline
# (lda_model_15 is assumed to be a 15-topic LdaModel trained earlier)
cm = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v')

# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)
# ...spin up cluster...connect Dask...etc.
# use client.submit() to write large parquet files to S3 (to avoid blosc issues on M1)
def submit_jobs():
    import dask.datasets  # imported inside the task so the worker has it
    from distributed import get_client

    with get_client() as client:
        large = dask.datasets.timeseries(start="2000", end="2015", freq="10s", partition_freq="1M")
        large.to_parquet(
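            # Hedged completion: the destination URI and options below are
            # placeholders, not the original values.
            "s3://my-bucket/large-timeseries/",
            engine="pyarrow",
            compression="snappy",
        )

# run the write on the cluster instead of the local M1 machine; `client` is
# assumed to be the Client created in the elided setup above
future = client.submit(submit_jobs)
future.result()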
import pandas as pd
import geopandas as gpd
from geopandas import points_from_xy
from shapely.geometry import Point

# read in raw taxi data (reading s3:// paths with pandas requires s3fs)
df = pd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2012-01.csv",
    nrows=100_000,
)
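# Hedged sketch of the geospatial step the imports above set up: build a
# GeoDataFrame from the pickup coordinates. The column names follow the 2012
# yellow-taxi schema, and the WGS84 CRS is an assumption.
gdf = gpd.GeoDataFrame(
    df,
    geometry=points_from_xy(df.pickup_longitude, df.pickup_latitude),
    crs="EPSG:4326",
)
print(gdf.geometry.head())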
import json
from pathlib import Path

import duckdb
import kaggle
from loguru import logger
from tqdm import tqdm

# In order to access this data, you must create a Kaggle account and obtain an API key.
# You can obtain a key by clicking on your icon on the upper right of the homepage,
# opening your account settings, and clicking "Create New API Token"; the kaggle
# package then reads the downloaded kaggle.json from ~/.kaggle/.
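# Hedged sketch of the download-and-load flow these imports suggest; the dataset
# slug, local directory, and table layout are hypothetical placeholders.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

kaggle.api.authenticate()  # reads the API key from ~/.kaggle/kaggle.json
kaggle.api.dataset_download_files("owner/dataset-slug", path=data_dir, unzip=True)

con = duckdb.connect("local.duckdb")
for csv_path in tqdm(sorted(data_dir.glob("*.csv"))):
    table = csv_path.stem
    con.execute(f"CREATE OR REPLACE TABLE {table} AS SELECT * FROM read_csv_auto('{csv_path}')")
    logger.info(f"loaded {table} from {csv_path}")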