
avriiil / generate_butterflies.csv
Created January 27, 2025 11:40
Generate butterfly data in CSV
# This gist generates a specified number of CSV files
# Each CSV file contains a varying number of rows about butterflies
import os
import random
import pandas as pd
output_dir = "your/output/directory"
# Step 1: Generate Butterfly Species Data
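# (preview truncated here; below is a hedged sketch of how generation might
# continue: the species names, columns, and file count are all assumptions)
species = ["Monarch", "Swallowtail", "Painted Lady", "Red Admiral"]
os.makedirs(output_dir, exist_ok=True)
for i in range(10):  # assumed number of CSV files to generate
    n_rows = random.randint(100, 1_000)  # varying number of rows per file
    df = pd.DataFrame({
        "species": [random.choice(species) for _ in range(n_rows)],
        "wingspan_cm": [round(random.uniform(3.0, 12.0), 1) for _ in range(n_rows)],
    })
    df.to_csv(os.path.join(output_dir, f"butterflies_{i}.csv"), index=False)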
@avriiil
avriiil / download-kaggle.py
Created November 14, 2022 15:19
Download Kaggle NYC bike data for Dask benchmarking
import json
from pathlib import Path
import duckdb
import kaggle
from loguru import logger
from tqdm import tqdm
# In order to access this data, you must create a Kaggle account and obtain an API key.
# You can obtain a key by clicking on your icon on the upper right of the homepage,
# then opening Settings and choosing "Create New Token", which downloads kaggle.json.
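# A hedged sketch of the download step; the dataset slug and output directory
# below are placeholders, not the gist's actual values.
data_dir = Path("./data")
data_dir.mkdir(exist_ok=True)
kaggle.api.authenticate()  # reads credentials from ~/.kaggle/kaggle.json
kaggle.api.dataset_download_files(
    "owner/nyc-bike-share",  # placeholder slug for the NYC bike dataset
    path=str(data_dir),
    unzip=True,
)
logger.info("Downloaded Kaggle data to {}", data_dir)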
avriiil / geopandas-explore.py
Created May 11, 2022 10:39
Create Interactive Maps with GeoPandas
import pandas as pd
import geopandas as gpd
from geopandas import points_from_xy
from shapely.geometry import Point
# read in raw taxi data
df = pd.read_csv(
"s3://nyc-tlc/trip data/yellow_tripdata_2012-01.csv",
nrows=100_000,
)
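The preview ends after loading the data. A minimal sketch of the interactive-map step, assuming the 2012 schema exposes pickup_longitude and pickup_latitude columns:
# turn pickup coordinates into point geometries
gdf = gpd.GeoDataFrame(
    df,
    geometry=points_from_xy(df.pickup_longitude, df.pickup_latitude),
    crs="EPSG:4326",
)
# render an interactive folium map (requires the folium package)
gdf.explore()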
avriiil / write_large_parquet_m1
Created January 18, 2022 10:48
Gist to write large parquet files to S3 on M1 (avoid blosc issues)
# ...spin up cluster...connect Dask...etc.
# use client.submit() to write large parquet files to S3 (to avoid blosc issues on M1)
def submit_jobs():
    from distributed import get_client

    with get_client() as client:
        large = dask.datasets.timeseries(start="2000", end="2015", freq="10s", partition_freq="1M")
        large.to_parquet(
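            # (preview truncated here) hedged completion; the S3 path is a placeholder
            "s3://your-bucket/large-parquet/",
        )

# run the whole write as a single task on the cluster, so blosc-compressed
# serialization happens on the workers rather than on the local M1 machine;
# assumes `client` is the Client created in the elided setup above
future = client.submit(submit_jobs)
future.result()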
# import library from gensim
from gensim.models import CoherenceModel
# instantiate topic coherence model
cm = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v')
# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)
# Import wordcloud library
from wordcloud import WordCloud
# Get topic word distributions from gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution
# Select the topic to plot (topic_number) and how many of its top words to keep (values)
topic_dict = sorted(cluster_word_distribution[topic_number].items(), key=lambda k: k[1], reverse=True)[:values]
# Generate a word cloud image
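# hedged continuation of the truncated preview: generate_from_frequencies
# expects a dict, so convert the sorted (word, count) pairs back into one
import matplotlib.pyplot as plt
wordcloud = WordCloud(background_color="white").generate_from_frequencies(dict(topic_dict))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()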
# import library from gensim
from gensim.models import CoherenceModel
# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance
    top_clusters: numpy array containing indices of top_clusters
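    n_words: number of top words per topic to return
    '''
    # hedged completion of the truncated preview: collect the n_words
    # highest-count words from each cluster's word distribution
    topics = []
    for cluster in top_clusters:
        sorted_words = sorted(
            model.cluster_word_distribution[cluster].items(),
            key=lambda k: k[1],
            reverse=True,
        )[:n_words]
        topics.append([word for word, _ in sorted_words])
    return topics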
import numpy as np
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic:', doc_count)
# topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
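    # hedged completion of the truncated preview: print the `values`
    # highest-count words for each cluster index in top_cluster
    for cluster in top_cluster:
        sorted_dict = sorted(
            cluster_word_distribution[cluster].items(),
            key=lambda k: k[1],
            reverse=True,
        )[:values]
        print("\nCluster %s : %s" % (cluster, sorted_dict))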