We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This gist generates a specified number of CSV files | |
# Each CSV file contains a varying number of rows about butterflies | |
import os | |
import random | |
import pandas as pd | |
output_dir = "your/output/directory" | |
# Step 1: Generate Butterfly Species Data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from pathlib import Path | |
import duckdb | |
import kaggle | |
from loguru import logger | |
from tqdm import tqdm | |
# In order to access this data, you must create a Kaggle account and obtain an API key. | |
# You can obtain a key by clicking on your icon on the upper right of the homepage, |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import geopandas as gpd | |
from geopandas import points_from_xy | |
from shapely.geometry import Point | |
# read in raw taxi data | |
df = pd.read_csv( | |
"s3://nyc-tlc/trip data/yellow_tripdata_2012-01.csv", | |
nrows=100_000, | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ...spin up cluster...connect Dask...etc. | |
# use client.submit() to write large parquet files to S3 (to avoid blosc issues on M1) | |
def submit_jobs(): | |
from distributed import get_client | |
with get_client() as client: | |
large = dask.datasets.timeseries(start="2000", end="2015", freq="10s", partition_freq="1M") | |
large.to_parquet( |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import library from gensim | |
from gensim.models import CoherenceModel | |
# instantiate topic coherence model | |
cm = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v') | |
# get topic coherence score | |
coherence_lda = cm.get_coherence() | |
print(coherence_lda) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import wordcloud library | |
from wordcloud import WordCloud | |
# Get topic word distributions from gsdmm model | |
cluster_word_distribution = gsdmm.cluster_word_distribution | |
# Select topic you want to output as dictionary (using topic_number) | |
topic_dict = sorted(cluster_word_distribution[topic_number].items(), key=lambda k: k[1], reverse=True)[:values] | |
# Generate a word cloud image |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import library from gensim | |
from gensim.models import CoherenceModel | |
# define function to get words in topics | |
def get_topics_lists(model, top_clusters, n_words): | |
''' | |
Gets lists of words in topics as a list of lists. | |
model: gsdmm instance | |
top_clusters: numpy array containing indices of top_clusters |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# print number of documents per topic | |
doc_count = np.array(gsdmm.cluster_doc_count) | |
print('Number of documents per topic :', doc_count) | |
# Topics sorted by the number of document they are allocated to | |
top_index = doc_count.argsort()[-15:][::-1] | |
print('Most important clusters (by number of docs inside):', top_index) | |
# define function to get top words per topic | |
def top_words(cluster_word_distribution, top_cluster, values): |
NewerOlder