Skip to content

Instantly share code, notes, and snippets.

View do-me's full-sized avatar

Dominik Weckmüller do-me

View GitHub Profile
@do-me
do-me / cosine_similarity.py
Last active October 22, 2024 09:13
Quick cosine similarity with numpy & query with pandas
from numpy.linalg import norm
def cos_sim(a, b):
    """Return the cosine similarity of two embeddings.

    Formula from https://huggingface.co/jinaai/jina-embeddings-v2-base-en

    Args:
        a: NumPy array holding the first embedding.
        b: NumPy array holding the second embedding; must be compatible
            with ``a`` for the product ``a @ b.T``.

    Returns:
        ``(a @ b.T) / (norm(a) * norm(b))``.

    Note:
        There is no guard against zero-norm inputs; the division is
        performed as-is.
    """
    return (a @ b.T) / (norm(a) * norm(b))
# Free-text search query that the rows are ranked against
query = "social democracy"
# Embed the query; `model` must already be defined (it is not created in this snippet)
quer_emb = model.encode(query)
# Score each row: cosine similarity between its stored embedding and the query embedding
df["cos_sim"] = df["embeddings"].apply(lambda x: cos_sim(x, quer_emb))
# Most similar rows first
df = df.sort_values("cos_sim", ascending=False)
##################################################################################################
# 2x faster for 350k rows
@do-me
do-me / scatter_animation.py
Created October 19, 2024 18:03
Scatterplot animation with matplotlib in Jupyter
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.animation as animation
from matplotlib.animation import PillowWriter
import numpy as np
from IPython.display import HTML
# Example data for two states
state1_x = np.random.rand(10) # x-coordinates for state 1
Country
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Australia
@do-me
do-me / extract_markdown_table.py
Created October 13, 2024 12:12
Extract markdown table from arbitrary markdown text based on regex
import pandas as pd
import re
def extract_markdown_table(text):
"""
Extracts a markdown table from a string, removing other markdown elements.
Args:
text: The input string containing markdown.
@do-me
do-me / wiki.sh
Last active October 10, 2024 11:03
Personal wiki with bash & zsh: searches a directory of .txt files and includes a command for creating a new note
wiki() {
# Combine arguments into a single string for multi-word search
search_string="$*"
# Perform case-insensitive grep search with the combined string
grep -Hni --color=always "$search_string" /Users/dome/work/wikifiles/*.txt | awk -F':' '
BEGIN {
prevfile=""
}
{
@do-me
do-me / gemini_summary.py
Created October 9, 2024 07:52
Gemini Flash 1.5 Summary logic with retry (free plan)
import google.generativeai as genai
import pandas as pd
# Load the dataset from the JSON file at this URL into a DataFrame
df = pd.read_json("https://github.com/do-me/copernicus-services-semantic-search/raw/refs/heads/main/copernicus_services_embeddings.json.gz")
# ignoring the cleaning of the dataset for brevity
# Placeholder value: substitute a real API key before running
GOOGLE_API_KEY= "YOUR_KEY"
# Register the key with the google.generativeai client
genai.configure(api_key=GOOGLE_API_KEY)
# Model handle for "gemini-1.5-flash"
model = genai.GenerativeModel("gemini-1.5-flash")
@do-me
do-me / eo_pubsy.py
Created October 8, 2024 14:24
Earth Observation Embedding from selected JRC publications (BAAI/bge-base-en-v1.5)
Earth_Observation_Pubsy = [0.0023605350870639086,-0.03585183620452881,-0.0018838586984202266,-0.0066082351841032505,0.03577606752514839,0.007964790798723698,0.023150762543082237,0.03316942974925041,-0.038895998150110245,-0.04117076098918915,-0.03140062466263771,-0.017644666135311127,-0.05881122127175331,0.01922798343002796,-0.001551413326524198,0.04579007625579834,0.02461058646440506,0.006413688883185387,0.003569109132513404,0.029188191518187523,-0.008217660710215569,-0.009149713441729546,0.015580502338707447,0.02944401651620865,0.009927663952112198,-0.02080441080033779,0.0313025526702404,0.035126153379678726,-0.03328511863946915,0.0006073070107959211,0.025256695225834846,-0.0033638938330113888,-0.021389279514551163,-0.0021468251943588257,0.009579457342624664,0.012051025405526161,-0.0401134267449379,-0.010880139656364918,-0.038161613047122955,-0.015132302418351173,-0.026435792446136475,-0.002597113372758031,-0.021558517590165138,-0.00289620878174901,-0.023958338424563408,0.015574358403682709,-0.05900900810956
@do-me
do-me / array_to_string.py
Created October 1, 2024 18:49
Convert array to string with fixed decimal precision, multiprocessing with pandarallel based on pandas df
import json
import numpy as np
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
# Function to round array to 2 decimal places and serialize to JSON
def round_and_serialize(x):
if isinstance(x, np.ndarray):
# Round and format each number to 2 decimal places
@do-me
do-me / unique_embeddings.py
Created September 27, 2024 07:15
Create embeddings for pandas df for unique texts only, saving resources
from FlagEmbedding import BGEM3FlagModel
# Instantiate the 'BAAI/bge-m3' model with use_fp16=True
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
# assuming gdf is a (geo)pandas dataframe with texts to run inference on
# Step 1: Get the list of texts to encode
gdf_list = gdf["texts"].to_list()
# Step 2: Deduplicate the list of texts and keep track of the original indices
# NOTE: set() does not preserve the order of gdf_list, so unique_texts is in arbitrary order
unique_texts = list(set(gdf_list))
@do-me
do-me / download_file.py
Created September 25, 2024 13:39
Batch download pandas parquet files from Jupyter frontend to local computer with Python and JS (BytesIO) without saving on server
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
from io import BytesIO
from IPython.display import display, Javascript
import gc # Garbage collector
import time
from IPython.utils import io
import psutil