Skip to content

Instantly share code, notes, and snippets.

View jeanmidevacc's full-sized avatar

Jean-Michel Daignan jeanmidevacc

View GitHub Profile
# Collect the versions of the table committed on or after start_date.
# The format spec inside the f-string renders the same text as
# start_date.strftime(...) -- assumes start_date is a datetime-like object.
query = f"""
SELECT *
FROM (DESCRIBE HISTORY name_of_the_table)
WHERE timestamp >= '{start_date:%Y-%m-%d %H:%M:%S}';
"""

# Read one specific version of the table, selected with the "versionAsOf"
# reader option.
dfs_recommendations = (
    spark.read
    .format("delta")
    .option("versionAsOf", version_id)
    .table("name_of_the_table")
)
@jeanmidevacc
jeanmidevacc / databricks_liquid_clustering.py
Last active August 26, 2025 03:25
Sample code to build and write to a liquid-clustered table
# Add a CREATE statement example
# build a table
query = """
CREATE TABLE IF NOT EXISTS name_of_the_table (
column_1 DATE,
column_2 INT,
p_dateid INT NOT NULL
)
USING DELTA
CLUSTER BY (p_dateid); # you could also set CLUSTER BY AUTO , if you wanto let databrciks handle the rest
@jeanmidevacc
jeanmidevacc / databricks_ai_query_udf.sql
Created August 23, 2025 15:21
Snapshot from the Databricks documentation showing how to build a function on top of ai_query https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_query
-- Define correct_grammar(text): a SQL function that takes a STRING and returns
-- the STRING produced by ai_query, called with
-- 'databricks-meta-llama-3-3-70b-instruct' as its first argument.
CREATE FUNCTION correct_grammar(text STRING)
RETURNS STRING
RETURN ai_query(
'databricks-meta-llama-3-3-70b-instruct',
-- Prompt: a fixed instruction followed by the caller-supplied text.
CONCAT('Correct this to standard English:\n', text));
> GRANT EXECUTE ON correct_grammar TO ds;
-- DS fixes grammar issues in a batch.
> SELECT
* EXCEPT text,
correct_grammar(text) AS text
@jeanmidevacc
jeanmidevacc / NYC Taxi Trip Analysis.lvdash.json
Created August 23, 2025 14:16
JSON export of the NYC taxi dashboard declaration on Databricks
{
"datasets": [
{
"name": "0ca96e81",
"displayName": "route revenue",
"queryLines": [
"SELECT\n",
" T.pickup_zip,\n",
" T.dropoff_zip,\n",
" T.route as `Route`,\n",
@jeanmidevacc
jeanmidevacc / databricks_create_volume.py
Created July 21, 2025 20:53
PySpark code to create a volume in Databricks schemas
schemas = []  # List of the schemas in which to create the volume
volume_name = ""  # Name of your volume

# Create the volume in every listed schema. This is best effort: a failure on
# one schema is reported and the loop carries on with the remaining schemas.
for schema in schemas:
    # Schema-qualified name, so each message says which volume it refers to.
    volume_path = f"{schema}.{volume_name}"
    try:
        spark.sql(f"CREATE VOLUME IF NOT EXISTS {volume_path}")
    except Exception as e:
        print(f"Error creating volume {volume_path}: {e}")
    else:
        print(f"Volume {volume_path} created or already exists.")
# definition of the few shot messages
messages_few_shots = []
for key, sample in samples.items():
messages_few_shots.extend([{
"role": "user",
"content" : str(sample["input"]),
},{
"role": "assistant",
"content" : str(sample["output"]),
}
samples = {
"sample_0" : {
"input" : "<p><strong>Merlin:</strong> <em>(off)</em> Non, non, non on y va sûrement pas ! Vous me laissez le temps de me faire à l'idée !</p>",
"output" : [{
"character": "Merlin",
"dialogue": "Non, non, non on y va sûrement pas ! Vous me laissez le temps de me faire à l'idée !",
"parenthetical": "(off)"}]
},
"sample_1" : {
"input" : "<p><strong>Servius:</strong> Allez on y va !</p>",
@jeanmidevacc
jeanmidevacc / llm_prompt.py
Created May 28, 2025 22:48
This is an example of a prompt tested during my Kaamelott project
prompt = """
You are given raw dialogues from a TV show script in HTML format.
Extract and return the dialogues in the following JSON format:
[
{
"character": "",
"dialogue": "",
"parenthetical": ""
}
@jeanmidevacc
jeanmidevacc / databricks_dbutils_selection.py
Last active July 21, 2025 20:50
This is a collection of dbutils functions that I used
# Secrets module
# Fetch a secret, identified by its scope and key, from the vault linked to
# the Databricks instance.
dbutils.secrets.get(scope="", key="")
# Widgets module
# Create a text input widget with a given name and default value.
dbutils.widgets.text("","")
# Retrieve the current value of an input widget, looked up by name.
dbutils.widgets.get("")
# Notebook module
# Exit a notebook cleanly.
# NOTE(review): dbutils.notebook.exit is documented as taking a value
# argument -- confirm that this no-argument call works as intended.
dbutils.notebook.exit()
@jeanmidevacc
jeanmidevacc / mf_pyspark_als.py
Last active August 14, 2024 12:12
mf_pyspark_als.py
# ALS is exposed by the pyspark.ml.recommendation module (singular name).
from pyspark.ml.recommendation import ALS
# Context: dfs_actions is a Spark DataFrame that looks like the pandas DataFrame from the surprise example https://gist.github.com/jeanmidevacc/a00c9cf8fe9379cd8a818b1d842dbaa1
# Setup the model parameters
als = ALS(
seed=12,
userCol="user_id",
itemCol="item_id",
ratingCol="rating",