Jean-Michel Daignan jeanmidevacc

Data scientist and R&D engineer who likes to test, build and destroy stuff. My opinions are my own, and I always have an opinion because I am French.

jeanmidevacc / mf_surprise_example.py

Created August 2, 2024 19:20

	import surprise

	# `dfp_actions` is a pandas DataFrame (defined elsewhere) that contains the ratings between users and items.
	# Its "user_id" and "item_id" columns hold the indexed ids of the user and the item.

	# Build a reader whose rating scale spans the lowest and highest ratings found in the data.
	rating_scale = (dfp_actions["rating"].min(), dfp_actions["rating"].max())
	reader = surprise.Reader(rating_scale=rating_scale)

	# Load the (user, item, rating) triplets into a surprise dataset.
	surprise_data = surprise.Dataset.load_from_df(dfp_actions[["user_id", "item_id", "rating"]], reader=reader)

	# Build the trainset from the dataset loaded just above.
	# (The previous version called `surprise_train`, a name that is never defined, which raised NameError.)
	train_set = surprise_data.build_full_trainset()

jeanmidevacc / mf_build_recommendations_with_udf_with_pyspark.py

Last active August 2, 2024 18:51

	from pyspark.sql import SparkSession, SQLContext
	import pyspark.sql.functions as F
	import pyspark.sql.types as T
	from pyspark.sql import Window

	def build_recommendations(broadcast_dfp_items_factors):
	    # `broadcast_dfp_items_factors` is a broadcast variable; its `.value` is indexed with a
	    # boolean mask on the "id" column (presumably a pandas DataFrame of item factors - confirm with caller).
	    def build_recommendations_(user_factors, inventory_itemid_encoded, k=5):
	        # Fetch only the factors of the items that can be recommended (add your own rules here):
	        # every item whose "id" appears in `inventory_itemid_encoded` is filtered out.
	        # The mask is built from the same broadcast value that is being filtered; the previous
	        # version read it from `broadcast_array_items_factors`, a name that does not exist (NameError).
	        dfp_items_factors = broadcast_dfp_items_factors.value
	        dfp_items_factors_to_rank = dfp_items_factors[~dfp_items_factors["id"].isin(inventory_itemid_encoded)]

jeanmidevacc / mf_index_entities_with_pyspark.py

Created August 2, 2024 18:26

	from pyspark.sql import SparkSession, SQLContext
	import pyspark.sql.functions as F
	import pyspark.sql.types as T
	from pyspark.sql import Window

	dfs_items = #pyspark dataframe that contains items to be indexed (define by itemid)

	previous_max_itemid_indexed = 0 #in case it's an incremental process , set a tempora

	windowspec = Window.orderBy(F.col("itemid"))# build a window function

jeanmidevacc / suika_make_actions.py

Created June 25, 2024 09:57

suika_make_Action

	if time.time() - last_checked > 2:
	step += 1

	# Update the last checked time
	last_checked = time.time()
	particle_states = get_particles_state(handler.data["particles"])
	score = handler.data["score"]

	# Set the next particle's x position before releasing
	observation = {

jeanmidevacc / trigger.sh

Created June 24, 2024 22:16

suika trigger simulation

	#!/bin/bash

	# Check if the user provided an argument
	if [ -z "$1" ]; then
	echo "Please specify the number of runs as an argument."
	exit 1
	fi

	# Total number of runs specified by the first argument
	TOTAL_RUNS=$1

jeanmidevacc / suika_baseline_agents.py

Created June 24, 2024 15:56

suika_baseline_agents.py

	from datetime import datetime
	import random

	import pandas as pd


	class RandomAgent():
	def __init__(self):
	self.creation_date = datetime.utcnow()
	self.tag = "random"

jeanmidevacc / build_timecodes_based_on_silence.py

Created January 28, 2024 21:38

build_timecodes_based_on_silence

	from pydub import AudioSegment, silence
	import pandas as pd

	def build_segments(audio, length_segment=10, dbfs=0):
	silences = silence.detect_silence(audio, min_silence_len=1000, silence_thresh=dbfs-16)
	dfp_silences = pd.DataFrame(silences, columns = ["start_timecode", "end_timecode"])

	threshold_segment = int(length_segment * 60 * 1000)
	first_timecode = 0
	last_timecode = int(audio.duration_seconds * 1000)

jeanmidevacc / openai_whisper.py

Created January 28, 2024 20:20

openai_whisper

	from pathlib import Path
	from openai import OpenAI

	client_openai = OpenAI(
	# This is the default and can be omitted
	api_key="sk-XXX",
	)

	def get_transcript_openai_api(file, language="fr"):
	# f = open(file, "rb")

jeanmidevacc / local_hf_whisper.py

Created January 28, 2024 18:16

local_hf_whisper

	import torch
	from transformers import pipeline

	device = "cuda:0" if torch.cuda.is_available() else "cpu"
	mapping = {"whisper-tiny" : "tiny", "whisper-small" : "small", "whisper-medium" : "medium", "whisper-base" : "base"}

	hf_model_name = "whisper-medium"
	size_model = mapping[hf_model_name] #tiny, base, small, medium

	model = pipeline(

jeanmidevacc / local_whisper.py

Last active January 28, 2024 16:04

local_whisper.py

	import whisper

	size_model = "medium" #the type of model in the model card , with .en or not
	model = whisper.load_model(size_model, device="cuda")

	def get_transcript_local_whisper(model, file, language):
	audio = whisper.load_audio(file)
	audio = whisper.pad_or_trim(audio)
	mel = whisper.log_mel_spectrogram(audio).to(model.device)
	result = whisper.decode(model, mel, language=language)