lucataco / ollama_fast_speech_text_speech.py
Last active November 27, 2025 16:53
speech to text to speech using Ollama
""" To use: install Ollama, clone OpenVoice, run this script in the OpenVoice directory
brew install portaudio
brew install git-lfs
git lfs install
git clone https://github.com/myshell-ai/OpenVoice
cd OpenVoice
git clone https://huggingface.co/myshell-ai/OpenVoice
cp -r OpenVoice/* .
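The preview cuts off here. The rest of the script wires recording, transcription, Ollama, and OpenVoice together; a minimal sketch of just the Ollama round-trip (the model name and prompt are placeholders, not taken from the gist):

import requests

def ask_ollama(text: str, model: str = "mistral") -> str:
    # POST to the local Ollama server (default port 11434);
    # stream=False returns the whole reply as one JSON object.
    resp = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": text, "stream": False},
    )
    resp.raise_for_status()
    return resp.json()["response"]

print(ask_ollama("Say hello in one sentence."))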
lucataco / svd.py
Created January 4, 2024 15:52
Run SVD locally
# from cog import BasePredictor, Input, Path
import os
import cv2
import time
import math
import torch
import numpy as np
from PIL import Image
from glob import glob
from typing import Optional
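The preview stops at the imports. A minimal sketch of what "run SVD locally" looks like with diffusers' StableVideoDiffusionPipeline (the checkpoint, resolution, and fps follow the library's standard example; they are assumptions here, not read from the gist):

import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16, variant="fp16",
)
pipe.to("cuda")

image = load_image("input.png").resize((1024, 576))  # SVD's native resolution
frames = pipe(image, decode_chunk_size=8).frames[0]  # smaller chunks use less VRAM
export_to_video(frames, "output.mp4", fps=7)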
lucataco / notes.txt
Last active December 7, 2024 19:25
L40S vs A40 Benchmarks
**Goal**: Run benchmarks of SDXL, SVD, and Llama2-13B on an L40S test node
**TL;DR**:
- L40S matches the A40's inference speed on SDXL
- L40S is ~10% faster than the A40 on Llama2-13B inference
- L40S is ~9% faster than the A40 at video rendering (SVD)
**Process**: Run non-docker/cog python code for fp16
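A minimal timing harness in the spirit of these runs (the callable under test is a placeholder):

import time

def bench(fn, warmup=1, runs=5):
    for _ in range(warmup):  # absorb model load / CUDA init
        fn()
    times = []
    for _ in range(runs):
        t0 = time.time()
        fn()
        times.append(time.time() - t0)
    print(f"mean {sum(times)/len(times):.2f}s over {runs} runs")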
lucataco / sdxl.py
Created December 7, 2023 03:21
Run SDXL locally
from diffusers import DiffusionPipeline
import torch
import time
# load both base & refiner
t1 = time.time()
base = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
)
base.to("cuda")
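# The preview cuts off here. A plausible continuation following the standard
# diffusers base+refiner pattern (the 40 steps and the 0.8 denoising split are
# the library docs' defaults, assumed rather than taken from the gist):
refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base.text_encoder_2, vae=base.vae,
    torch_dtype=torch.float16, variant="fp16", use_safetensors=True,
)
refiner.to("cuda")
print(f"setup: {time.time() - t1:.1f}s")

prompt = "An astronaut riding a green horse"  # placeholder prompt
t2 = time.time()
# Base denoises the first 80% of steps and hands latents to the refiner.
latents = base(prompt=prompt, num_inference_steps=40,
               denoising_end=0.8, output_type="latent").images
image = refiner(prompt=prompt, num_inference_steps=40,
                denoising_start=0.8, image=latents).images[0]
print(f"inference: {time.time() - t2:.1f}s")
image.save("out.png")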
lucataco / llama2-13b-chat.py
Last active May 20, 2025 08:37
Run llama2-13b locally
import os
import time
import torch
from typing import Iterator
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Set to 512, 1024, or 2048
MAX_NEW_TOKENS = 512
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
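The preview stops at the constants. A hedged sketch of the streaming pattern these imports point to, with generate() on a background thread feeding a TextIteratorStreamer (the model ID is assumed from the gist's name):

model_id = "meta-llama/Llama-2-13b-chat-hf"  # assumption
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

def generate(prompt: str) -> Iterator[str]:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks, so run it on a thread and read tokens as they arrive.
    Thread(target=model.generate, kwargs=dict(
        **inputs, streamer=streamer, max_new_tokens=MAX_NEW_TOKENS
    )).start()
    yield from streamer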
lucataco / runllama2.py
Created November 29, 2023 20:33
Benchmark Llama2-13B speeds
import time
import json
import requests
# Start Llama2 13b locally:
# docker run -d -p 5000:5000 --gpus=all r8.im/meta/llama-2-13b@sha256:078d7a002387bd96d93b0302a4c03b3f15824b63104034bfa943c63a8f208c38
url = "http://localhost:5000/predictions"
lucataco / runSVD.py
Created November 29, 2023 20:33
Benchmark SVD speed
import io
import time
import json
import base64
import requests
# Start SVD locally:
# docker run -d -p 5000:5000 --gpus=all r8.im/stability-ai/stable-video-diffusion@sha256:3f0457e4619daac51203dedb472816fd4af51f3149fa7a9e0b5ffcf1b8172438
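The preview cuts off before the request. A sketch of the timed round-trip: cog file inputs go in as base64 data URIs, and "input_image" plus the output shape are assumptions here:

url = "http://localhost:5000/predictions"

with open("input.png", "rb") as f:
    data_uri = "data:image/png;base64," + base64.b64encode(f.read()).decode()

t0 = time.time()
resp = requests.post(url, json={"input": {"input_image": data_uri}})
resp.raise_for_status()
print(f"render: {time.time() - t0:.1f}s")

# Output assumed to come back as a data URI; strip the header, decode, write.
video_b64 = resp.json()["output"].split(",", 1)[1]
with open("output.mp4", "wb") as f:
    f.write(base64.b64decode(video_b64))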
lucataco / runSDXL.py
Created November 29, 2023 20:32
Benchmark SDXL speed
import io
import time
import json
import base64
import requests
from PIL import Image
# Start SDXL locally:
# docker run -d -p 5000:5000 --gpus=all r8.im/stability-ai/sdxl@sha256:39ed52f2a78e934b3ba6e2a89f5b1c712de7dfea535525255b1aa35c5565e08b
url = "http://localhost:5000/predictions"
lucataco / predict.py
Created October 15, 2023 02:48
RealvisXL-v1.0
from cog import BasePredictor, Input, Path
import os
import torch
import time
from diffusers import (
    DDIMScheduler,
    DiffusionPipeline,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    HeunDiscreteScheduler,
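    # The preview truncates the import list here; closing it so the sketch
    # below parses (any further schedulers in the gist are omitted).
)

# A hedged guess at the predictor's shape under cog's API; the HF model ID
# and the input defaults are placeholders, not taken from the gist.
class Predictor(BasePredictor):
    def setup(self):
        self.pipe = DiffusionPipeline.from_pretrained(
            "SG161222/RealVisXL_V1.0", torch_dtype=torch.float16
        )
        self.pipe.to("cuda")

    def predict(
        self,
        prompt: str = Input(description="Input prompt"),
        num_inference_steps: int = Input(default=30, ge=1, le=100),
    ) -> Path:
        image = self.pipe(prompt, num_inference_steps=num_inference_steps).images[0]
        out = "/tmp/out.png"
        image.save(out)
        return Path(out)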
lucataco / Falcon7BHFspeedtest.py
Last active June 30, 2023 23:55
Falcon7B HF speed test
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import time
model = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
"text-generation",