William Held Helw150

Helw150 / parallel_t5.py

Last active May 10, 2023 14:52

Flan T5 Parallel Usage

	from transformers import AutoTokenizer, T5ForConditionalGeneration

	# Model Init
	n_gpu = 8
	tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
	model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2")
	heads_per_gpu = len(model.encoder.block) // n_gpu
	device_map = {
	gpu: list(
	range(

Helw150 / ot_loss.py

Last active April 27, 2023 22:02

OT TADA Loss

	from typing import List, Optional, Tuple, Union
	from torchtyping import TensorType

	from transformers.adapters.modeling import Adapter
	from transformers.adapters import (
	BartAdapterModel,
	RobertaAdapterModel,
	BertAdapterModel,
	AdapterConfig,
	)

Helw150 / process_parses.py

Last active May 6, 2024 17:33

Helw150 / intermediate_push_parquet.py

Created May 7, 2024 21:41

	def _push_parquet_shards_to_hub( [1071/1877]
	self,
	repo_id: str,
	data_dir: str = "data",
	split: Optional[str] = None,
	token: Optional[str] = None,
	revision: Optional[str] = None,
	create_pr: Optional[bool] = False,
	max_shard_size: Optional[Union[int, str]] = None,
	num_shards: Optional[int] = None,

Helw150 / via.py

Last active May 10, 2024 19:58

	text = # Tokenized Text Corresponding to Recording Transcript
	audio = # Mel Spectrogram of the Recording

	# Only Train Connector and Projection
	self.encoder.freeze()
	self.llama.freeze()

	# Convert the Mel Spectrogram to 1500 Embeddings with Whisper Encoder (CNN+Transformer)
	audio_features = self.encoder(audio)

Helw150 / data_generation.py

Created May 23, 2024 00:06

	from time import sleep

	from datasets import load_dataset
	from huggingface_hub import InferenceClient
	from ratelimit import limits, sleep_and_retry
	from transformers import AutoTokenizer

	# Load the "train" split of the "yijingwu/HeySQuAD_human" dataset.
	dataset = load_dataset("yijingwu/HeySQuAD_human", split="train")

	# Tokenizer for "meta-llama/Meta-Llama-3-8B-Instruct"; its use is not
	# visible in this excerpt (presumably prompt formatting — TODO confirm).
	tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

Helw150 / stream_gradio_audio.py

Last active August 27, 2024 07:05

	import gradio as gr
	import math
	import numpy as np
	import time
	import io
	import wave


	def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
	# This will create a wave header then append the frame input

Helw150 / stats.csv

Created March 30, 2026 16:52

Token Counts

	dataset	marin_tokens	category

Helw150 / bench_trace.py

Created April 10, 2026 01:29

Python execution tracer prototype for SWE-bench-style Docker images

	"""Benchmark end-to-end trace pipeline on multiple SWE-rebench-V2 Python images."""

	import json
	import os
	import subprocess
	import sys
	import time

	IMAGES = [
	{"instance_id": "wtforms__wtforms-614", "image_name": "docker.io/swerebenchv2/wtforms-wtforms:614-848d28d", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_fields.py tests/test_validators.py tests/test_widgets.py"},