@rajivmehtaflex
Last active August 5, 2024 12:04
HF+ZeroGPU+OpenAI-Prox+Gradio Operation
# Read a CSV dataset from the Hugging Face Hub into a Ray Dataset
import ray
from huggingface_hub import HfFileSystem
fs = HfFileSystem()
ds = ray.data.read_csv("hf://datasets/scikit-learn/iris/Iris.csv", filesystem=fs)
ds.show(5)
print('Done')

# Minimal Gradio app that exposes a simple function over the built-in API
import gradio as gr

# Define a simple function
def add_numbers(a, b):
    return a + b
# Create a Gradio interface
iface = gr.Interface(fn=add_numbers, inputs=["number", "number"], outputs="number")
# Launch the interface with API enabled
iface.queue(api_open=True).launch()
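
# Client-side usage (not part of the original gist): a minimal sketch of calling the
# app above over its API with gradio_client, run from a separate session while the
# app is up. The local URL and the default "/predict" endpoint are assumptions.
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")
result = client.predict(3, 5, api_name="/predict")
print(result)  # expected: 8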

# Gradio text-generation app backed by a 4-bit quantized model
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
# Load the model and tokenizer with quantization
# model_name = "facebook/gamma-2" # Update this with the correct model name
model_name = "HuggingFaceH4/zephyr-7b-beta"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # Adding a padding token if needed
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, low_cpu_mem_usage=True)
# Create a text generation pipeline
text_generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2500, top_k=50, temperature=0.35, do_sample=True)
# Define the text generation function
def generate_text(prompt):
    results = text_generation_pipeline(prompt)  # max_new_tokens is already set on the pipeline
    generated_text = results[0]['generated_text']
    return generated_text
# Create the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Text Generation with 4-bit Quantization",
    description="Enter a prompt to generate text using the Zephyr-7B model with 4-bit quantization."
)
# Launch the Gradio app
iface.queue(api_open=True).launch(share=True)
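
# ZeroGPU sketch (not in the original file): on a Hugging Face ZeroGPU Space the
# GPU-bound function is wrapped with the spaces.GPU decorator so a GPU is attached
# only while a call runs. The decorated function below is illustrative; on a Space
# it would take the place of generate_text above.
import spaces

@spaces.GPU
def generate_text_on_zerogpu(prompt):
    results = text_generation_pipeline(prompt)
    return results[0]['generated_text']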

# LangChain ChatHuggingFace wrapper around a local 4-bit quantized pipeline
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import ChatHuggingFace
from transformers import BitsAndBytesConfig
import torch
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)

messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(content="What is the color of a flamingo?"),
]
# Define the model id
model_id = "HuggingFaceH4/zephyr-7b-beta"
# model_id = "mistralai/Mistral-Nemo-Instruct-2407"
# Load the tokenizer and set a chat template
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # Adding a padding token if needed
# Define the BitsAndBytes configuration for low-memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)
# Load the model with the quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)
# Create a text generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500, top_k=50, temperature=0.1, do_sample=True)
# Note: ChatHuggingFace formats messages with the tokenizer's built-in chat template
# (Zephyr ships one); the dict below only illustrates the intended roles and is not
# passed to the constructor.
chat_template = {
    "system": "You are a helpful assistant.",
    "user": "{input}",
    "assistant": "{output}",
}
# Instantiate the HuggingFacePipeline with the defined pipeline
llm = HuggingFacePipeline(pipeline=pipe)
# Create the ChatHuggingFace instance
llm_engine_hf = ChatHuggingFace(llm=llm)
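
# One-shot call using the SystemMessage/HumanMessage list defined above (not part of
# the original gist; the interactive loop below does the same for typed prompts)
one_shot = llm_engine_hf.invoke(messages)
print(one_shot.content)
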
while True:
    user_input = input("Enter a prompt: ")
    if user_input.lower() == "exit":
        break
    response = llm_engine_hf.invoke(user_input)
    print(response.content.replace(user_input, '').replace('<|endoftext|>', '').strip())

# Minimal Flask server that mimics the OpenAI chat/text completion endpoints
from flask import Flask, request

app = Flask(__name__)
@app.route('/')
def index():
    return 'Hello from Flask!'
# chat completion
@app.route('/v1/chat/completions', methods=["POST", "GET"])
@app.route('/chat/completions', methods=["POST"])
def chat_completion():
    print("got request for chat completion")
    data = request.get_json(silent=True)  # tolerate GET requests without a JSON body
    print("request data", data)
    return {
        "object": "chat.completion",
        "choices": [{
            "finish_reason": "stop",
            "index": 0,
            "message": {
                "content": "The sky, a canvas of blue,\nA work of art, pure and true,\nA",
                "role": "assistant"
            }
        }],
        "id": "chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8",
        "created": 1699290237.408061,
        "model": "togethercomputer/llama-2-70b-chat",
        "usage": {
            "completion_tokens": 18,
            "prompt_tokens": 14,
            "total_tokens": 32
        }
    }
# text completion
@app.route('/v1/completions', methods=["POST"])
def completion():
    print("got request for completion")
    data = request.json
    print("request data", data)
    return {
        "warning": "This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. Learn more https://platform.openai.com/docs/deprecations",
        "id": "cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r",
        "object": "text_completion",
        "created": 1699290166,
        "model": "text-davinci-003",
        "choices": [{
            "text": "\n\nThe weather in San Francisco varies depending on what time of year and time",
            "index": 0,
            "logprobs": None,
            "finish_reason": "length"
        }],
        "usage": {
            "prompt_tokens": 7,
            "completion_tokens": 16,
            "total_tokens": 23
        }
    }
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8890)
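
# Client-side sketch (not part of the original gist): point the official OpenAI Python
# client (openai>=1.0) at this proxy from a separate process while the server above is
# running. The base_url, dummy api_key, and model name are assumptions; the stub
# ignores them and always returns the canned response.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8890/v1", api_key="not-needed")
resp = client.chat.completions.create(
    model="togethercomputer/llama-2-70b-chat",
    messages=[{"role": "user", "content": "Write a short poem about the sky."}],
)
print(resp.choices[0].message.content)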

# Gradio ChatInterface backed by a local Ollama model via LangChain
from langchain_ollama.chat_models import ChatOllama
from langchain.schema import AIMessage, HumanMessage
import gradio as gr
# Requires a local Ollama server with the model pulled (e.g. `ollama pull mistral-nemo`)
llm = ChatOllama(model="mistral-nemo:latest")

def predict(message, history):
    chat_history = []
    for human, ai in history:
        chat_history.append(HumanMessage(content=human))
        chat_history.append(AIMessage(content=ai))
    chat_history.append(HumanMessage(content=message))
    llm_response = llm.invoke(chat_history)
    return llm_response.content
gr.ChatInterface(predict).launch(share=True)

# Dependencies (requirements-style list)
duckdb
e2b_code_interpreter
gradio
google-search-results
huggingface-hub
langchain
langchain-community
langchain-core
langchain-experimental
langchain-huggingface
langchain-openai
langchain-text-splitters
langchainhub
numexpr
pandas
sentencepiece
text-generation
transformers

# Standalone SmolLM 4-bit inference script
# pip install transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import re
checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)
def remove_first_token(input_text):
    # Regular expression pattern to match the first occurrence of the desired token
    pattern = r'<\|im_start\|>user\n(.*?)<\|im_end\|>'
    # Substitute the first occurrence of the token with just the inner text
    cleaned_text = re.sub(pattern, r'\1', input_text, count=1)
    return cleaned_text.strip()  # Strip any leading or trailing whitespace
device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=quantization_config, low_cpu_mem_usage=True, device_map="auto")

if __name__ == "__main__":
    while True:
        user_query = input(">>>")
        if user_query == "exit":
            break
        messages = [{"role": "user", "content": user_query}]
        input_text = tokenizer.apply_chat_template(messages, tokenize=False)
        print(input_text)
        inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
        outputs = model.generate(inputs, max_new_tokens=2000, temperature=0.35, top_p=0.92, do_sample=True)
        print(remove_first_token(tokenizer.decode(outputs[0])).replace("<|im_end|>", "").replace("<|im_start|>", ""))