HF + ZeroGPU + OpenAI Proxy + Gradio Operation
Reading a Hugging Face dataset with Ray Data:
import ray
from huggingface_hub import HfFileSystem

# Expose the Hugging Face Hub as an fsspec filesystem so Ray can resolve hf:// paths
fs = HfFileSystem()
ds = ray.data.read_csv("hf://datasets/scikit-learn/iris/Iris.csv", filesystem=fs)
ds.show(5)
print('Done')
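As a quick sanity check on the hf:// filesystem itself, you can list the dataset repository before handing it to Ray. A minimal sketch, assuming only that huggingface_hub is installed:

from huggingface_hub import HfFileSystem

fs = HfFileSystem()
# Lists the repo's file paths, e.g. .../Iris.csv, confirming the hf:// path exists
print(fs.ls("datasets/scikit-learn/iris", detail=False))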
A minimal Gradio app with its API enabled:
import gradio as gr

# Define a simple function
def add_numbers(a, b):
    return a + b

# Create a Gradio interface with two numeric inputs and one numeric output
iface = gr.Interface(fn=add_numbers, inputs=["number", "number"], outputs="number")

# Launch the interface with the API enabled
iface.queue(api_open=True).launch()
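Since the queue's API is open, the same endpoint can be called programmatically. A minimal sketch using gradio_client, assuming the app is running on the default local port and that the endpoint keeps Gradio's default name "/predict":

from gradio_client import Client

client = Client("http://127.0.0.1:7860/")
# Call the add_numbers endpoint with two numbers
result = client.predict(3, 5, api_name="/predict")
print(result)  # 8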
Text generation with 4-bit quantization, served through Gradio:
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Load the model and tokenizer with 4-bit quantization
model_name = "HuggingFaceH4/zephyr-7b-beta"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Reuse the EOS token for padding so no embedding resize is needed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Create a text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2500,
    top_k=50,
    temperature=0.35,
    do_sample=True,
)

# Define the text generation function
def generate_text(prompt):
    results = text_generation_pipeline(prompt)
    return results[0]['generated_text']

# Create the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Text Generation with 4-bit Quantization",
    description="Enter a prompt to generate text using the Zephyr-7B model with 4-bit quantization."
)

# Launch the Gradio app
iface.queue(api_open=True).launch(share=True)
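To run this app on a Hugging Face Space with ZeroGPU (as the gist title suggests), the GPU-bound function is typically decorated with @spaces.GPU so a device is allocated only for the duration of each call. A sketch of that adaptation, assuming the spaces package that ZeroGPU Spaces provide:

import spaces

@spaces.GPU  # Request a ZeroGPU slice only while this function runs
def generate_text(prompt):
    results = text_generation_pipeline(prompt)
    return results[0]['generated_text']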
Chatting with a quantized model through LangChain's ChatHuggingFace:
from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
from langchain_core.messages import HumanMessage, SystemMessage

# Define the model id
model_id = "HuggingFaceH4/zephyr-7b-beta"
# model_id = "mistralai/Mistral-Nemo-Instruct-2407"

# Load the tokenizer; Zephyr ships with its own chat template, which ChatHuggingFace uses to format messages
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Reuse the EOS token for padding so no embedding resize is needed

# Define the BitsAndBytes configuration for low-memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)

# Load the model with the quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Create a text generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500, top_k=50, temperature=0.1, do_sample=True)

# Wrap the pipeline for LangChain, then wrap again for chat-style invocation
llm = HuggingFacePipeline(pipeline=pipe)
llm_engine_hf = ChatHuggingFace(llm=llm)

system_message = SystemMessage(content="You're a helpful assistant")

while True:
    user_input = input("Enter a prompt: ")
    if user_input.lower() == "exit":
        break
    messages = [system_message, HumanMessage(content=user_input)]
    response = llm_engine_hf.invoke(messages)
    print(response.content.replace(user_input, '').replace('<|endoftext|>', '').strip())
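Under the hood, ChatHuggingFace formats the messages with the tokenizer's own chat template before handing the resulting string to the pipeline. To inspect the exact prompt the model receives, a small illustrative sketch reusing the tokenizer loaded above:

prompt = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "You're a helpful assistant"},
        {"role": "user", "content": "What is the color of a flamingo?"},
    ],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)  # Zephyr's <|system|>/<|user|>/<|assistant|>-formatted prompt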
A Flask server that mocks the OpenAI chat and text completion endpoints:
from flask import Flask, request

app = Flask(__name__)

@app.route('/')
def index():
    return 'Hello from Flask!'

# Chat completion endpoint (mimics the OpenAI /v1/chat/completions response shape)
@app.route('/v1/chat/completions', methods=["POST", "GET"])
@app.route('/chat/completions', methods=["POST"])
def chat_completion():
    print("got request for chat completion")
    data = request.json
    print("request data", data)
    return {
        "object": "chat.completion",
        "id": "chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8",
        "created": 1699290237.408061,
        "model": "togethercomputer/llama-2-70b-chat",
        "choices": [{
            "finish_reason": "stop",
            "index": 0,
            "message": {
                "content": "The sky, a canvas of blue,\nA work of art, pure and true,\nA",
                "role": "assistant"
            }
        }],
        "usage": {
            "completion_tokens": 18,
            "prompt_tokens": 14,
            "total_tokens": 32
        }
    }

# Text completion endpoint (mimics the OpenAI /v1/completions response shape)
@app.route('/v1/completions', methods=["POST"])
def completion():
    print("got request for completion")
    data = request.json
    print("request data", data)
    return {
        "warning": "This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. Learn more https://platform.openai.com/docs/deprecations",
        "id": "cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r",
        "object": "text_completion",
        "created": 1699290166,
        "model": "text-davinci-003",
        "choices": [{
            "text": "\n\nThe weather in San Francisco varies depending on what time of year and time",
            "index": 0,
            "logprobs": None,
            "finish_reason": "length"
        }],
        "usage": {
            "prompt_tokens": 7,
            "completion_tokens": 16,
            "total_tokens": 23
        }
    }

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8890)
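Because the routes and response shapes match OpenAI's, any OpenAI-compatible client can be pointed at this stub. A minimal sketch with the official openai Python package (v1+), assuming the server above is running locally on port 8890:

from openai import OpenAI

# base_url redirects all /v1/* calls to the local Flask stub; the API key is unused
client = OpenAI(base_url="http://localhost:8890/v1", api_key="not-needed")
resp = client.chat.completions.create(
    model="togethercomputer/llama-2-70b-chat",
    messages=[{"role": "user", "content": "Write a short poem about the sky"}],
)
print(resp.choices[0].message.content)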
A Gradio chat UI backed by a local Ollama model via LangChain:
from langchain_ollama.chat_models import ChatOllama
from langchain.schema import AIMessage, HumanMessage
import gradio as gr

llm = ChatOllama(model="mistral-nemo:latest")

def predict(message, history):
    # Rebuild the conversation as LangChain messages from Gradio's (user, assistant) history pairs
    chat_history = []
    for human, ai in history:
        chat_history.append(HumanMessage(content=human))
        chat_history.append(AIMessage(content=ai))
    chat_history.append(HumanMessage(content=message))
    llm_response = llm.invoke(chat_history)
    return llm_response.content

gr.ChatInterface(predict).launch(share=True)
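For longer replies, the same ChatOllama instance can stream tokens instead of blocking. A simplified sketch (it drops the history handling above for brevity) that yields partial text so gr.ChatInterface renders it incrementally:

def predict(message, history):
    chat_history = [HumanMessage(content=message)]
    partial = ""
    for chunk in llm.stream(chat_history):
        partial += chunk.content
        yield partial  # gr.ChatInterface treats generator output as streaming text

gr.ChatInterface(predict).launch(share=True)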
Dependencies (requirements.txt):
duckdb
e2b_code_interpreter
gradio
google-search-results
huggingface-hub
langchain
langchain-community
langchain-core
langchain-experimental
langchain-huggingface
langchain-openai
langchain-text-splitters
langchainhub
numexpr
pandas
sentencepiece
text-generation
transformers
Interactive chat with SmolLM-1.7B-Instruct in 4-bit:
# pip install transformers accelerate bitsandbytes
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import re

checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)

def remove_first_token(input_text):
    # Strip the echoed user turn from the decoded output, keeping only its inner text
    pattern = r'<\|im_start\|>user\n(.*?)<\|im_end\|>'
    # Substitute the first occurrence of the token with just the inner text
    cleaned_text = re.sub(pattern, r'\1', input_text, count=1)
    return cleaned_text.strip()  # Strip any leading or trailing whitespace

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # Reuse the EOS token for padding so no embedding resize is needed
# device_map="auto" lets accelerate place the quantized weights; this covers single- and multi-GPU setups
model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=quantization_config, low_cpu_mem_usage=True, device_map="auto")

if __name__ == "__main__":
    while True:
        user_query = input(">>>")
        if user_query == "exit":
            break
        messages = [{"role": "user", "content": user_query}]
        # add_generation_prompt=True appends the assistant header so the model starts a fresh reply
        input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        print(input_text)
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(inputs, max_new_tokens=2000, temperature=0.35, top_p=0.92, do_sample=True)
        print(remove_first_token(tokenizer.decode(outputs[0])).replace("<|im_end|>", "").replace("<|im_start|>", ""))
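The same chat flow also works through the higher-level pipeline API, which in recent transformers releases accepts chat messages directly and returns the full conversation, so no manual template or post-cleanup is needed. A minimal sketch, assuming a transformers version with chat support in text-generation pipelines:

from transformers import pipeline

checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
generator = pipeline("text-generation", model=checkpoint, device_map="auto")
out = generator([{"role": "user", "content": "What is gravity?"}], max_new_tokens=200)
# generated_text holds the whole conversation; the last message is the assistant's reply
print(out[0]["generated_text"][-1]["content"])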