@rajivmehtaflex
Last active August 5, 2024 12:04
HF+ZeroGPU+OpenAI-Prox+Gradio Operation
# Read a CSV dataset from the Hugging Face Hub into a Ray Dataset
import ray
from huggingface_hub import HfFileSystem
fs = HfFileSystem()
ds = ray.data.read_csv("hf://datasets/scikit-learn/iris/Iris.csv", filesystem=fs)
ds.show(5)
print('Done')

# Minimal Gradio app that exposes a simple function over the built-in API
import gradio as gr

# Define a simple function
def add_numbers(a, b):
    return a + b
# Create a Gradio interface
iface = gr.Interface(fn=add_numbers, inputs=["number", "number"], outputs="number")
# Launch the interface with API enabled
iface.queue(api_open=True).launch()
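
# Client-side usage (not part of the original gist): a minimal sketch of calling the
# app above over its API with gradio_client, run from a separate session while the
# app is up. The local URL and the default "/predict" endpoint are assumptions.
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")
result = client.predict(3, 5, api_name="/predict")
print(result)  # expected: 8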

# Gradio text-generation app backed by a 4-bit quantized model
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
# Load the model and tokenizer with quantization
# model_name = "facebook/gamma-2" # Update this with the correct model name
model_name = "HuggingFaceH4/zephyr-7b-beta"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # Adding a padding token if needed
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, low_cpu_mem_usage=True)
# Create a text generation pipeline
text_generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2500, top_k=50, temperature=0.35, do_sample=True)
# Define the text generation function
def generate_text(prompt):
    results = text_generation_pipeline(prompt)  # max_new_tokens is already set on the pipeline
    generated_text = results[0]['generated_text']
    return generated_text
# Create the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Text Generation with 4-bit Quantization",
    description="Enter a prompt to generate text using the Zephyr-7B model with 4-bit quantization."
)
# Launch the Gradio app
iface.queue(api_open=True).launch(share=True)
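
# ZeroGPU sketch (not in the original file): on a Hugging Face ZeroGPU Space the
# GPU-bound function is wrapped with the spaces.GPU decorator so a GPU is attached
# only while a call runs. The decorated function below is illustrative; on a Space
# it would take the place of generate_text above.
import spaces

@spaces.GPU
def generate_text_on_zerogpu(prompt):
    results = text_generation_pipeline(prompt)
    return results[0]['generated_text']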

# LangChain ChatHuggingFace wrapper around a local 4-bit quantized pipeline
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import ChatHuggingFace
from transformers import BitsAndBytesConfig
import torch
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)

messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(content="What is the color of a flamingo?"),
]
# Define the model id
model_id = "HuggingFaceH4/zephyr-7b-beta"
# model_id = "mistralai/Mistral-Nemo-Instruct-2407"
# Load the tokenizer and set a chat template
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # Adding a padding token if needed
# Define the BitsAndBytes configuration for low-memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)
# Load the model with the quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)
# Create a text generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500, top_k=50, temperature=0.1, do_sample=True)
# Note: ChatHuggingFace formats messages with the tokenizer's built-in chat template
# (Zephyr ships one); the dict below only illustrates the intended roles and is not
# passed to the constructor.
chat_template = {
    "system": "You are a helpful assistant.",
    "user": "{input}",
    "assistant": "{output}",
}
# Instantiate the HuggingFacePipeline with the defined pipeline
llm = HuggingFacePipeline(pipeline=pipe)
# Create the ChatHuggingFace instance
llm_engine_hf = ChatHuggingFace(llm=llm)
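
# One-shot call using the SystemMessage/HumanMessage list defined above (not part of
# the original gist; the interactive loop below does the same for typed prompts)
one_shot = llm_engine_hf.invoke(messages)
print(one_shot.content)
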
while True:
    user_input = input("Enter a prompt: ")
    if user_input.lower() == "exit":
        break
    response = llm_engine_hf.invoke(user_input)
    print(response.content.replace(user_input, '').replace('<|endoftext|>', '').strip())

# Minimal Flask server that mimics the OpenAI chat/text completion endpoints
from flask import Flask, request

app = Flask(__name__)
@app.route('/')
def index():
    return 'Hello from Flask!'
# chat completion
@app.route('/v1/chat/completions', methods=["POST", "GET"])
@app.route('/chat/completions', methods=["POST"])
def chat_completion():
    print("got request for chat completion")
    data = request.get_json(silent=True)  # tolerate GET requests without a JSON body
    print("request data", data)
    return {
        "object": "chat.completion",
        "choices": [{
            "finish_reason": "stop",
            "index": 0,
            "message": {
                "content": "The sky, a canvas of blue,\nA work of art, pure and true,\nA",
                "role": "assistant"
            }
        }],
        "id": "chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8",
        "created": 1699290237.408061,
        "model": "togethercomputer/llama-2-70b-chat",
        "usage": {
            "completion_tokens": 18,
            "prompt_tokens": 14,
            "total_tokens": 32
        }
    }
# text completion
@app.route('/v1/completions', methods=["POST"])
def completion():
    print("got request for completion")
    data = request.json
    print("request data", data)
    return {
        "warning": "This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. Learn more https://platform.openai.com/docs/deprecations",
        "id": "cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r",
        "object": "text_completion",
        "created": 1699290166,
        "model": "text-davinci-003",
        "choices": [{
            "text": "\n\nThe weather in San Francisco varies depending on what time of year and time",
            "index": 0,
            "logprobs": None,
            "finish_reason": "length"
        }],
        "usage": {
            "prompt_tokens": 7,
            "completion_tokens": 16,
            "total_tokens": 23
        }
    }
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8890)
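
# Client-side sketch (not part of the original gist): point the official OpenAI Python
# client (openai>=1.0) at this proxy from a separate process while the server above is
# running. The base_url, dummy api_key, and model name are assumptions; the stub
# ignores them and always returns the canned response.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8890/v1", api_key="not-needed")
resp = client.chat.completions.create(
    model="togethercomputer/llama-2-70b-chat",
    messages=[{"role": "user", "content": "Write a short poem about the sky."}],
)
print(resp.choices[0].message.content)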

# Gradio ChatInterface backed by a local Ollama model via LangChain
from langchain_ollama.chat_models import ChatOllama
from langchain.schema import AIMessage, HumanMessage
import gradio as gr
# Requires a local Ollama server with the model pulled (e.g. `ollama pull mistral-nemo`)
llm = ChatOllama(model="mistral-nemo:latest")

def predict(message, history):
    chat_history = []
    for human, ai in history:
        chat_history.append(HumanMessage(content=human))
        chat_history.append(AIMessage(content=ai))
    chat_history.append(HumanMessage(content=message))
    llm_response = llm.invoke(chat_history)
    return llm_response.content
gr.ChatInterface(predict).launch(share=True)

# Dependencies (requirements-style list)
duckdb
e2b_code_interpreter
gradio
google-search-results
huggingface-hub
langchain
langchain-community
langchain-core
langchain-experimental
langchain-huggingface
langchain-openai
langchain-text-splitters
langchainhub
numexpr
pandas
sentencepiece
text-generation
transformers

# Standalone SmolLM 4-bit inference script
# pip install transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import re
checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Set compute dtype to float16
)
def remove_first_token(input_text):
    # Regular expression pattern to match the first occurrence of the desired token
    pattern = r'<\|im_start\|>user\n(.*?)<\|im_end\|>'
    # Substitute the first occurrence of the token with just the inner text
    cleaned_text = re.sub(pattern, r'\1', input_text, count=1)
    return cleaned_text.strip()  # Strip any leading or trailing whitespace
device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=quantization_config, low_cpu_mem_usage=True, device_map="auto")

if __name__ == "__main__":
    while True:
        user_query = input(">>>")
        if user_query == "exit":
            break
        messages = [{"role": "user", "content": user_query}]
        input_text = tokenizer.apply_chat_template(messages, tokenize=False)
        print(input_text)
        inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
        outputs = model.generate(inputs, max_new_tokens=2000, temperature=0.35, top_p=0.92, do_sample=True)
        print(remove_first_token(tokenizer.decode(outputs[0])).replace("<|im_end|>", "").replace("<|im_start|>", ""))