sohang3112 · August 22, 2024 05:53
diff --git a/azure_langchain.ipynb b/azure_langchain.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://learn.deeplearning.ai/courses/building-your-own-database-agent/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "from langchain.schema import HumanMessage\n",
    "from langchain_openai import AzureChatOpenAI\n",
    "from langchain.agents.agent_types import AgentType\n",
    "from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent\n",
    "from langchain.agents import create_sql_agent\n",
    "from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n",
    "from langchain.sql_database import SQLDatabase\n",
    "\n",
    "# initialize model, requires Azure key - not shown here, in the tutorial Azure is alreay setup\n",
    "model = AzureChatOpenAI(\n",
    "    openai_api_version=\"2023-05-15\",\n",
    "    azure_deployment=\"gpt-4-1106\",\n",
    "    temperature=0, \n",
    "    max_tokens=500\n",
    "\n",
    "    # its value in tutorial is: http://jupyter-api-proxy.internal.dlai/rev-proxy/microsoft_azure_openai\n",
    "    # the above url is only accessible from within the tutorial notebook (presumably since it's connected to Azure already)\n",
    "    azure_endpoint=os.getenv(\"AZURE_OPENAI_ENDPOINT\"),   \n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Analyze Data from CSV Spreadsheet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Asking langchain to analyze this data (2K rows): https://covidtracking.com/data/download/all-states-history.csv\n",
    "df = pd.read_csv(\"./data/all-states-history.csv\").fillna(value = 0)\n",
    "\n",
    "agent = create_pandas_dataframe_agent(\n",
    "    llm=model,\n",
    "    df=df,     # input data\n",
    "    verbose=True       # makes langchain print its \"thoughts\"\n",
    ")\n",
    "prompt = \"PROMPT HERE - ask questions about the data\"\n",
    "ans = agent.invoke(prompt)\n",
    "# ans[\"input\"] contains original prompt\n",
    "print(ans[\"output\"])      # model final response string"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Analyze Data from Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db = SQLDatabase.from_uri(f'sqlite:///{database_file_path}')     # change SQL url here\n",
    "toolkit = SQLDatabaseToolkit(db=db, llm=model)\n",
    "\n",
    "MSSQL_AGENT_PREFIX = \"\"\"\n",
    "\n",
    "You are an agent designed to interact with a SQL database.\n",
    "## Instructions:\n",
    "- Given an input question, create a syntactically correct {dialect} query\n",
    "to run, then look at the results of the query and return the answer.\n",
    "- Unless the user specifies a specific number of examples they wish to\n",
    "obtain, **ALWAYS** limit your query to at most {top_k} results.\n",
    "- You can order the results by a relevant column to return the most\n",
    "interesting examples in the database.\n",
    "- Never query for all the columns from a specific table, only ask for\n",
    "the relevant columns given the question.\n",
    "- You have access to tools for interacting with the database.\n",
    "- You MUST double check your query before executing it.If you get an error\n",
    "while executing a query,rewrite the query and try again.\n",
    "- DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.)\n",
    "to the database.\n",
    "- DO NOT MAKE UP AN ANSWER OR USE PRIOR KNOWLEDGE, ONLY USE THE RESULTS\n",
    "OF THE CALCULATIONS YOU HAVE DONE.\n",
    "- Your response should be in Markdown. However, **when running  a SQL Query\n",
    "in \"Action Input\", do not include the markdown backticks**.\n",
    "Those are only for formatting the response, not for executing the command.\n",
    "- ALWAYS, as part of your final answer, explain how you got to the answer\n",
    "on a section that starts with: \"Explanation:\". Include the SQL query as\n",
    "part of the explanation section.\n",
    "- If the question does not seem related to the database, just return\n",
    "\"I don\\'t know\" as the answer.\n",
    "- Only use the below tools. Only use the information returned by the\n",
    "below tools to construct your query and final answer.\n",
    "- Do not make up table names, only use the tables returned by any of the\n",
    "tools below.\n",
    "\n",
    "## Tools:\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "MSSQL_AGENT_FORMAT_INSTRUCTIONS = \"\"\"\n",
    "\n",
    "## Use the following format:\n",
    "\n",
    "Question: the input question you must answer.\n",
    "Thought: you should always think about what to do.\n",
    "Action: the action to take, should be one of [{tool_names}].\n",
    "Action Input: the input to the action.\n",
    "Observation: the result of the action.\n",
    "... (this Thought/Action/Action Input/Observation can repeat N times)\n",
    "Thought: I now know the final answer.\n",
    "Final Answer: the final answer to the original input question.\n",
    "\n",
    "Example of Final Answer:\n",
    "<=== Beginning of example\n",
    "\n",
    "Action: query_sql_db\n",
    "Action Input: \n",
    "SELECT TOP (10) [death]\n",
    "FROM covidtracking \n",
    "WHERE state = 'TX' AND date LIKE '2020%'\n",
    "\n",
    "Observation:\n",
    "[(27437.0,), (27088.0,), (26762.0,), (26521.0,), (26472.0,), (26421.0,), (26408.0,)]\n",
    "Thought:I now know the final answer\n",
    "Final Answer: There were 27437 people who died of covid in Texas in 2020.\n",
    "\n",
    "Explanation:\n",
    "I queried the `covidtracking` table for the `death` column where the state\n",
    "is 'TX' and the date starts with '2020'. The query returned a list of tuples\n",
    "with the number of deaths for each day in 2020. To answer the question,\n",
    "I took the sum of all the deaths in the list, which is 27437.\n",
    "I used the following query\n",
    "\n",
    "```sql\n",
    "SELECT [death] FROM covidtracking WHERE state = 'TX' AND date LIKE '2020%'\"\n",
    "```\n",
    "===> End of Example\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "agent_executor_SQL = create_sql_agent(\n",
    "    prefix=MSSQL_AGENT_PREFIX,   # similar to system prompt - instructions for how to generate SQL query (will be executed by langchain)\n",
    "    format_instructions = MSSQL_AGENT_FORMAT_INSTRUCTIONS,   # instructions + examples for how to create \"thoughts\" and final answer\n",
    "    llm=model,\n",
    "    toolkit=toolkit,\n",
    "    top_k=30,      # TODO: check what this does\n",
    "    verbose=True\n",
    ")\n",
    "\n",
    "QUESTION = \"\"\"How may patients were hospitalized during October 2020\n",
    "in New York, and nationwide as the total of all states?\n",
    "Use the hospitalizedIncrease column\n",
    "\"\"\"\n",
    "agent_executor_SQL.invoke(QUESTION)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function Calling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from openai import AzureOpenAI\n",
    "\n",
    "def FUNCTION_NAME(location, unit):\n",
    "    return {\"location\": location, \"unit\": unit}\n",
    "\n",
    "messages = [\n",
    "    {\"role\": \"user\",\n",
    "     \"content\": \"PROMPT HERE\"\n",
    "    }\n",
    "]\n",
    "\n",
    "# providing functions to GPT, which it can call - arguments to be provided by GPT\n",
    "tools = [\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"FUNCTION_NAME\",\n",
    "            \"description\": \"FUNCTION_DESCRIPTION\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",           # Python Dict\n",
    "                \"properties\": {\n",
    "                    \"location\": {\n",
    "                        \"type\": \"string\",\n",
    "                        \"description\": \"DICT KEY DESCRIPTION\",\n",
    "                    },\n",
    "                    \"unit\": {\n",
    "                        \"type\": \"string\",\n",
    "                        \"default\":\"fahrenheit\",\n",
    "                        \"enum\": [ \"fahrenheit\", \"celsius\"],\n",
    "                        \"description\": \"DICT KEY DESCRIPTION\"\n",
    "                    },\n",
    "                },\n",
    "                \"required\": [\"location\"],\n",
    "            },\n",
    "        },\n",
    "    }\n",
    "]\n",
    "\n",
    "client = AzureOpenAI(\n",
    "  azure_endpoint = os.getenv(\"AZURE_OPENAI_ENDPOINT\"),\n",
    "  api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"),\n",
    "  api_version=\"2023-05-15\"\n",
    ")\n",
    "response = client.chat.completions.create(\n",
    "    model=\"gpt-4-1106\",\n",
    "    messages=messages,\n",
    "    tools=tools,\n",
    "    tool_choice=\"auto\", \n",
    ")\n",
    "\n",
    "response_message = response.choices[0].message\n",
    "tool_calls = response_message.tool_calls\n",
    "# tool calls is a list of functions with arguments output by GPT for calling\n",
    "functions = {\n",
    "    'FUNCTION_NAME': FUNCTION_NAME\n",
    "}\n",
    "for tool in tool_calls:\n",
    "    print('GPT called this function:', tool.function.name, tool.function.arguments)\n",
    "    kwargs = json.loads(tool.function.arguments)\n",
    "    response = functions[tool.function.name](**kwargs)\n",
    "    print(response)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"https://learn.deeplearning.ai/courses/building-your-own-database-agent/"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import pandas as pd\n",
	"\n",
	"from langchain.schema import HumanMessage\n",
	"from langchain_openai import AzureChatOpenAI\n",
	"from langchain.agents.agent_types import AgentType\n",
	"from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent\n",
	"from langchain.agents import create_sql_agent\n",
	"from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n",
	"from langchain.sql_database import SQLDatabase\n",
	"\n",
	"# initialize model, requires Azure key - not shown here, in the tutorial Azure is alreay setup\n",
	"model = AzureChatOpenAI(\n",
	" openai_api_version=\"2023-05-15\",\n",
	" azure_deployment=\"gpt-4-1106\",\n",
	" temperature=0, \n",
	" max_tokens=500\n",
	"\n",
	" # its value in tutorial is: http://jupyter-api-proxy.internal.dlai/rev-proxy/microsoft_azure_openai\n",
	" # the above url is only accessible from within the tutorial notebook (presumably since it's connected to Azure already)\n",
	" azure_endpoint=os.getenv(\"AZURE_OPENAI_ENDPOINT\"), \n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Analyze Data from CSV Spreadsheet"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Asking langchain to analyze this data (2K rows): https://covidtracking.com/data/download/all-states-history.csv\n",
	"df = pd.read_csv(\"./data/all-states-history.csv\").fillna(value = 0)\n",
	"\n",
	"agent = create_pandas_dataframe_agent(\n",
	" llm=model,\n",
	" df=df, # input data\n",
	" verbose=True # makes langchain print its \"thoughts\"\n",
	")\n",
	"prompt = \"PROMPT HERE - ask questions about the data\"\n",
	"ans = agent.invoke(prompt)\n",
	"# ans[\"input\"] contains original prompt\n",
	"print(ans[\"output\"]) # model final response string"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Analyze Data from Database"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"db = SQLDatabase.from_uri(f'sqlite:///{database_file_path}') # change SQL url here\n",
	"toolkit = SQLDatabaseToolkit(db=db, llm=model)\n",
	"\n",
	"MSSQL_AGENT_PREFIX = \"\"\"\n",
	"\n",
	"You are an agent designed to interact with a SQL database.\n",
	"## Instructions:\n",
	"- Given an input question, create a syntactically correct {dialect} query\n",
	"to run, then look at the results of the query and return the answer.\n",
	"- Unless the user specifies a specific number of examples they wish to\n",
	"obtain, ALWAYS limit your query to at most {top_k} results.\n",
	"- You can order the results by a relevant column to return the most\n",
	"interesting examples in the database.\n",
	"- Never query for all the columns from a specific table, only ask for\n",
	"the relevant columns given the question.\n",
	"- You have access to tools for interacting with the database.\n",
	"- You MUST double check your query before executing it.If you get an error\n",
	"while executing a query,rewrite the query and try again.\n",
	"- DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.)\n",
	"to the database.\n",
	"- DO NOT MAKE UP AN ANSWER OR USE PRIOR KNOWLEDGE, ONLY USE THE RESULTS\n",
	"OF THE CALCULATIONS YOU HAVE DONE.\n",
	"- Your response should be in Markdown. However, **when running a SQL Query\n",
	"in \"Action Input\", do not include the markdown backticks**.\n",
	"Those are only for formatting the response, not for executing the command.\n",
	"- ALWAYS, as part of your final answer, explain how you got to the answer\n",
	"on a section that starts with: \"Explanation:\". Include the SQL query as\n",
	"part of the explanation section.\n",
	"- If the question does not seem related to the database, just return\n",
	"\"I don\\'t know\" as the answer.\n",
	"- Only use the below tools. Only use the information returned by the\n",
	"below tools to construct your query and final answer.\n",
	"- Do not make up table names, only use the tables returned by any of the\n",
	"tools below.\n",
	"\n",
	"## Tools:\n",
	"\n",
	"\"\"\"\n",
	"\n",
	"MSSQL_AGENT_FORMAT_INSTRUCTIONS = \"\"\"\n",
	"\n",
	"## Use the following format:\n",
	"\n",
	"Question: the input question you must answer.\n",
	"Thought: you should always think about what to do.\n",
	"Action: the action to take, should be one of [{tool_names}].\n",
	"Action Input: the input to the action.\n",
	"Observation: the result of the action.\n",
	"... (this Thought/Action/Action Input/Observation can repeat N times)\n",
	"Thought: I now know the final answer.\n",
	"Final Answer: the final answer to the original input question.\n",
	"\n",
	"Example of Final Answer:\n",
	"<=== Beginning of example\n",
	"\n",
	"Action: query_sql_db\n",
	"Action Input: \n",
	"SELECT TOP (10) [death]\n",
	"FROM covidtracking \n",
	"WHERE state = 'TX' AND date LIKE '2020%'\n",
	"\n",
	"Observation:\n",
	"[(27437.0,), (27088.0,), (26762.0,), (26521.0,), (26472.0,), (26421.0,), (26408.0,)]\n",
	"Thought:I now know the final answer\n",
	"Final Answer: There were 27437 people who died of covid in Texas in 2020.\n",
	"\n",
	"Explanation:\n",
	"I queried the `covidtracking` table for the `death` column where the state\n",
	"is 'TX' and the date starts with '2020'. The query returned a list of tuples\n",
	"with the number of deaths for each day in 2020. To answer the question,\n",
	"I took the sum of all the deaths in the list, which is 27437.\n",
	"I used the following query\n",
	"\n",
	"```sql\n",
	"SELECT [death] FROM covidtracking WHERE state = 'TX' AND date LIKE '2020%'\"\n",
	"```\n",
	"===> End of Example\n",
	"\n",
	"\"\"\"\n",
	"\n",
	"agent_executor_SQL = create_sql_agent(\n",
	" prefix=MSSQL_AGENT_PREFIX, # similar to system prompt - instructions for how to generate SQL query (will be executed by langchain)\n",
	" format_instructions = MSSQL_AGENT_FORMAT_INSTRUCTIONS, # instructions + examples for how to create \"thoughts\" and final answer\n",
	" llm=model,\n",
	" toolkit=toolkit,\n",
	" top_k=30, # TODO: check what this does\n",
	" verbose=True\n",
	")\n",
	"\n",
	"QUESTION = \"\"\"How may patients were hospitalized during October 2020\n",
	"in New York, and nationwide as the total of all states?\n",
	"Use the hospitalizedIncrease column\n",
	"\"\"\"\n",
	"agent_executor_SQL.invoke(QUESTION)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Function Calling"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import json\n",
	"from openai import AzureOpenAI\n",
	"\n",
	"def FUNCTION_NAME(location, unit):\n",
	" return {\"location\": location, \"unit\": unit}\n",
	"\n",
	"messages = [\n",
	" {\"role\": \"user\",\n",
	" \"content\": \"PROMPT HERE\"\n",
	" }\n",
	"]\n",
	"\n",
	"# providing functions to GPT, which it can call - arguments to be provided by GPT\n",
	"tools = [\n",
	" {\n",
	" \"type\": \"function\",\n",
	" \"function\": {\n",
	" \"name\": \"FUNCTION_NAME\",\n",
	" \"description\": \"FUNCTION_DESCRIPTION\",\n",
	" \"parameters\": {\n",
	" \"type\": \"object\", # Python Dict\n",
	" \"properties\": {\n",
	" \"location\": {\n",
	" \"type\": \"string\",\n",
	" \"description\": \"DICT KEY DESCRIPTION\",\n",
	" },\n",
	" \"unit\": {\n",
	" \"type\": \"string\",\n",
	" \"default\":\"fahrenheit\",\n",
	" \"enum\": [ \"fahrenheit\", \"celsius\"],\n",
	" \"description\": \"DICT KEY DESCRIPTION\"\n",
	" },\n",
	" },\n",
	" \"required\": [\"location\"],\n",
	" },\n",
	" },\n",
	" }\n",
	"]\n",
	"\n",
	"client = AzureOpenAI(\n",
	" azure_endpoint = os.getenv(\"AZURE_OPENAI_ENDPOINT\"),\n",
	" api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"),\n",
	" api_version=\"2023-05-15\"\n",
	")\n",
	"response = client.chat.completions.create(\n",
	" model=\"gpt-4-1106\",\n",
	" messages=messages,\n",
	" tools=tools,\n",
	" tool_choice=\"auto\", \n",
	")\n",
	"\n",
	"response_message = response.choices[0].message\n",
	"tool_calls = response_message.tool_calls\n",
	"# tool calls is a list of functions with arguments output by GPT for calling\n",
	"functions = {\n",
	" 'FUNCTION_NAME': FUNCTION_NAME\n",
	"}\n",
	"for tool in tool_calls:\n",
	" print('GPT called this function:', tool.function.name, tool.function.arguments)\n",
	" kwargs = json.loads(tool.function.arguments)\n",
	" response = functions[tool.function.name](**kwargs)\n",
	" print(response)"
	]
	}
	],
	"metadata": {
	"language_info": {
	"name": "python"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}