Created
October 15, 2024 02:55
-
-
Save manisnesan/722d11258ef62e9c34886a5dad4aae99 to your computer and use it in GitHub Desktop.
Replication of the code for the article https://www.timescale.com/blog/rag-is-more-than-just-vector-search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "91230e03-ff94-44f6-b44e-0bd3ea6f656a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from datasets import load_dataset\n", | |
"from datetime import datetime\n", | |
"from openai import AsyncOpenAI, OpenAI\n", | |
"from asyncio import run, Semaphore\n", | |
"from tqdm.asyncio import tqdm_asyncio as asyncio\n", | |
"from textwrap import dedent\n", | |
"from instructor import from_openai\n", | |
"from jinja2 import Template\n", | |
"import os\n", | |
"from pgvector.asyncpg import register_vector\n", | |
"import asyncpg\n", | |
"import json\n", | |
"from dotenv import load_dotenv\n", | |
"from pydantic import BaseModel\n", | |
"from typing import Literal, Any, Optional" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "6855daa8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from IPython.display import Markdown\n", | |
"from rich import print as rprint\n", | |
"from rich import inspect as rinspect" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "dd98d5a4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"load_dotenv(dotenv_path='../.env', override=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e62adbfb", | |
"metadata": {}, | |
"source": [ | |
"# Extraction and Ingestion" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "58913c67", | |
"metadata": {}, | |
"source": [ | |
"## Data Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "8c5fb6fa-e2ea-4eda-aa29-ffe2090bebc6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pydantic import BaseModel\n", | |
"\n", | |
"class ClassifiedSummary(BaseModel):\n", | |
" chain_of_thought: str\n", | |
" label: Literal['OPEN', 'CLOSED']\n", | |
" summary: str" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "d04533fb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class ProcessedIssue(BaseModel):\n", | |
" issue_id: int\n", | |
" text: str\n", | |
" label: Literal['OPEN', 'CLOSED']\n", | |
" repo_name: str\n", | |
" embedding: Optional[list[float]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "420cde3c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class GithubIssue(BaseModel):\n", | |
" issue_id: int\n", | |
" metadata: dict[str, Any]\n", | |
" text: str\n", | |
" repo_name: str\n", | |
" start_ts: datetime\n", | |
" end_ts: Optional[datetime]\n", | |
" embedding: Optional[list[float]]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c08f9d8f", | |
"metadata": {}, | |
"source": [ | |
"## Using generators and datasets " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "48dbf21e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"repos = ['rust-lang/rust', 'kubernetes/kubernetes', 'apache/spark', 'golang/go', 'tensorflow/tensorflow', 'MicrosoftDocs/azure-docs', 'pytorch/pytorch', 'Microsoft/TypeScript', 'python/cpython', 'facebook/react', 'django/django', 'rails/rails', 'bitcoin/bitcoin', 'nodejs/node', 'ocaml/opam-repository', 'apache/airflow', 'scipy/scipy', 'vercel/next.js']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "de2edc7e", | |
"metadata": {}, | |
"source": [ | |
"The dataset used below is gated on the Hugging Face Hub: accept its terms on the dataset page, and make sure your access token has the \"Read access to contents of all public gated repos you can access\" permission." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "161214c1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from datasets import load_dataset\n", | |
"\n", | |
"gh_issues = 'bigcode/the-stack-github-issues'\n", | |
"n = 10\n", | |
"dataset = load_dataset(gh_issues, split='train', streaming=True).filter(lambda x: x['repo'] in repos).take(n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "45ac3fae", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"IterableDataset({\n", | |
" features: Unknown,\n", | |
" n_shards: 127\n", | |
"})" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "ea955b16", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'repo': 'rails/rails',\n", | |
" 'issue_id': 3362540,\n", | |
" 'issue_number': 5145,\n", | |
" 'pull_request': None,\n", | |
" 'events': [{'action': 'opened',\n", | |
" 'author': 'batter',\n", | |
" 'comment_id': None,\n", | |
" 'datetime': '2012-02-23T20:34:32Z',\n", | |
" 'masked_author': 'username_0',\n", | |
" 'text': \"When you create a brand new application using `rails 3.2.x`, the default configuration for the asset pipeline in the `development` environment, as set in `config/environments/development.rb` is as follows:\\n\\n # Do not compress assets\\n config.assets.compress = false\\n \\n # Expands the lines which load the assets\\n config.assets.debug = true\\n\\nThis is described in the Asset Pipeline documentation [as shown here](http://guides.rubyonrails.org/asset_pipeline.html#upgrading-from-old-versions-of-rails). Unfortunately, this setup doesn't actually work out of the box. If you generate a brand new application with a basic controller serving up a page with plain text, and then inspect the code, you'll notice that it does indeed acknowledge the `config.assets.debug = true` option, as multiple javascript files will be included (`jquery.js` and `jquery_ujs.js` are provided by default). \\n\\nHowever, if you inspect what's inside of the `application.js` file that is being served up, you will notice that it is actually a compressed combination of all the others. This must mean that `config.assets.compress = false` is being ignored and treated as `true` no matter what.\\n\\nYou'll also notice that if you inspect the javascript console in a browser like chrome, it causes a javascript error to be thrown with this message: `Uncaught TypeError: undefined is not a function (application.js:32)`. Worse yet, all jQuery scripts cease to function properly if you try to run with it like this. I believe the error is thrown as a result of jQuery being included twice, but it's difficult to tell for certain since the `application.js` file is compressed.\\n\\nNote that the application will still function fine in `production` mode, but it doesn't work properly in `development` mode. Now there are two possible workarounds; the first is to set `config.assets.debug = false`, which will prevent the individual javascript files from being served up. 
This defeats the purpose of the development mode configuration, as the application will essentially work as though it's being run in production mode (with just one giant compressed `application.js` file). The second, more functional workaround, [as found here](http://stackoverflow.com/questions/8356251/rails-3-1-assets-strange-serving-in-development) is to add `config.serve_static_assets = false` to your `config/environments/development.rb` file. This will only work if you run `rake assets:precompile` at least once.\\n\\nThe fact remains that this is a bug, and that unless this is addressed, the `assets.compress` option is essentially useless without a hack workaround.\\n\\nHere's a link to [a simple app I created which demonstrates this bug](https://github.com/fullbridge-batkins/rails32_asset_problems).\\n\\nI am not the first to acknowledge this. It has been mentioned in [this issue report](https://github.com/rails/rails/issues/5095), as well as [this post on StackOverflow](http://stackoverflow.com/questions/8356251/rails-3-1-assets-strange-serving-in-development). Pardon me for re-posting this but I feel the other issue is mislabeled.\",\n", | |
" 'title': 'Asset Pipeline ignores assets.compress config option',\n", | |
" 'type': 'issue'},\n", | |
" {'action': 'created',\n", | |
" 'author': 'JonyGreen3',\n", | |
" 'comment_id': 139855557,\n", | |
" 'datetime': '2015-09-13 09:19:58+00:00',\n", | |
" 'masked_author': 'username_1',\n", | |
" 'text': 'i find a free online service to <a href=\"http://www.online-code.net/minify-js.html\">compress js</a> and <a href=\"http://www.online-code.net/minify-css.html\">minify css</a>, so it will reduce the size of web page.',\n", | |
" 'title': None,\n", | |
" 'type': 'comment'}],\n", | |
" 'text_size': 3308,\n", | |
" 'content': '<issue_start><issue_comment>Title: Asset Pipeline ignores assets.compress config option\\nusername_0: When you create a brand new application using `rails 3.2.x`, the default configuration for the asset pipeline in the `development` environment, as set in `config/environments/development.rb` is as follows:\\n\\n # Do not compress assets\\n config.assets.compress = false\\n \\n # Expands the lines which load the assets\\n config.assets.debug = true\\n\\nThis is described in the Asset Pipeline documentation [as shown here](http://guides.rubyonrails.org/asset_pipeline.html#upgrading-from-old-versions-of-rails). Unfortunately, this setup doesn\\'t actually work out of the box. If you generate a brand new application with a basic controller serving up a page with plain text, and then inspect the code, you\\'ll notice that it does indeed acknowledge the `config.assets.debug = true` option, as multiple javascript files will be included (`jquery.js` and `jquery_ujs.js` are provided by default). \\n\\nHowever, if you inspect what\\'s inside of the `application.js` file that is being served up, you will notice that it is actually a compressed combination of all the others. This must mean that `config.assets.compress = false` is being ignored and treated as `true` no matter what.\\n\\nYou\\'ll also notice that if you inspect the javascript console in a browser like chrome, it causes a javascript error to be thrown with this message: `Uncaught TypeError: undefined is not a function (application.js:32)`. Worse yet, all jQuery scripts cease to function properly if you try to run with it like this. I believe the error is thrown as a result of jQuery being included twice, but it\\'s difficult to tell for certain since the `application.js` file is compressed.\\n\\nNote that the application will still function fine in `production` mode, but it doesn\\'t work properly in `development` mode. 
Now there are two possible workarounds; the first is to set `config.assets.debug = false`, which will prevent the individual javascript files from being served up. This defeats the purpose of the development mode configuration, as the application will essentially work as though it\\'s being run in production mode (with just one giant compressed `application.js` file). The second, more functional workaround, [as found here](http://stackoverflow.com/questions/8356251/rails-3-1-assets-strange-serving-in-development) is to add `config.serve_static_assets = false` to your `config/environments/development.rb` file. This will only work if you run `rake assets:precompile` at least once.\\n\\nThe fact remains that this is a bug, and that unless this is addressed, the `assets.compress` option is essentially useless without a hack workaround.\\n\\nHere\\'s a link to [a simple app I created which demonstrates this bug](https://github.com/fullbridge-batkins/rails32_asset_problems).\\n\\nI am not the first to acknowledge this. It has been mentioned in [this issue report](https://github.com/rails/rails/issues/5095), as well as [this post on StackOverflow](http://stackoverflow.com/questions/8356251/rails-3-1-assets-strange-serving-in-development). Pardon me for re-posting this but I feel the other issue is mislabeled.\\n<issue_comment>username_1: i find a free online service to <a href=\"http://www.online-code.net/minify-js.html\">compress js</a> and <a href=\"http://www.online-code.net/minify-css.html\">minify css</a>, so it will reduce the size of web page.',\n", | |
" 'usernames': '[\"batter\", \"JonyGreen3\"]'}" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"list(dataset)[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "4ffcb53b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"issues = []\n", | |
"for row in dataset:\n", | |
" start_time, end_time = None, None\n", | |
" for event in row['events']:\n", | |
" event_type = event['action']\n", | |
" timestamp = event['datetime']\n", | |
" timestamp = timestamp.replace(\"Z\", \"+00:00\")\n", | |
"\n", | |
" if event_type == \"opened\": start_time = datetime.fromisoformat(timestamp)\n", | |
" elif event_type == 'closed': end_time = datetime.fromisoformat(timestamp)\n", | |
" elif event_type == \"created\" and not start_time: start_time = datetime.fromisoformat(timestamp) \n", | |
" elif event_type == \"reopened\" and not start_time: start_time = datetime.fromisoformat(timestamp)\n", | |
"\n", | |
" issues.append(GithubIssue(issue_id=row['issue_id'], metadata={}, text=row['content'], repo_name=row['repo'], start_ts=start_time, end_ts=end_time, embedding=None))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 144, | |
"id": "034b1763", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# issues[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "78cea959", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_issues(n: int, repos: list[str]):\n", | |
" dataset = load_dataset(gh_issues, split='train', streaming=True).filter(lambda x: x['repo'] in repos).take(n)\n", | |
"\n", | |
" for row in dataset:\n", | |
" start_time, end_time = None, None\n", | |
" for event in row['events']:\n", | |
" event_type = event['action']\n", | |
" timestamp = event['datetime']\n", | |
" timestamp = timestamp.replace(\"Z\", \"+00:00\")\n", | |
"\n", | |
" if event_type == \"opened\": start_time = datetime.fromisoformat(timestamp)\n", | |
" elif event_type == 'closed': end_time = datetime.fromisoformat(timestamp)\n", | |
" elif event_type == \"created\" and not start_time: start_time = datetime.fromisoformat(timestamp) \n", | |
" elif event_type == \"reopened\" and not start_time: start_time = datetime.fromisoformat(timestamp)\n", | |
"\n", | |
" yield GithubIssue(issue_id=row['issue_id'], metadata={}, text=row['content'], repo_name=row['repo'], start_ts=start_time, end_ts=end_time, embedding=None)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ebc2bcac", | |
"metadata": {}, | |
"source": [ | |
"## Data processing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"id": "f25d4579", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<instructor.client.Instructor at 0x7f01f010aa90>" | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"client = from_openai(OpenAI()); client" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"id": "3c24b9fb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"sys_prompt = \"You are a helpful assistant that classifies and summarizes GitHub issues. When summarizing the issues, make sure to expand on specific accronyms and add additional explanation where necessary.\"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"id": "ccbb8b9e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/markdown": [ | |
"You are a helpful assistant that classifies and summarizes GitHub issues. When summarizing the issues, make sure to expand on specific accronyms and add additional explanation where necessary." | |
], | |
"text/plain": [ | |
"<IPython.core.display.Markdown object>" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Markdown(sys_prompt)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"id": "c1391b1e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"GithubIssue(issue_id=135787405, metadata={}, text='<issue_start><issue_comment>Title: stats standardized distributions - request for expected API\\nusername_0: I\\'m a mathematician trying to plot several distributions (notably in this case Gamma and truncated Normal) in order to do some basic notebook-style exploration for different parameter choices. I first started with truncated normal, and after over half an hour, I still could not get scipy.stats.truncnorm to plot a simple normal distribution with mean = 18, standard distribution = 1.5, but truncated at 12 and 24. This includes after trying to follow the formula in the documentation for a and b - the behavior was always surprising, typically cutting the distribution in half and throwing it\\'s support way beyond 24. I finally settled for just plotting a regular normal distribution when I couldn\\'t get it to work.\\r\\n\\r\\nNow I\\'m messing around with the gamma distribution, and the API is again surprising. I would suggest that in the spirit of \"batteries included\", the user shouldn\\'t have to do the sort of computations described in http://docs.scipy.org/doc/scipy/reference/tutorial/stats.html, shape parameter section, just to use the gamma distribution with its two typical parameters (k and theta, or alpha and beta, it doesn\\'t matter - they are easy to translate between) as described on wikipedia https://en.wikipedia.org/wiki/Gamma_distribution. \\r\\n\\r\\nMy request is that these standardized functions which lack a typical, full parameterization expose some kind of API allowing the user to specify all their typical parameters directly and get back a distribution with no further work required.\\n<issue_comment>username_0: I believe I mis-spoke on the case of the Gamma distribution - the scale parameter is equivalent to beta. 
Just a small edition to the doc string making this parameterization choice clear would be helpful.\\n<issue_comment>username_1: Cross-ref https://github.com/scipy/scipy/issues/4538.\\r\\nPRs welcome I guess.\\n<issue_comment>username_0: Ah, thanks! I searched for the issue figuring I couldn\\'t possibly be the only one having trouble with this, but didn\\'t find it with the key words I was using.\\n<issue_comment>username_2: I put link to paramnormal on 4538\\r\\n\\r\\nAll distribution are in the standard location scale form which can be a bit painful for support bounds that are not inf or zero.\\n<issue_comment>username_1: OK, I\\'m going to close this as a duplicate of gh-4538. @username_0 this is not to say we do not value your report, this is only to say that one issue is enough to track progress on this. Contributions welcome!<issue_closed>\\n<issue_comment>username_0: No problem - I understand. Thanks!!', repo_name='scipy/scipy', start_ts=datetime.datetime(2016, 2, 23, 16, 31, 15, tzinfo=datetime.timezone.utc), end_ts=datetime.datetime(2016, 2, 23, 20, 14, 35, tzinfo=datetime.timezone.utc), embedding=None)" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"issues[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"id": "cef29028", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/markdown": [ | |
"\n", | |
"Repo Name: scipy/scipy\n", | |
"Issue Text: <issue_start><issue_comment>Title: stats standardized distributions - request for expected API\n", | |
"username_0: I'm a mathematician trying to plot several distributions (notably in this case Gamma and truncated Normal) in order to do some basic notebook-style exploration for different parameter choices. I first started with truncated normal, and after over half an hour, I still could not get scipy.stats.truncnorm to plot a simple normal distribution with mean = 18, standard distribution = 1.5, but truncated at 12 and 24. This includes after trying to follow the formula in the documentation for a and b - the behavior was always surprising, typically cutting the distribution in half and throwing it's support way beyond 24. I finally settled for just plotting a regular normal distribution when I couldn't get it to work.\r\n", | |
"\r\n", | |
"Now I'm messing around with the gamma distribution, and the API is again surprising. I would suggest that in the spirit of \"batteries included\", the user shouldn't have to do the sort of computations described in http://docs.scipy.org/doc/scipy/reference/tutorial/stats.html, shape parameter section, just to use the gamma distribution with its two typical parameters (k and theta, or alpha and beta, it doesn't matter - they are easy to translate between) as described on wikipedia https://en.wikipedia.org/wiki/Gamma_distribution. \r\n", | |
"\r\n", | |
"My request is that these standardized functions which lack a typical, full parameterization expose some kind of API allowing the user to specify all their typical parameters directly and get back a distribution with no further work required.\n", | |
"<issue_comment>username_0: I believe I mis-spoke on the case of the Gamma distribution - the scale parameter is equivalent to beta. Just a small edition to the doc string making this parameterization choice clear would be helpful.\n", | |
"<issue_comment>username_1: Cross-ref https://github.com/scipy/scipy/issues/4538.\r\n", | |
"PRs welcome I guess.\n", | |
"<issue_comment>username_0: Ah, thanks! I searched for the issue figuring I couldn't possibly be the only one having trouble with this, but didn't find it with the key words I was using.\n", | |
"<issue_comment>username_2: I put link to paramnormal on 4538\r\n", | |
"\r\n", | |
"All distribution are in the standard location scale form which can be a bit painful for support bounds that are not inf or zero.\n", | |
"<issue_comment>username_1: OK, I'm going to close this as a duplicate of gh-4538. @username_0 this is not to say we do not value your report, this is only to say that one issue is enough to track progress on this. Contributions welcome!<issue_closed>\n", | |
"<issue_comment>username_0: No problem - I understand. Thanks!!" | |
], | |
"text/plain": [ | |
"<IPython.core.display.Markdown object>" | |
] | |
}, | |
"execution_count": 64, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"usr = Template(\n", | |
" dedent(\n", | |
" \"\"\"\n", | |
" Repo Name: {{ repo_name }}\n", | |
" Issue Text: {{ issue_text}}\n", | |
" \"\"\"\n", | |
" )\n", | |
").render(repo_name=issues[0].repo_name, issue_text=issues[0].text)\n", | |
"Markdown(usr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "ee11f903", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"client = from_openai(OpenAI())\n", | |
"classification = client.chat.completions.create(\n", | |
" response_model=ClassifiedSummary,\n", | |
" messages=[\n", | |
" {\n", | |
" \"role\": \"system\",\n", | |
" \"content\": \"You are a helpful assistant that classifies and summarizes GitHub issues. When summarizing the issues, make sure to expand on specific accronyms and add additional explanation where necessary.\",\n", | |
" },\n", | |
" {\n", | |
" \"role\": \"user\",\n", | |
" \"content\": Template(\n", | |
" dedent(\n", | |
" \"\"\"\n", | |
" Repo Name: {{ repo_name }}\n", | |
" Issue Text: {{ issue_text}}\n", | |
" \"\"\"\n", | |
" )\n", | |
" ).render(repo_name=issues[0].repo_name, issue_text=issues[0].text),\n", | |
" },\n", | |
" ],\n", | |
" model=\"gpt-4o-mini\",\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "bc89bc54", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">ClassifiedSummary</span><span style=\"font-weight: bold\">(</span>\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">chain_of_thought</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'The user, identifying as a mathematician, expressed difficulty in plotting distributions </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">using the SciPy library, specifically the truncated normal and gamma distributions. They found the API </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">non-intuitive, requiring extra computations beyond what they believed was necessary for standard parameters, </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">leading to frustration in both cases. The user requested the implementation of a more direct API allowing for all </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">typical parameters to be specified without additional calculations. They later recognized a small error regarding </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">the parameterization of the gamma distribution but reiterated the need for clearer documentation. The conversation </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">indicated that another issue had already been created addressing similar concerns, leading to the current issue </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">being marked as a duplicate. Overall, the main focus was on improving usability and clarity in the SciPy stats </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">module for standardized distributions, with suggestions for better documentation and direct parameter exposure to </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">simplify user interactions.'</span>,\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">label</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'CLOSED'</span>,\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">summary</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"A user reported difficulties in plotting the truncated normal and gamma distributions using SciPy's </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">stats module, finding the API confusing and requiring excessive computations for standard parameters. They </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">requested an improved API that would allow for easier parameter specification without additional calculations. </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">After clarifying some terminology and finding the issue to be a duplicate of another already existing issue, the </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">case was closed. The discussion highlighted the need for improved documentation and usability in the distribution </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">functions.\"</span>\n", | |
"<span style=\"font-weight: bold\">)</span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[1;35mClassifiedSummary\u001b[0m\u001b[1m(\u001b[0m\n", | |
" \u001b[33mchain_of_thought\u001b[0m=\u001b[32m'The user, identifying as a mathematician, expressed difficulty in plotting distributions \u001b[0m\n", | |
"\u001b[32musing the SciPy library, specifically the truncated normal and gamma distributions. They found the API \u001b[0m\n", | |
"\u001b[32mnon-intuitive, requiring extra computations beyond what they believed was necessary for standard parameters, \u001b[0m\n", | |
"\u001b[32mleading to frustration in both cases. The user requested the implementation of a more direct API allowing for all \u001b[0m\n", | |
"\u001b[32mtypical parameters to be specified without additional calculations. They later recognized a small error regarding \u001b[0m\n", | |
"\u001b[32mthe parameterization of the gamma distribution but reiterated the need for clearer documentation. The conversation \u001b[0m\n", | |
"\u001b[32mindicated that another issue had already been created addressing similar concerns, leading to the current issue \u001b[0m\n", | |
"\u001b[32mbeing marked as a duplicate. Overall, the main focus was on improving usability and clarity in the SciPy stats \u001b[0m\n", | |
"\u001b[32mmodule for standardized distributions, with suggestions for better documentation and direct parameter exposure to \u001b[0m\n", | |
"\u001b[32msimplify user interactions.'\u001b[0m,\n", | |
" \u001b[33mlabel\u001b[0m=\u001b[32m'CLOSED'\u001b[0m,\n", | |
" \u001b[33msummary\u001b[0m=\u001b[32m\"A\u001b[0m\u001b[32m user reported difficulties in plotting the truncated normal and gamma distributions using SciPy's \u001b[0m\n", | |
"\u001b[32mstats module, finding the API confusing and requiring excessive computations for standard parameters. They \u001b[0m\n", | |
"\u001b[32mrequested an improved API that would allow for easier parameter specification without additional calculations. \u001b[0m\n", | |
"\u001b[32mAfter clarifying some terminology and finding the issue to be a duplicate of another already existing issue, the \u001b[0m\n", | |
"\u001b[32mcase was closed. The discussion highlighted the need for improved documentation and usability in the distribution \u001b[0m\n", | |
"\u001b[32mfunctions.\"\u001b[0m\n", | |
"\u001b[1m)\u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"rprint(classification)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "af782ee4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"proc_issue = ProcessedIssue(issue_id=issues[0].issue_id, repo_name=issues[0].repo_name, text=classification.summary, label=classification.label, embedding=None)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"id": "dc630bdf", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">ProcessedIssue</span><span style=\"font-weight: bold\">(</span>\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">issue_id</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">135787405</span>,\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"A user reported difficulties in plotting the truncated normal and gamma distributions using SciPy's stats</span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">module, finding the API confusing and requiring excessive computations for standard parameters. They requested an </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">improved API that would allow for easier parameter specification without additional calculations. After clarifying </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">some terminology and finding the issue to be a duplicate of another already existing issue, the case was closed. </span>\n", | |
"<span style=\"color: #008000; text-decoration-color: #008000\">The discussion highlighted the need for improved documentation and usability in the distribution functions.\"</span>,\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">label</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'CLOSED'</span>,\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">repo_name</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'scipy/scipy'</span>,\n", | |
" <span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>\n", | |
"<span style=\"font-weight: bold\">)</span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[1;35mProcessedIssue\u001b[0m\u001b[1m(\u001b[0m\n", | |
" \u001b[33missue_id\u001b[0m=\u001b[1;36m135787405\u001b[0m,\n", | |
" \u001b[33mtext\u001b[0m=\u001b[32m\"A\u001b[0m\u001b[32m user reported difficulties in plotting the truncated normal and gamma distributions using SciPy's stats\u001b[0m\n", | |
"\u001b[32mmodule, finding the API confusing and requiring excessive computations for standard parameters. They requested an \u001b[0m\n", | |
"\u001b[32mimproved API that would allow for easier parameter specification without additional calculations. After clarifying \u001b[0m\n", | |
"\u001b[32msome terminology and finding the issue to be a duplicate of another already existing issue, the case was closed. \u001b[0m\n", | |
"\u001b[32mThe discussion highlighted the need for improved documentation and usability in the distribution functions.\"\u001b[0m,\n", | |
" \u001b[33mlabel\u001b[0m=\u001b[32m'CLOSED'\u001b[0m,\n", | |
" \u001b[33mrepo_name\u001b[0m=\u001b[32m'scipy/scipy'\u001b[0m,\n", | |
" \u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m\n", | |
"\u001b[1m)\u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"rprint(proc_issue)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "3104887c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"async def batch_classify_issue(\n", | |
" batch: list[GithubIssue], max_concurrent_requests: int = 20\n", | |
") -> list[ProcessedIssue]:\n", | |
" async def classify_issue(issue: GithubIssue, semaphore: Semaphore):\n", | |
" client = from_openai(AsyncOpenAI())\n", | |
" async with semaphore:\n", | |
" classification = await client.chat.completions.create(\n", | |
" response_model=ClassifiedSummary,\n", | |
" messages=[\n", | |
" {\n", | |
" \"role\": \"system\",\n", | |
" \"content\": \"You are a helpful assistant that classifies and summarizes GitHub issues. When summarizing the issues, make sure to expand on specific accronyms and add additional explanation where necessary.\",\n", | |
" },\n", | |
" {\n", | |
" \"role\": \"user\",\n", | |
" \"content\": Template(\n", | |
" dedent(\n", | |
" \"\"\"\n", | |
" Repo Name: {{ repo_name }}\n", | |
" Issue Text: {{ issue_text}}\n", | |
" \"\"\"\n", | |
" )\n", | |
" ).render(repo_name=issue.repo_name, issue_text=issue.text),\n", | |
" },\n", | |
" ],\n", | |
" model=\"gpt-4o-mini\",\n", | |
" )\n", | |
" return ProcessedIssue(\n", | |
" issue_id=issue.issue_id,\n", | |
" repo_name=issue.repo_name,\n", | |
" text=classification.summary,\n", | |
" label=classification.label,\n", | |
" embedding=None,\n", | |
" )\n", | |
"\n", | |
" semaphore = Semaphore(max_concurrent_requests)\n", | |
" coros = [classify_issue(item, semaphore) for item in batch]\n", | |
" results = await asyncio.gather(*coros)\n", | |
" return results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"id": "a0f2b003", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"item = proc_issue\n", | |
"input_text = item.text if len(item.text) < 8000 else item.text[:6000]\n", | |
"oai = OpenAI()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"id": "179feeb3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"embedding = oai.embeddings.create(input = input_text, model = 'text-embedding-3-small')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"id": "649a066d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"item.embedding = embedding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"id": "ce46638f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# rinspect(embedding)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"id": "8c3ee1bc", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1536" | |
] | |
}, | |
"execution_count": 53, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(embedding.data[0].embedding)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"id": "53d7809d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"async def batch_embeddings(\n", | |
" data: list[ProcessedIssue],\n", | |
" max_concurrent_calls: int = 20,\n", | |
") -> list[ProcessedIssue]:\n", | |
" oai = AsyncOpenAI()\n", | |
"\n", | |
" async def embed_row(\n", | |
" item: ProcessedIssue,\n", | |
" semaphore: Semaphore,\n", | |
" ):\n", | |
" async with semaphore:\n", | |
" input_text = item.text if len(item.text) < 8000 else item.text[:6000]\n", | |
" embedding = (\n", | |
" (\n", | |
" await oai.embeddings.create(\n", | |
" input=input_text, model=\"text-embedding-3-small\"\n", | |
" )\n", | |
" )\n", | |
" .data[0]\n", | |
" .embedding\n", | |
" )\n", | |
" item.embedding = embedding\n", | |
" return item\n", | |
"\n", | |
" semaphore = Semaphore(max_concurrent_calls)\n", | |
" coros = [embed_row(item, semaphore) for item in data]\n", | |
" results = await asyncio.gather(*coros)\n", | |
" return results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"id": "8e95b590", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# rinspect(item)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e3bb6ed6", | |
"metadata": {}, | |
"source": [ | |
"## Storing and Indexing enhanced data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "48a30b37", | |
"metadata": {}, | |
"source": [ | |
"SQL script is used to set up a database schema for storing and indexing GitHub issues with vector embeddings. Here's a breakdown of what each part of the script does:\n", | |
"\n", | |
"1. **Create Extension**:\n", | |
" ```sql\n", | |
" CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;\n", | |
" ```\n", | |
" - This line attempts to create the `vectorscale` extension if it doesn't already exist. Extensions in PostgreSQL are modules that add additional functionality to the database. The `CASCADE` option ensures that any dependencies required by the extension are also installed.\n", | |
"\n", | |
"2. **Drop Tables**:\n", | |
" ```sql\n", | |
" DROP TABLE IF EXISTS github_issue_summaries CASCADE;\n", | |
" DROP TABLE IF EXISTS github_issues CASCADE;\n", | |
" ```\n", | |
" - These lines drop the tables `github_issue_summaries` and `github_issues` if they exist. The `CASCADE` option ensures that any dependent objects (like indexes or foreign keys) are also dropped.\n", | |
"\n", | |
"3. **Create Table**:\n", | |
" ```sql\n", | |
" CREATE TABLE IF NOT EXISTS github_issues (\n", | |
" issue_id INTEGER,\n", | |
" metadata JSONB,\n", | |
" text TEXT,\n", | |
" repo_name TEXT,\n", | |
" start_ts TIMESTAMPTZ NOT NULL,\n", | |
" end_ts TIMESTAMPTZ,\n", | |
" embedding VECTOR(1536) NOT NULL\n", | |
" );\n", | |
" ```\n", | |
" - This creates a table named `github_issues` if it doesn't already exist. The table has the following columns:\n", | |
" - `issue_id`: An integer representing the unique identifier for the issue.\n", | |
" - `metadata`: A JSONB column for storing additional metadata about the issue.\n", | |
" - `text`: A text column for storing the issue's content.\n", | |
" - `repo_name`: A text column for storing the name of the repository the issue belongs to.\n", | |
" - `start_ts`: A timestamp with time zone (TIMESTAMPTZ) indicating when the issue was opened. This field is required (`NOT NULL`).\n", | |
" - `end_ts`: A TIMESTAMPTZ indicating when the issue was closed. This field is optional.\n", | |
" - `embedding`: A vector column with a dimension of 1536, used to store the vector representation of the issue. This field is required (`NOT NULL`).\n", | |
"\n", | |
"4. **Create Index**:\n", | |
" ```sql\n", | |
" CREATE INDEX github_issue_embedding_idx\n", | |
" ON github_issues\n", | |
" USING diskann (embedding);\n", | |
" ```\n", | |
" - This creates an index on the `embedding` column of the `github_issues` table using the `diskann` indexing method. `diskann` is a specialized index type used for efficient similarity search on high-dimensional vectors, which is useful for operations like nearest neighbor search in machine learning applications.\n", | |
"\n", | |
"Overall, this SQL script sets up a table to store GitHub issues with vector embeddings and creates an index to efficiently query these embeddings. This setup is likely part of a system that processes and analyzes GitHub issues using machine learning techniques." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c61d8e4f", | |
"metadata": {}, | |
"source": [ | |
"1. **Create a Hypertable**:\n", | |
" ```sql\n", | |
" SELECT create_hypertable('github_issues', 'start_ts', chunk_time_interval => INTERVAL '1 month');\n", | |
" ```\n", | |
" - This line converts the `github_issues` table into a hypertable, which is a special type of table in TimescaleDB designed for time-series data. \n", | |
" - The hypertable is partitioned by the `start_ts` column, which represents the start timestamp of each issue.\n", | |
" - The `chunk_time_interval => INTERVAL '1 month'` specifies that the data should be partitioned into chunks of one month. This helps in efficiently managing and querying large volumes of time-series data.\n", | |
"\n", | |
"2. **Create a Unique Index**:\n", | |
" ```sql\n", | |
" CREATE UNIQUE INDEX ON github_issues (issue_id, start_ts);\n", | |
" ```\n", | |
" - This creates a unique index on the `github_issues` table using the combination of `issue_id` and `start_ts`.\n", | |
" - The unique index ensures that each combination of `issue_id` and `start_ts` is unique, preventing duplicate entries for the same issue at the same start time.\n", | |
"\n", | |
"3. **Create Table for Summaries**:\n", | |
" ```sql\n", | |
" CREATE TABLE github_issue_summaries (\n", | |
" issue_id INTEGER,\n", | |
" text TEXT,\n", | |
" label issue_label NOT NULL,\n", | |
" repo_name TEXT,\n", | |
" embedding VECTOR(1536) NOT NULL\n", | |
" );\n", | |
" ```\n", | |
" - This creates a new table named `github_issue_summaries` to store summarized information about GitHub issues.\n", | |
" - The table includes the following columns:\n", | |
" - `issue_id`: An integer representing the unique identifier for the issue.\n", | |
" - `text`: A text column for storing the summarized content of the issue.\n", | |
" - `label`: A column of type `issue_label` (presumably a custom type) that indicates the classification label of the issue. This field is required (`NOT NULL`).\n", | |
" - `repo_name`: A text column for storing the name of the repository the issue belongs to.\n", | |
" - `embedding`: A vector column with a dimension of 1536, used to store the vector representation of the summarized issue. This field is required (`NOT NULL`).\n", | |
"\n", | |
"4. **Create Index on Summaries**:\n", | |
" ```sql\n", | |
" CREATE INDEX github_issue_summaries_embedding_idx\n", | |
" ON github_issue_summaries\n", | |
" USING diskann (embedding);\n", | |
" ```\n", | |
" - This creates an index on the `embedding` column of the `github_issue_summaries` table using the `diskann` indexing method.\n", | |
" - `diskann` is a specialized index type used for efficient similarity search on high-dimensional vectors, which is useful for operations like nearest neighbor search in machine learning applications.\n", | |
"\n", | |
"Overall, this SQL script sets up a schema for efficiently storing and querying GitHub issues and their summaries, leveraging TimescaleDB's capabilities for handling time-series data and vector indexing for machine learning applications." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"id": "e979d031", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"from pgvector.asyncpg import register_vector\n", | |
"import asyncpg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"id": "ee7e5a22", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"conn = await asyncpg.connect(os.getenv('DB_URL'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"id": "6ef8431c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<Record extname='plpgsql' extversion='1.0'>\n", | |
"<Record extname='pg_stat_statements' extversion='1.10'>\n", | |
"<Record extname='timescaledb' extversion='2.17.0'>\n", | |
"<Record extname='timescaledb_toolkit' extversion='1.18.0'>\n", | |
"<Record extname='vector' extversion='0.7.4'>\n" | |
] | |
} | |
], | |
"source": [ | |
"extensions = await conn.fetch('select extname, extversion from pg_extension')\n", | |
"for ext in extensions: print(ext)\n", | |
"await conn.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"id": "b251432c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'CREATE INDEX'" | |
] | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"init_sql = \"\"\"\n", | |
"CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;\n", | |
"\n", | |
"DROP TABLE IF EXISTS github_issue_summaries CASCADE;\n", | |
"DROP TABLE IF EXISTS github_issues CASCADE;\n", | |
"\n", | |
"CREATE TABLE IF NOT EXISTS github_issues (\n", | |
" issue_id INTEGER,\n", | |
" metadata JSONB,\n", | |
" text TEXT,\n", | |
" repo_name TEXT,\n", | |
" start_ts TIMESTAMPTZ NOT NULL,\n", | |
" end_ts TIMESTAMPTZ,\n", | |
" embedding VECTOR(1536) NOT NULL\n", | |
");\n", | |
"\n", | |
"CREATE INDEX github_issue_embedding_idx\n", | |
"ON github_issues\n", | |
"USING diskann (embedding);\n", | |
"\n", | |
"-- Create a Hypertable that breaks it down by 1 month intervals\n", | |
"SELECT create_hypertable('github_issues', 'start_ts', chunk_time_interval => INTERVAL '1 month');\n", | |
"\n", | |
"CREATE UNIQUE INDEX ON github_issues (issue_id, start_ts);\n", | |
"\n", | |
"CREATE TYPE issue_label AS ENUM ('OPEN', 'CLOSED');\n", | |
"\n", | |
"CREATE TABLE github_issue_summaries (\n", | |
" issue_id INTEGER,\n", | |
" text TEXT,\n", | |
" label issue_label NOT NULL,\n", | |
" repo_name TEXT,\n", | |
" embedding VECTOR(1536) NOT NULL\n", | |
");\n", | |
"\n", | |
"CREATE INDEX github_issue_summaries_embedding_idx\n", | |
"ON github_issue_summaries\n", | |
"USING diskann (embedding);\n", | |
"\"\"\"\n", | |
"\n", | |
"async def get_conn():\n", | |
" conn = await asyncpg.connect(os.getenv(\"DB_URL\"))\n", | |
" await conn.execute('CREATE EXTENSION IF NOT EXISTS vector CASCADE;')\n", | |
" await register_vector(conn)\n", | |
" return conn\n", | |
"\n", | |
"conn = await get_conn()\n", | |
"await conn.execute(init_sql)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "56007ac0", | |
"metadata": {}, | |
"source": [ | |
"Added `CREATE TYPE issue_label AS ENUM ('OPEN', 'CLOSED');` \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 78, | |
"id": "954878df", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"\n", | |
"async def insert_github_issue_summaries(conn, issues: list[GithubIssue]):\n", | |
" insert_query = \"\"\"\n", | |
" INSERT INTO github_issue_summaries (issue_id, text, label, embedding,repo_name)\n", | |
" VALUES ($1, $2, $3, $4, $5)\n", | |
" \"\"\"\n", | |
" summarized_issues = await batch_classify_issue(issues)\n", | |
" embedded_summaries = await batch_embeddings(summarized_issues)\n", | |
"\n", | |
" await conn.executemany(\n", | |
" insert_query,\n", | |
" [\n", | |
" (item.issue_id, item.text, item.label, item.embedding, item.repo_name)\n", | |
" for item in embedded_summaries\n", | |
" ],\n", | |
" )\n", | |
"\n", | |
" print(\"GitHub issue summaries inserted successfully.\")\n", | |
"\n", | |
"async def insert_github_issues(conn, issues: list[GithubIssue]):\n", | |
" insert_query = \"\"\"\n", | |
" INSERT INTO github_issues (issue_id, metadata, text, repo_name, start_ts, end_ts, embedding)\n", | |
" VALUES ($1, $2, $3, $4, $5, $6, $7)\n", | |
" \"\"\"\n", | |
" embedded_issues = await batch_embeddings(issues)\n", | |
"\n", | |
" await conn.executemany(\n", | |
" insert_query,\n", | |
" [\n", | |
" (\n", | |
" item.issue_id,\n", | |
" json.dumps(item.metadata),\n", | |
" item.text,\n", | |
" item.repo_name,\n", | |
" item.start_ts,\n", | |
" item.end_ts,\n", | |
" item.embedding,\n", | |
" )\n", | |
" for item in embedded_issues\n", | |
" ],\n", | |
" )\n", | |
" print(\"GitHub issues inserted successfully.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 80, | |
"id": "3cacbd71", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"repos = [\n", | |
"\t\t\t\t\"rust-lang/rust\",\n", | |
" \t\t \t\"kubernetes/kubernetes\",\n", | |
" \t\t\"apache/spark\",\n", | |
" ]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 81, | |
"id": "da972d60", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"issues = list(get_issues(100, repos))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 82, | |
"id": "804f2c42", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 100/100 [00:03<00:00, 26.01it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GitHub issues inserted successfully.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 100/100 [00:23<00:00, 4.19it/s]\n", | |
"100%|██████████| 100/100 [00:02<00:00, 47.17it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GitHub issue summaries inserted successfully.\n" | |
] | |
} | |
], | |
"source": [ | |
"async def process_issues():\n", | |
"\t\trepos = [\n", | |
"\t\t\t\t\"rust-lang/rust\",\n", | |
" \t\t \t\"kubernetes/kubernetes\",\n", | |
" \t\t\"apache/spark\",\n", | |
" ]\n", | |
"\t\tconn = await get_conn()\n", | |
"\t\tissues = list(get_issues(100,repos))\n", | |
"\t\tawait insert_github_issues(conn,issues)\n", | |
"\t\tawait insert_github_issue_summaries(conn,issues)\n", | |
"\n", | |
"await process_issues()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e69a9433", | |
"metadata": {}, | |
"source": [ | |
"# Evals Driven Development" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 83, | |
"id": "5cb89611", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pydantic import BaseModel, Field\n", | |
"\n", | |
"class SearchIssues(BaseModel):\n", | |
" \"\"\"\n", | |
" Use this when the user wants to get original issue information from the database \n", | |
" \"\"\"\n", | |
"\n", | |
" query: Optional[str]\n", | |
" repo: str = Field(\n", | |
" description=\"the repo to search for issues in, should be in the format of 'owner/repo'\"\n", | |
" )\n", | |
" \n", | |
"class RunSQLReturnPandas(BaseModel):\n", | |
" \"\"\"\n", | |
" Use this function when the user wants to do time-series analysis or data analysis and we don't have a tool that can supply the necessary information\n", | |
" \"\"\"\n", | |
"\n", | |
" query: str = Field(description=\"Description of user's query\")\n", | |
" repos: list[str] = Field(\n", | |
" description=\"the repos to run the query on, should be in the format of 'owner/repo'\"\n", | |
" )\n", | |
"\n", | |
"class SearchSummaries(BaseModel):\n", | |
" \"\"\"\n", | |
"\t\tThis function retrieves summarized information about GitHub issues that match/are similar to a specific query, It's particularly useful for obtaining a quick snapshot of issue trends or patterns within a project.\n", | |
" \"\"\"\n", | |
"\n", | |
" query: Optional[str] = Field(description=\"Relevant user query if any\")\n", | |
" repo: str = Field(\n", | |
" description=\"the repo to search for issues in, should be in the format of 'owner/repo'\"\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"id": "540d3e83", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #000080; text-decoration-color: #000080\">╭──────────────────────────────────────── </span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'__main__.SearchIssues'</span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\">></span><span style=\"color: #000080; text-decoration-color: #000080\"> ────────────────────────────────────────╮</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; font-style: italic\">class </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">SearchIssues</span><span style=\"font-weight: bold\">(</span>*, query: Optional<span style=\"font-weight: bold\">[</span>str<span style=\"font-weight: bold\">]</span>, repo: str<span style=\"font-weight: bold\">)</span> -> <span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>: <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">Use this when the user wants to get original issue information from the database</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_computed_fields</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_config</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_extra</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e40</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields</span> = <span style=\"font-weight: bold\">{</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'query'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span><span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">Union</span><span style=\"font-weight: bold\">[</span>str, NoneType<span style=\"font-weight: bold\">]</span>, <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span><span style=\"font-weight: bold\">)</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'repo'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">str</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">description</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"the repo to search for issues in, should be in the format of </span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'owner/repo'\"</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">)</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields_set</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e90</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[34m╭─\u001b[0m\u001b[34m───────────────────────────────────────\u001b[0m\u001b[34m \u001b[0m\u001b[1;34m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'__main__.SearchIssues'\u001b[0m\u001b[1;34m>\u001b[0m\u001b[34m \u001b[0m\u001b[34m───────────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;96mclass \u001b[0m\u001b[1;31mSearchIssues\u001b[0m\u001b[1m(\u001b[0m*, query: Optional\u001b[1m[\u001b[0mstr\u001b[1m]\u001b[0m, repo: str\u001b[1m)\u001b[0m -> \u001b[3;35mNone\u001b[0m: \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mUse this when the user wants to get original issue information from the database\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_computed_fields\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_config\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_extra\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e40\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields\u001b[0m = \u001b[1m{\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'query'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[33mannotation\u001b[0m=\u001b[35mUnion\u001b[0m\u001b[1m[\u001b[0mstr, NoneType\u001b[1m]\u001b[0m, \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m\u001b[1m)\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'repo'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mannotation\u001b[0m=\u001b[35mstr\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mdescription\u001b[0m=\u001b[32m\"the\u001b[0m\u001b[32m repo to search for issues in, should be in the format of \u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'owner/repo'\"\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m)\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields_set\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e90\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"rinspect(SearchIssues)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 87, | |
"id": "1c1ae3c1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #000080; text-decoration-color: #000080\">╭────────────────────────────────────── </span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'__main__.SearchSummaries'</span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\">></span><span style=\"color: #000080; text-decoration-color: #000080\"> ───────────────────────────────────────╮</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; font-style: italic\">class </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">SearchSummaries</span><span style=\"font-weight: bold\">(</span>*, query: Optional<span style=\"font-weight: bold\">[</span>str<span style=\"font-weight: bold\">]</span>, repo: str<span style=\"font-weight: bold\">)</span> -> <span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>: <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">This function retrieves summarized information about GitHub issues that match/are similar to a specific query, </span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">It's particularly useful for obtaining a quick snapshot of issue trends or patterns within a project.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_computed_fields</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_config</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_extra</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e40</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields</span> = <span style=\"font-weight: bold\">{</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'query'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">Union</span><span style=\"font-weight: bold\">[</span>str, NoneType<span style=\"font-weight: bold\">]</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">description</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'Relevant user query if any'</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">)</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'repo'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">str</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">description</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"the repo to search for issues in, should be in the format of </span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'owner/repo'\"</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">)</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields_set</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e90</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[34m╭─\u001b[0m\u001b[34m─────────────────────────────────────\u001b[0m\u001b[34m \u001b[0m\u001b[1;34m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'__main__.SearchSummaries'\u001b[0m\u001b[1;34m>\u001b[0m\u001b[34m \u001b[0m\u001b[34m──────────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;96mclass \u001b[0m\u001b[1;31mSearchSummaries\u001b[0m\u001b[1m(\u001b[0m*, query: Optional\u001b[1m[\u001b[0mstr\u001b[1m]\u001b[0m, repo: str\u001b[1m)\u001b[0m -> \u001b[3;35mNone\u001b[0m: \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mThis function retrieves summarized information about GitHub issues that match/are similar to a specific query, \u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mIt's particularly useful for obtaining a quick snapshot of issue trends or patterns within a project.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_computed_fields\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_config\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_extra\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e40\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields\u001b[0m = \u001b[1m{\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'query'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mannotation\u001b[0m=\u001b[35mUnion\u001b[0m\u001b[1m[\u001b[0mstr, NoneType\u001b[1m]\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mdescription\u001b[0m=\u001b[32m'Relevant user query if any'\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m)\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'repo'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mannotation\u001b[0m=\u001b[35mstr\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mdescription\u001b[0m=\u001b[32m\"the\u001b[0m\u001b[32m repo to search for issues in, should be in the format of \u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'owner/repo'\"\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m)\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields_set\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e90\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"rinspect(SearchSummaries)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"id": "c2408836", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #000080; text-decoration-color: #000080\">╭───────────────────────────────────── </span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'__main__.RunSQLReturnPandas'</span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\">></span><span style=\"color: #000080; text-decoration-color: #000080\"> ─────────────────────────────────────╮</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; font-style: italic\">class </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">RunSQLReturnPandas</span><span style=\"font-weight: bold\">(</span>*, query: str, repos: list<span style=\"font-weight: bold\">[</span>str<span style=\"font-weight: bold\">])</span> -> <span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>: <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">Use this function when the user wants to do time-series analysis or data analysis and we don't have a tool that</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">can supply the necessary information</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_computed_fields</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_config</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_extra</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e40</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields</span> = <span style=\"font-weight: bold\">{</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'query'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">str</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">description</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"Description of user's query\"</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">)</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'repos'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">list</span><span style=\"font-weight: bold\">[</span>str<span style=\"font-weight: bold\">]</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">description</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"the repos to run the query on, should be in the format of </span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'owner/repo'\"</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">)</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields_set</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e90</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[34m╭─\u001b[0m\u001b[34m────────────────────────────────────\u001b[0m\u001b[34m \u001b[0m\u001b[1;34m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'__main__.RunSQLReturnPandas'\u001b[0m\u001b[1;34m>\u001b[0m\u001b[34m \u001b[0m\u001b[34m────────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;96mclass \u001b[0m\u001b[1;31mRunSQLReturnPandas\u001b[0m\u001b[1m(\u001b[0m*, query: str, repos: list\u001b[1m[\u001b[0mstr\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m -> \u001b[3;35mNone\u001b[0m: \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mUse this function when the user wants to do time-series analysis or data analysis and we don't have a tool that\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mcan supply the necessary information\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_computed_fields\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_config\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_extra\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e40\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields\u001b[0m = \u001b[1m{\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'query'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mannotation\u001b[0m=\u001b[35mstr\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mdescription\u001b[0m=\u001b[32m\"Description\u001b[0m\u001b[32m of user's query\"\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m)\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'repos'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mannotation\u001b[0m=\u001b[35mlist\u001b[0m\u001b[1m[\u001b[0mstr\u001b[1m]\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mdescription\u001b[0m=\u001b[32m\"the\u001b[0m\u001b[32m repos to run the query on, should be in the format of \u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'owner/repo'\"\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m)\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields_set\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e90\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"rinspect(RunSQLReturnPandas)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"id": "63e82f5d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from typing import Iterable, Union\n", | |
"\n", | |
"def one_step_agent(question: str):\n", | |
"    \"\"\"Ask the LLM which tool call(s) should answer `question`.\n", | |
"\n", | |
"    A single chat completion is sent to gpt-4o-mini through an instructor-wrapped\n", | |
"    OpenAI client in PARALLEL_TOOLS mode, with RunSQLReturnPandas, SearchIssues and\n", | |
"    SearchSummaries offered as the possible response models.\n", | |
"\n", | |
"    Args:\n", | |
"        question: the user's natural-language question.\n", | |
"\n", | |
"    Returns:\n", | |
"        Whatever the instructor client's `chat.completions.create` returns for the\n", | |
"        `Iterable[Union[...]]` response model; callers iterate over it to obtain\n", | |
"        the chosen tool-call objects.\n", | |
"    \"\"\"\n", | |
"    import instructor\n", | |
"    import openai\n", | |
"\n", | |
"    system_prompt = \"You are an AI assistant that helps users query and analyze GitHub issues stored in a PostgreSQL database. Search for summaries when the user wants to understand the trends or patterns within a project. Otherwise just get the issues and return them. Only resort to SQL queries if the other tools are not able to answer the user's query.\"\n", | |
"    tool_models = Union[RunSQLReturnPandas, SearchIssues, SearchSummaries]\n", | |
"\n", | |
"    llm = instructor.from_openai(openai.OpenAI(), mode=instructor.Mode.PARALLEL_TOOLS)\n", | |
"    conversation = [\n", | |
"        {\"role\": \"system\", \"content\": system_prompt},\n", | |
"        {\"role\": \"user\", \"content\": question},\n", | |
"    ]\n", | |
"    return llm.chat.completions.create(\n", | |
"        model=\"gpt-4o-mini\",\n", | |
"        messages=conversation,\n", | |
"        response_model=Iterable[tool_models],\n", | |
"    )\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 92, | |
"id": "9c7b441e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from typing import Iterable, Union\n", | |
"\n", | |
"def one_step_agent(question: str):\n", | |
"    # Running this cell rebinds `one_step_agent`, replacing any definition made by an\n", | |
"    # earlier cell; the request it sends (model, prompts, response models) is unchanged.\n", | |
"    import instructor\n", | |
"    import openai\n", | |
"\n", | |
"    wrapped_client = instructor.from_openai(\n", | |
"        openai.OpenAI(),\n", | |
"        mode=instructor.Mode.PARALLEL_TOOLS,\n", | |
"    )\n", | |
"    prompt_messages = [\n", | |
"        dict(\n", | |
"            role='system',\n", | |
"            content=\"You are an AI assistant that helps users query and analyze GitHub issues stored in a PostgreSQL database. Search for summaries when the user wants to understand the trends or patterns within a project. Otherwise just get the issues and return them. Only resort to SQL queries if the other tools are not able to answer the user's query.\",\n", | |
"        ),\n", | |
"        dict(role='user', content=question),\n", | |
"    ]\n", | |
"    # One request; the model may answer with any mix of the three tool classes.\n", | |
"    return wrapped_client.chat.completions.create(\n", | |
"        model='gpt-4o-mini',\n", | |
"        messages=prompt_messages,\n", | |
"        response_model=Iterable[Union[RunSQLReturnPandas, SearchIssues, SearchSummaries]],\n", | |
"    )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f937ee7b", | |
"metadata": {}, | |
"source": [ | |
"## Testing the agent" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 93, | |
"id": "27f7e7fc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Routing test cases: each entry is [user question, [tool class the agent is expected to pick]].\n", | |
"tests = [\n", | |
"    [\"What is the average time to first response for issues in the azure repository over the last 6 months? Has this metric improved or worsened?\", [RunSQLReturnPandas]],\n", | |
"    [\"How many issues mentioned issues with Cohere in the 'vercel/next.js' repository in the last 6 months?\", [SearchIssues]],\n", | |
"    [\"What were some of the big features that were implemented in the last 4 months for the scipy repo that addressed some previously open issues?\", [SearchSummaries]],\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"id": "4fc3b1b4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"\"How many issues mentioned issues with Cohere in the 'vercel/next.js' repository in the last 6 months?\"" | |
] | |
}, | |
"execution_count": 97, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tests[1][0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"id": "3c158531", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Materialise the tool calls into a list: the next cell indexes into `response`, and a\n", | |
"# one-shot iterator would already be drained if that cell were run a second time.\n", | |
"response = list(one_step_agent(tests[1][0]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"id": "13c91ab9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"SearchIssues(query='Cohere', repo='vercel/next.js')" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"list(response)[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"id": "b48c5c5b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'What were some of the big features that were implemented in the last 4 months for the scipy repo that addressed some previously open issues?'" | |
] | |
}, | |
"execution_count": 101, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tests[2][0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"id": "1b4947a5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"SearchSummaries(query='big features implemented', repo='scipy/scipy')" | |
] | |
}, | |
"execution_count": 102, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Keep the agent's tool calls in a list so `response` is still usable after this cell\n", | |
"# has displayed the first call (SearchSummaries is expected for this question).\n", | |
"response = list(one_step_agent(tests[2][0]))\n", | |
"response[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"id": "4613c73c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'What is the average time to first response for issues in the azure repository over the last 6 months? Has this metric improved or worsened?'" | |
] | |
}, | |
"execution_count": 103, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tests[0][0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"id": "74bbdb2f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RunSQLReturnPandas(query=\"SELECT AVG(response_time) AS avg_response_time, DATE_TRUNC('month', created_at) AS month FROM ( SELECT i.id, i.created_at, MIN(c.created_at) AS first_response_at, EXTRACT(EPOCH FROM (MIN(c.created_at) - i.created_at)) / 3600 AS response_time FROM issues i LEFT JOIN comments c ON i.id = c.issue_id GROUP BY i.id, i.created_at ) AS response_times WHERE created_at >= NOW() - INTERVAL '6 months' GROUP BY month ORDER BY month;\", repos=['azure/repo'])" | |
] | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Keep the agent's tool calls in a list so `response` is still usable after this cell\n", | |
"# has displayed the first call (RunSQLReturnPandas is expected for this question).\n", | |
"response = list(one_step_agent(tests[0][0]))\n", | |
"response[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"id": "bf5c6009", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Routing test cases: each entry is [user question, [tool class(es) the agent should pick, in order]].\n", | |
"tests = [\n", | |
"    [\n", | |
"        \"What is the average time to first response for issues in the azure repository over the last 6 months? Has this metric improved or worsened?\",\n", | |
"        [RunSQLReturnPandas],\n", | |
"    ],\n", | |
"    [\n", | |
"        \"How many issues mentioned issues with Cohere in the 'vercel/next.js' repository in the last 6 months?\",\n", | |
"        [SearchIssues],\n", | |
"    ],\n", | |
"    [\n", | |
"        \"What were some of the big features that were implemented in the last 4 months for the scipy repo that addressed some previously open issues?\",\n", | |
"        [SearchSummaries],\n", | |
"    ],\n", | |
"]\n", | |
"\n", | |
"for query, expected_result in tests:\n", | |
"    # Materialise the calls so they can be counted as well as type-checked.\n", | |
"    response = list(one_step_agent(query))\n", | |
"    # zip() stops at the shorter input, so without this check an agent that returned\n", | |
"    # no tool call at all would pass the loop below vacuously.\n", | |
"    assert len(response) >= len(expected_result), (\n", | |
"        f\"expected at least {len(expected_result)} tool call(s) for {query!r}, got {len(response)}\"\n", | |
"    )\n", | |
"    for expected_call, agent_call in zip(expected_result, response):\n", | |
"        assert isinstance(agent_call, expected_call), (\n", | |
"            f\"{query!r}: expected {expected_call.__name__}, got {type(agent_call).__name__}\"\n", | |
"        )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "8877d2c2", | |
"metadata": {}, | |
"source": [ | |
"## Embedding Search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"id": "92c8904f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pydantic import BaseModel, Field\n", | |
"from typing import Optional\n", | |
"from openai import OpenAI\n", | |
"from jinja2 import Template\n", | |
"from asyncpg import Connection\n", | |
"\n", | |
"class SearchIssues(BaseModel):\n", | |
"    \"\"\"\n", | |
"    Use this when the user wants to get original issue information from the database\n", | |
"    \"\"\"\n", | |
"\n", | |
"    # The class docstring and the Field description are kept verbatim: pydantic puts\n", | |
"    # them into the JSON schema generated for this model.\n", | |
"    query: Optional[str]\n", | |
"    repo: str = Field(\n", | |
"        description=\"the repo to search for issues in, should be in the format of 'owner/repo'\"\n", | |
"    )\n", | |
"\n", | |
"    async def execute(self, conn: Connection, limit: int):\n", | |
"        \"\"\"Fetch up to `limit` rows of `github_issues` whose repo_name equals `self.repo`.\n", | |
"\n", | |
"        With a non-empty `self.query`, the query text is embedded with\n", | |
"        text-embedding-3-small and the rows are ordered by `embedding <=> $3`;\n", | |
"        otherwise no ORDER BY is applied.\n", | |
"\n", | |
"        Args:\n", | |
"            conn: asyncpg connection the statement is run on.\n", | |
"            limit: maximum number of rows, bound to `LIMIT $2`.\n", | |
"\n", | |
"        Returns:\n", | |
"            The result of `conn.fetch` for the rendered statement.\n", | |
"        \"\"\"\n", | |
"        if self.query:\n", | |
"            # Await the async client so the embeddings request does not block the event\n", | |
"            # loop (the synchronous OpenAI client would). AsyncOpenAI is imported in the\n", | |
"            # notebook's first cell.\n", | |
"            embedding_response = await AsyncOpenAI().embeddings.create(\n", | |
"                input=self.query, model=\"text-embedding-3-small\"\n", | |
"            )\n", | |
"            embedding = embedding_response.data[0].embedding\n", | |
"            args = [self.repo, limit, embedding]\n", | |
"        else:\n", | |
"            embedding = None\n", | |
"            args = [self.repo, limit]\n", | |
"\n", | |
"        # $3 is only rendered when an embedding exists, so the placeholders always\n", | |
"        # match len(args).\n", | |
"        sql_query = Template(\n", | |
"            \"\"\"\n", | |
"            SELECT *\n", | |
"            FROM {{ table_name }}\n", | |
"            WHERE repo_name = $1\n", | |
"            {%- if embedding is not none %}\n", | |
"            ORDER BY embedding <=> $3\n", | |
"            {%- endif %}\n", | |
"            LIMIT $2\n", | |
"            \"\"\"\n", | |
"        ).render(table_name=\"github_issues\", embedding=embedding)\n", | |
"\n", | |
"        return await conn.fetch(sql_query, *args)\n", | |
"\n", | |
"class RunSQLReturnPandas(BaseModel):\n", | |
"    \"\"\"\n", | |
"    Use this function when the user wants to do time-series analysis or data analysis and we don't have a tool that can supply the necessary information\n", | |
"    \"\"\"\n", | |
"\n", | |
"    query: str = Field(description=\"Description of user's query\")\n", | |
"    repos: list[str] = Field(\n", | |
"        description=\"the repos to run the query on, should be in the format of 'owner/repo'\"\n", | |
"    )\n", | |
"\n", | |
"    async def execute(self, conn: Connection, limit: int):\n", | |
"        \"\"\"Placeholder: nothing is executed and None is returned.\"\"\"\n", | |
"        return None\n", | |
"\n", | |
"class SearchSummaries(BaseModel):\n", | |
"    \"\"\"\n", | |
"    This function retrieves summarized information about GitHub issues that match/are similar to a specific query, It's particularly useful for obtaining a quick snapshot of issue trends or patterns within a project.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    query: Optional[str] = Field(description=\"Relevant user query if any\")\n", | |
"    repo: str = Field(\n", | |
"        description=\"the repo to search for issues in, should be in the format of 'owner/repo'\"\n", | |
"    )\n", | |
"\n", | |
"    async def execute(self, conn: Connection, limit: int):\n", | |
"        \"\"\"Fetch at most `limit` rows of `github_issue_summaries` whose `repo_name` equals `self.repo`.\n", | |
"\n", | |
"        If `self.query` is non-empty it is embedded with `text-embedding-3-small` and\n", | |
"        the rows are ordered by `embedding <=> <query vector>`; otherwise the query\n", | |
"        has no ORDER BY clause.\n", | |
"\n", | |
"        Returns whatever `conn.fetch` returns for the rendered SQL.\n", | |
"        \"\"\"\n", | |
"        embedding = None\n", | |
"        args = [self.repo, limit]  # bound as $1 and $2\n", | |
"        if self.query:\n", | |
"            # Await the async client (imported in the notebook's first cell) rather than\n", | |
"            # calling the blocking OpenAI() client from inside a coroutine.\n", | |
"            response = await AsyncOpenAI().embeddings.create(\n", | |
"                input=self.query, model=\"text-embedding-3-small\"\n", | |
"            )\n", | |
"            embedding = response.data[0].embedding\n", | |
"            args.append(embedding)  # bound as $3, only referenced by ORDER BY\n", | |
"\n", | |
"        sql_query = Template(\n", | |
"            \"\"\"\n", | |
"            SELECT *\n", | |
"            FROM {{ table_name }}\n", | |
"            WHERE repo_name = $1\n", | |
"            {%- if embedding is not none %}\n", | |
"            ORDER BY embedding <=> $3\n", | |
"            {%- endif %}\n", | |
"            LIMIT $2\n", | |
"            \"\"\"\n", | |
"        ).render(table_name=\"github_issue_summaries\", embedding=embedding)\n", | |
"\n", | |
"        return await conn.fetch(sql_query, *args)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"id": "4ec496af", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"User highlights confusion and difficulty in starting with Kubernetes on Ubuntu due to poor documentation. Calls for better synchronization between code changes and documentation updates. Discussion includes ongoing documentation overhaul efforts, acknowledgment of community support limitations, and individual frustration with Kubernetes leadership's response to documentation issues.\n", | |
"This issue addresses the need to improve the documentation on Kubernetes authentication, particularly focusing on certificate generation and configuration. Users reported problems with generating certificates and errors related to authentication, sharing solutions and insights about effective configurations. They also emphasized the importance of clear documentation and proposed enhancements to better explain the authentication processes within Kubernetes.\n", | |
"The issue revolves around improving capacity planning in Kubernetes by making cluster fullness and pending pod metrics more accessible to users. Key discussions include suggestions for clearer metrics, the impact of resource fragmentation, the functionality of horizontal autoscaling, and enhancing scheduling tools to monitor node selectors and available resources effectively.\n" | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
"query = \"What are the main problems people are facing with installation with Kubernetes\"\n", | |
"limit = 10\n", | |
"\n", | |
"conn = await get_conn()\n", | |
"try:\n", | |
"    resp = await SearchSummaries(query=query, repo=\"kubernetes/kubernetes\").execute(\n", | |
"        conn, limit\n", | |
"    )\n", | |
"finally:\n", | |
"    # Close the connection whether or not the search succeeds, so re-running\n", | |
"    # this cell does not leave connections open.\n", | |
"    await conn.close()\n", | |
"\n", | |
"# Show the summary text of the three closest matches.\n", | |
"for row in resp[:3]:\n", | |
"    print(row[\"text\"])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d00844d3", | |
"metadata": {}, | |
"source": [ | |
"## Dealing with misspellings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"id": "c9a29965", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from fuzzywuzzy import process" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"id": "be52ce6f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #000080; text-decoration-color: #000080\">╭──────────────────────────────────── </span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">function</span><span style=\"color: #000000; text-decoration-color: #000000\"> extractOne at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab16583c40</span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\">></span><span style=\"color: #000080; text-decoration-color: #000080\"> ────────────────────────────────────╮</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; font-style: italic\">def </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">extractOne</span><span style=\"font-weight: bold\">(</span>query, choices, <span style=\"color: #808000; text-decoration-color: #808000\">processor</span>=<span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">function</span><span style=\"color: #000000; text-decoration-color: #000000\"> full_process at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab16580860</span><span style=\"color: #000000; text-decoration-color: #000000\">>, </span><span style=\"color: #808000; text-decoration-color: #808000\">scorer</span><span style=\"color: #000000; text-decoration-color: #000000\">=<function WRatio at </span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab16583a60</span><span style=\"font-weight: bold\">></span>, <span style=\"color: #808000; text-decoration-color: #808000\">score_cutoff</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span><span style=\"font-weight: bold\">)</span>: <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">Find the single best match above a score in a list of choices.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">This is a convenience method which returns the single best choice.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">See </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">extract</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">()</span><span style=\"color: #008080; text-decoration-color: #008080\"> for the full arguments list.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">Args:</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> query: A string to match against</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> choices: A list or dictionary of choices, suitable for use with</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">extract</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">()</span><span style=\"color: #008080; text-decoration-color: #008080\">.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> processor: Optional function for transforming choices before matching.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> See </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">extract</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">()</span><span style=\"color: #008080; text-decoration-color: #008080\">.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> scorer: Scoring function for </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">extract</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">()</span><span style=\"color: #008080; text-decoration-color: #008080\">.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> score_cutoff: Optional argument for score threshold. If the best</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> match is found, but it is not greater than this number, then</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> return </span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #008080; text-decoration-color: #008080\"> anyway </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">(</span><span style=\"color: #008000; text-decoration-color: #008000\">\"not a good enough match\"</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">)</span><span style=\"color: #008080; text-decoration-color: #008080\">. Defaults to </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span><span style=\"color: #008080; text-decoration-color: #008080\">.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">Returns:</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> A tuple containing a single match and its score, if a match</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\"> was found that was above score_cutoff. Otherwise, returns </span><span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span><span style=\"color: #008080; text-decoration-color: #008080\">.</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">37</span><span style=\"font-style: italic\"> attribute(s) not shown.</span> Run <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">inspect</span><span style=\"font-weight: bold\">(</span>inspect<span style=\"font-weight: bold\">)</span> for options. <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[34m╭─\u001b[0m\u001b[34m───────────────────────────────────\u001b[0m\u001b[34m \u001b[0m\u001b[1;34m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m extractOne at \u001b[0m\u001b[1;36m0x7fab16583c40\u001b[0m\u001b[1;34m>\u001b[0m\u001b[34m \u001b[0m\u001b[34m───────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;96mdef \u001b[0m\u001b[1;31mextractOne\u001b[0m\u001b[1m(\u001b[0mquery, choices, \u001b[33mprocessor\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m full_process at \u001b[0m\u001b[1;36m0x7fab16580860\u001b[0m\u001b[39m>, \u001b[0m\u001b[33mscorer\u001b[0m\u001b[39m=<function WRatio at \u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1;36m0x7fab16583a60\u001b[0m\u001b[1m>\u001b[0m, \u001b[33mscore_cutoff\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1m)\u001b[0m: \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mFind the single best match above a score in a list of choices.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mThis is a convenience method which returns the single best choice.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mSee \u001b[0m\u001b[1;35mextract\u001b[0m\u001b[1;36m(\u001b[0m\u001b[1;36m)\u001b[0m\u001b[36m for the full arguments list.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mArgs:\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m query: A string to match against\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m choices: A list or dictionary of choices, suitable for use with\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m \u001b[0m\u001b[1;35mextract\u001b[0m\u001b[1;36m(\u001b[0m\u001b[1;36m)\u001b[0m\u001b[36m.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m processor: Optional function for transforming choices before matching.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m See \u001b[0m\u001b[1;35mextract\u001b[0m\u001b[1;36m(\u001b[0m\u001b[1;36m)\u001b[0m\u001b[36m.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m scorer: Scoring function for \u001b[0m\u001b[1;35mextract\u001b[0m\u001b[1;36m(\u001b[0m\u001b[1;36m)\u001b[0m\u001b[36m.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m score_cutoff: Optional argument for score threshold. If the best\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m match is found, but it is not greater than this number, then\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m return \u001b[0m\u001b[3;35mNone\u001b[0m\u001b[36m anyway \u001b[0m\u001b[1;36m(\u001b[0m\u001b[32m\"not a good enough match\"\u001b[0m\u001b[1;36m)\u001b[0m\u001b[36m. Defaults to \u001b[0m\u001b[1;36m0\u001b[0m\u001b[36m.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mReturns:\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m A tuple containing a single match and its score, if a match\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36m was found that was above score_cutoff. Otherwise, returns \u001b[0m\u001b[3;35mNone\u001b[0m\u001b[36m.\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1;36m37\u001b[0m\u001b[3m attribute(s) not shown.\u001b[0m Run \u001b[1;35minspect\u001b[0m\u001b[1m(\u001b[0minspect\u001b[1m)\u001b[0m for options. \u001b[34m│\u001b[0m\n", | |
"\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"rinspect(process.extractOne, help=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"id": "233457f7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Known repositories that fuzzy repo-name matching is checked against.\n", | |
"repos = [\n", | |
"    'rust-lang/rust',\n", | |
"    'kubernetes/kubernetes',\n", | |
"    'apache/spark',\n", | |
"    'golang/go',\n", | |
"    'tensorflow/tensorflow',\n", | |
"    'MicrosoftDocs/azure-docs',\n", | |
"    'pytorch/pytorch',\n", | |
"    'Microsoft/TypeScript',\n", | |
"    'python/cpython',\n", | |
"    'facebook/react',\n", | |
"    'django/django',\n", | |
"    'rails/rails',\n", | |
"    'bitcoin/bitcoin',\n", | |
"    'nodejs/node',\n", | |
"    'ocaml/opam-repository',\n", | |
"    'apache/airflow',\n", | |
"    'scipy/scipy',\n", | |
"    'vercel/next.js',\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"id": "4de9af30", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# A deliberately misspelled repo name and the repo it should resolve to.\n", | |
"query = \"kuberntes\"\n", | |
"expected = \"kubernetes/kubernetes\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 121, | |
"id": "1e452415", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('kubernetes/kubernetes', 80)" | |
] | |
}, | |
"execution_count": 121, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"process.extractOne(query, repos) # single_match, score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 122, | |
"id": "390a81af", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from fuzzywuzzy import process\n", | |
"\n", | |
"def find_closest_repo(query: str, repos: list[str], threshold: int = 80) -> str | None:\n", | |
"    \"\"\"Return the entry of `repos` that best fuzzy-matches `query`.\n", | |
"\n", | |
"    Args:\n", | |
"        query: Possibly misspelled or partial repo name.\n", | |
"        repos: Candidate repo names to match against.\n", | |
"        threshold: Minimum fuzzywuzzy score the best candidate must reach.\n", | |
"\n", | |
"    Returns:\n", | |
"        The best-scoring candidate, or None when `query` or `repos` is empty\n", | |
"        or the best score is below `threshold`.\n", | |
"    \"\"\"\n", | |
"    if not query or not repos:\n", | |
"        return None\n", | |
"\n", | |
"    best_match = process.extractOne(query, repos)\n", | |
"    # extractOne returns None when no match is found, so guard before indexing.\n", | |
"    if best_match is None:\n", | |
"        return None\n", | |
"\n", | |
"    candidate, score = best_match[0], best_match[1]\n", | |
"    return candidate if score >= threshold else None" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"id": "7c5f2e8c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# [input, expected match] pairs; None means no repo should be matched.\n", | |
"test = [\n", | |
"    ['kuberntes', 'kubernetes/kubernetes'],\n", | |
"    ['next.js', 'vercel/next.js'],\n", | |
"    ['scipy', 'scipy/scipy'],\n", | |
"    ['', None],\n", | |
"    ['fakerepo', None],\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"id": "13604217", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# A separate loop variable keeps the notebook-level `query` from being overwritten.\n", | |
"for misspelled, expected in test:\n", | |
"    actual = find_closest_repo(misspelled, repos)\n", | |
"    assert actual == expected, f\"{misspelled!r}: expected {expected!r}, got {actual!r}\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 128, | |
"id": "578e9573", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pydantic import BaseModel, Field, field_validator, ValidationInfo\n", | |
"from typing import Optional\n", | |
"from openai import OpenAI\n", | |
"from jinja2 import Template\n", | |
"from asyncpg import Connection\n", | |
"\n", | |
"class SearchIssues(BaseModel):\n", | |
"    \"\"\"\n", | |
"    Use this when the user wants to get original issue information from the database\n", | |
"    \"\"\"\n", | |
"\n", | |
"    query: Optional[str]\n", | |
"    repo: str = Field(\n", | |
"        description=\"the repo to search for issues in, should be in the format of 'owner/repo'\"\n", | |
"    )\n", | |
"\n", | |
"    @field_validator(\"repo\")\n", | |
"    @classmethod\n", | |
"    def validate_repo(cls, v: str, info: ValidationInfo):\n", | |
"        \"\"\"Replace `v` with the closest known repo name.\n", | |
"\n", | |
"        The known repos are read from the validation context under the key\n", | |
"        \"repos\", e.g. `model_validate_json(..., context={\"repos\": [...]})`.\n", | |
"\n", | |
"        Raises:\n", | |
"            ValueError: if no known repos were supplied in the context, or if\n", | |
"                `v` does not match any of them closely enough.\n", | |
"        \"\"\"\n", | |
"        # info.context is None when the model is validated without a context;\n", | |
"        # indexing it directly would raise a TypeError instead of a validation error.\n", | |
"        known_repos = (info.context or {}).get(\"repos\")\n", | |
"        if not known_repos:\n", | |
"            raise ValueError(\n", | |
"                \"No known repos supplied: pass context={'repos': [...]} when validating this model\"\n", | |
"            )\n", | |
"\n", | |
"        matched_repo = find_closest_repo(v, known_repos)\n", | |
"        if matched_repo is None:\n", | |
"            raise ValueError(\n", | |
"                f\"Unable to match repo {v} to a list of known repos of {known_repos}\"\n", | |
"            )\n", | |
"        return matched_repo\n", | |
"\n", | |
"    async def execute(self, conn: Connection, limit: int):\n", | |
"        \"\"\"Fetch at most `limit` rows of `github_issues` whose `repo_name` equals `self.repo`.\n", | |
"\n", | |
"        If `self.query` is non-empty it is embedded with `text-embedding-3-small` and\n", | |
"        the rows are ordered by `embedding <=> <query vector>`; otherwise the query\n", | |
"        has no ORDER BY clause.\n", | |
"\n", | |
"        Returns whatever `conn.fetch` returns for the rendered SQL.\n", | |
"        \"\"\"\n", | |
"        embedding = None\n", | |
"        args = [self.repo, limit]  # bound as $1 and $2\n", | |
"        if self.query:\n", | |
"            # Await the async client (imported in the notebook's first cell) rather than\n", | |
"            # calling the blocking OpenAI() client from inside a coroutine.\n", | |
"            response = await AsyncOpenAI().embeddings.create(\n", | |
"                input=self.query, model=\"text-embedding-3-small\"\n", | |
"            )\n", | |
"            embedding = response.data[0].embedding\n", | |
"            args.append(embedding)  # bound as $3, only referenced by ORDER BY\n", | |
"\n", | |
"        sql_query = Template(\n", | |
"            \"\"\"\n", | |
"            SELECT *\n", | |
"            FROM {{ table_name }}\n", | |
"            WHERE repo_name = $1\n", | |
"            {%- if embedding is not none %}\n", | |
"            ORDER BY embedding <=> $3\n", | |
"            {%- endif %}\n", | |
"            LIMIT $2\n", | |
"            \"\"\"\n", | |
"        ).render(table_name=\"github_issues\", embedding=embedding)\n", | |
"\n", | |
"        return await conn.fetch(sql_query, *args)\n", | |
"\n", | |
"class RunSQLReturnPandas(BaseModel):\n", | |
"    \"\"\"\n", | |
"    Use this function when the user wants to do time-series analysis or data analysis and we don't have a tool that can supply the necessary information\n", | |
"    \"\"\"\n", | |
"\n", | |
"    query: str = Field(description=\"Description of user's query\")\n", | |
"    repos: list[str] = Field(\n", | |
"        description=\"the repos to run the query on, should be in the format of 'owner/repo'\"\n", | |
"    )\n", | |
"\n", | |
"    async def execute(self, conn: Connection, limit: int):\n", | |
"        \"\"\"Placeholder: nothing is executed and None is returned.\"\"\"\n", | |
"        return None\n", | |
"\n", | |
"class SearchSummaries(BaseModel):\n", | |
"    \"\"\"\n", | |
"    This function retrieves summarized information about GitHub issues that match/are similar to a specific query, It's particularly useful for obtaining a quick snapshot of issue trends or patterns within a project.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    query: Optional[str] = Field(description=\"Relevant user query if any\")\n", | |
"    repo: str = Field(\n", | |
"        description=\"the repo to search for issues in, should be in the format of 'owner/repo'\"\n", | |
"    )\n", | |
"\n", | |
"    @field_validator(\"repo\")\n", | |
"    @classmethod\n", | |
"    def validate_repo(cls, v: str, info: ValidationInfo):\n", | |
"        \"\"\"Replace `v` with the closest known repo name.\n", | |
"\n", | |
"        The known repos are read from the validation context under the key\n", | |
"        \"repos\", e.g. `model_validate_json(..., context={\"repos\": [...]})`.\n", | |
"\n", | |
"        Raises:\n", | |
"            ValueError: if no known repos were supplied in the context, or if\n", | |
"                `v` does not match any of them closely enough.\n", | |
"        \"\"\"\n", | |
"        # info.context is None when the model is validated without a context;\n", | |
"        # indexing it directly would raise a TypeError instead of a validation error.\n", | |
"        known_repos = (info.context or {}).get(\"repos\")\n", | |
"        if not known_repos:\n", | |
"            raise ValueError(\n", | |
"                \"No known repos supplied: pass context={'repos': [...]} when validating this model\"\n", | |
"            )\n", | |
"\n", | |
"        matched_repo = find_closest_repo(v, known_repos)\n", | |
"        if matched_repo is None:\n", | |
"            raise ValueError(\n", | |
"                f\"Unable to match repo {v} to a list of known repos of {known_repos}\"\n", | |
"            )\n", | |
"        return matched_repo\n", | |
"\n", | |
"    async def execute(self, conn: Connection, limit: int):\n", | |
"        \"\"\"Fetch at most `limit` rows of `github_issue_summaries` whose `repo_name` equals `self.repo`.\n", | |
"\n", | |
"        If `self.query` is non-empty it is embedded with `text-embedding-3-small` and\n", | |
"        the rows are ordered by `embedding <=> <query vector>`; otherwise the query\n", | |
"        has no ORDER BY clause.\n", | |
"\n", | |
"        Returns whatever `conn.fetch` returns for the rendered SQL.\n", | |
"        \"\"\"\n", | |
"        embedding = None\n", | |
"        args = [self.repo, limit]  # bound as $1 and $2\n", | |
"        if self.query:\n", | |
"            # Await the async client (imported in the notebook's first cell) rather than\n", | |
"            # calling the blocking OpenAI() client from inside a coroutine.\n", | |
"            response = await AsyncOpenAI().embeddings.create(\n", | |
"                input=self.query, model=\"text-embedding-3-small\"\n", | |
"            )\n", | |
"            embedding = response.data[0].embedding\n", | |
"            args.append(embedding)  # bound as $3, only referenced by ORDER BY\n", | |
"\n", | |
"        sql_query = Template(\n", | |
"            \"\"\"\n", | |
"            SELECT *\n", | |
"            FROM {{ table_name }}\n", | |
"            WHERE repo_name = $1\n", | |
"            {%- if embedding is not none %}\n", | |
"            ORDER BY embedding <=> $3\n", | |
"            {%- endif %}\n", | |
"            LIMIT $2\n", | |
"            \"\"\"\n", | |
"        ).render(table_name=\"github_issue_summaries\", embedding=embedding)\n", | |
"\n", | |
"        return await conn.fetch(sql_query, *args)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "fff51d0c", | |
"metadata": {}, | |
"source": [ | |
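"Note: `field_validator` and `ValidationInfo` are missing from the imports in the article's code, so they are added to the `pydantic` import in the cell above." | |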
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 129, | |
"id": "c26ee44f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"User highlights confusion and difficulty in starting with Kubernetes on Ubuntu due to poor documentation. Calls for better synchronization between code changes and documentation updates. Discussion includes ongoing documentation overhaul efforts, acknowledgment of community support limitations, and individual frustration with Kubernetes leadership's response to documentation issues.\n", | |
"This issue addresses the need to improve the documentation on Kubernetes authentication, particularly focusing on certificate generation and configuration. Users reported problems with generating certificates and errors related to authentication, sharing solutions and insights about effective configurations. They also emphasized the importance of clear documentation and proposed enhancements to better explain the authentication processes within Kubernetes.\n", | |
"The issue revolves around improving capacity planning in Kubernetes by making cluster fullness and pending pod metrics more accessible to users. Key discussions include suggestions for clearer metrics, the impact of resource fragmentation, the functionality of horizontal autoscaling, and enhancing scheduling tools to monitor node selectors and available resources effectively.\n" | |
] | |
} | |
], | |
"source": [ | |
"repos = [\n", | |
"    \"rust-lang/rust\",\n", | |
"    \"kubernetes/kubernetes\",\n", | |
"    \"apache/spark\",\n", | |
"    \"golang/go\",\n", | |
"    \"tensorflow/tensorflow\",\n", | |
"]\n", | |
"\n", | |
"query = (\n", | |
"    \"What are the main problems people are facing with installation with Kubernetes\"\n", | |
")\n", | |
"limit = 10\n", | |
"\n", | |
"conn = await get_conn()\n", | |
"try:\n", | |
"    # The misspelled repo \"kuberntes\" is corrected by the model's repo validator,\n", | |
"    # which reads the known repos from the validation context.\n", | |
"    resp = await SearchSummaries.model_validate_json(\n", | |
"        json.dumps({\"query\": query, \"repo\": \"kuberntes\"}),\n", | |
"        context={\"repos\": repos},\n", | |
"    ).execute(conn, limit)\n", | |
"finally:\n", | |
"    # Close the connection even when validation or the search raises.\n", | |
"    await conn.close()\n", | |
"\n", | |
"for row in resp[:3]:\n", | |
"    print(row[\"text\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 132, | |
"id": "52189f66", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'{\"query\": \"What are the main problems people are facing with installation with Kubernetes\", \"repo\": \"kuberntes\"}'" | |
] | |
}, | |
"execution_count": 132, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"json.dumps({\"query\": query, \"repo\": \"kuberntes\"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 131, | |
"id": "c0f7a633", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"SearchSummaries(query='What are the main problems people are facing with installation with Kubernetes', repo='kubernetes/kubernetes')" | |
] | |
}, | |
"execution_count": 131, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show the validated model: the misspelled repo is replaced by the matched repo name.\n", | |
"SearchSummaries.model_validate_json(\n", | |
"    json.dumps({\"query\": query, \"repo\": \"kuberntes\"}),\n", | |
"    context={\"repos\": repos},\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "6b4ae7de", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #000080; text-decoration-color: #000080\">╭──────────────────────────────────────── </span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'__main__.SearchIssues'</span><span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\">></span><span style=\"color: #000080; text-decoration-color: #000080\"> ────────────────────────────────────────╮</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff; font-style: italic\">class </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">SearchIssues</span><span style=\"font-weight: bold\">(</span>*, query: Optional<span style=\"font-weight: bold\">[</span>str<span style=\"font-weight: bold\">]</span>, repo: str<span style=\"font-weight: bold\">)</span> -> <span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>: <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008080; text-decoration-color: #008080\">Use this when the user wants to get original issue information from the database</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_computed_fields</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_config</span> = <span style=\"font-weight: bold\">{}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_extra</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e40</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields</span> = <span style=\"font-weight: bold\">{</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'query'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span><span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">Union</span><span style=\"font-weight: bold\">[</span>str, NoneType<span style=\"font-weight: bold\">]</span>, <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span><span style=\"font-weight: bold\">)</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'repo'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FieldInfo</span><span style=\"font-weight: bold\">(</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">annotation</span>=<span style=\"color: #800080; text-decoration-color: #800080\">str</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">required</span>=<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">description</span>=<span style=\"color: #008000; text-decoration-color: #008000\">\"the repo to search for issues in, should be in the format of </span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #008000; text-decoration-color: #008000\">'owner/repo'\"</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">)</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"font-weight: bold\">}</span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-style: italic\">model_fields_set</span> = <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">property</span><span style=\"color: #000000; text-decoration-color: #000000\"> object at </span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0x7fab5a511e90</span><span style=\"font-weight: bold\">></span> <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n", | |
"<span style=\"color: #000080; text-decoration-color: #000080\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", | |
"</pre>\n" | |
], | |
"text/plain": [ | |
"\u001b[34m╭─\u001b[0m\u001b[34m───────────────────────────────────────\u001b[0m\u001b[34m \u001b[0m\u001b[1;34m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'__main__.SearchIssues'\u001b[0m\u001b[1;34m>\u001b[0m\u001b[34m \u001b[0m\u001b[34m───────────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;96mclass \u001b[0m\u001b[1;31mSearchIssues\u001b[0m\u001b[1m(\u001b[0m*, query: Optional\u001b[1m[\u001b[0mstr\u001b[1m]\u001b[0m, repo: str\u001b[1m)\u001b[0m -> \u001b[3;35mNone\u001b[0m: \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[36mUse this when the user wants to get original issue information from the database\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_computed_fields\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_config\u001b[0m = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_extra\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e40\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields\u001b[0m = \u001b[1m{\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'query'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[33mannotation\u001b[0m=\u001b[35mUnion\u001b[0m\u001b[1m[\u001b[0mstr, NoneType\u001b[1m]\u001b[0m, \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m\u001b[1m)\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'repo'\u001b[0m: \u001b[1;35mFieldInfo\u001b[0m\u001b[1m(\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mannotation\u001b[0m=\u001b[35mstr\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mrequired\u001b[0m=\u001b[3;92mTrue\u001b[0m, \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[33mdescription\u001b[0m=\u001b[32m\"the\u001b[0m\u001b[32m repo to search for issues in, should be in the format of \u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[32m'owner/repo'\"\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m)\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[1m}\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m│\u001b[0m \u001b[3;33mmodel_fields_set\u001b[0m = \u001b[1m<\u001b[0m\u001b[1;95mproperty\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7fab5a511e90\u001b[0m\u001b[1m>\u001b[0m \u001b[34m│\u001b[0m\n", | |
"\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"rinspect(SearchIssues)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6fb4c3ed", | |
"metadata": {}, | |
"source": [ | |
"# Implementing Text-to-SQL" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "49fc41a4", | |
"metadata": {}, | |
"source": [ | |
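"The text-to-SQL step is driven by the prompt template below; the `{self.repos}` and `{self.user_query_summary}` placeholders are filled in when the prompt is built.\n", | |
"\n", | |
"You are a SQL expert tasked with writing queries including a time attribute for the relevant table. The user wants to execute a query for the following repos: {self.repos} to answer the query of {self.user_query_summary}.\n", | |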
"\n", | |
"Key guidelines:\n", | |
"- Use the `repo_name` column for repository filtering.\n", | |
"- Employ the `time_bucket` function for time-based partitioning when specified.\n", | |
"- The `metadata` field is currently empty, so do not use it.\n", | |
"- Use the `issue_label` column in `github_issue_summaries` to determine issue status.\n", | |
"\n", | |
"-- Database Information Below--\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "9aafbbc7", | |
"metadata": {}, | |
"source": [ | |
"# Putting it all together" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"id": "e58be69a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import instructor\n", | |
"from pydantic import BaseModel\n", | |
"from asyncpg import Record\n", | |
"from typing import Optional\n", | |
"from jinja2 import Template\n", | |
"from openai import OpenAI\n", | |
"\n", | |
"# Structured output schema: `summarize_content` passes this class as `response_model`.\n", | |
"# Kept free of a class docstring on purpose, so the schema derived from the model is unchanged.\n", | |
"class Summary(BaseModel):\n", | |
" chain_of_thought: str  # NOTE(review): presumably the model's reasoning before it answers; confirm\n", | |
" summary: str  # the text printed as `summary.summary` in the final code cell" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 134, | |
"id": "0678e890", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def summarize_content(issues: list[Record], query: Optional[str]):\n", | |
" client = instructor.from_openai(OpenAI())\n", | |
" return client.chat.completions.create(\n", | |
" messages=[\n", | |
" {\n", | |
" \"role\": \"system\",\n", | |
" \"content\": \"\"\"You're a helpful assistant that summarizes information about issues from a github repository. Be sure to output your response in a single paragraph that is concise and to the point.\"\"\",\n", | |
" },\n", | |
" {\n", | |
" \"role\": \"user\",\n", | |
" \"content\": Template(\n", | |
" \"\"\"\n", | |
" Here are the relevant issues:\n", | |
" {% for issue in issues %}\n", | |
" - {{ issue['text'] }}\n", | |
" {% endfor %}\n", | |
" {% if query %}\n", | |
" My specific query is: {{ query }}\n", | |
" {% else %}\n", | |
" Please provide a broad summary and key insights from the issues above.\n", | |
" {% endif %}\n", | |
" \"\"\"\n", | |
" ).render(issues=issues, query=query),\n", | |
" },\n", | |
" ],\n", | |
" response_model=Summary,\n", | |
" model=\"gpt-4o-mini\",\n", | |
" )\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 135, | |
"id": "d75caeff", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def one_step_agent(question: str, repos: list[str]):\n", | |
" client = instructor.from_openai(OpenAI(), mode=instructor.Mode.PARALLEL_TOOLS)\n", | |
"\n", | |
" return client.chat.completions.create(\n", | |
" model=\"gpt-4o-mini\",\n", | |
" messages=[\n", | |
" {\n", | |
" \"role\": \"system\",\n", | |
" \"content\": \"You are an AI assistant that helps users query and analyze GitHub issues stored in a PostgreSQL database. Search for summaries when the user wants to understand the trends or patterns within a project. Otherwise just get the issues and return them. Only resort to SQL queries if the other tools are not able to answer the user's query.\",\n", | |
" },\n", | |
" {\n", | |
" \"role\": \"user\",\n", | |
" \"content\": Template(\n", | |
" \"\"\"\n", | |
" Here is the user's question: {{ question }}\n", | |
" Here is a list of repos that we have stored in our database. Choose the one that is most relevant to the user's query:\n", | |
" {% for repo in repos %}\n", | |
" - {{ repo }}\n", | |
" {% endfor %}\n", | |
" \"\"\"\n", | |
" ).render(question=question, repos=repos),\n", | |
" },\n", | |
" ],\n", | |
" validation_context={\"repos\": repos},\n", | |
" response_model=Iterable[\n", | |
" Union[\n", | |
" RunSQLReturnPandas,\n", | |
" SearchIssues,\n", | |
" SearchSummaries,\n", | |
" ]\n", | |
" ],\n", | |
" )\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 140, | |
"id": "94240f26", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"query = \"What are the main issues people face with endpoint connectivity between different pods in kubernetes?\"\n", | |
"repos = [\n", | |
" \"rust-lang/rust\",\n", | |
" \"kubernetes/kubernetes\",\n", | |
" \"apache/spark\",\n", | |
" \"golang/go\",\n", | |
" \"tensorflow/tensorflow\",\n", | |
" \"MicrosoftDocs/azure-docs\",\n", | |
" \"pytorch/pytorch\",\n", | |
" \"Microsoft/TypeScript\",\n", | |
" \"python/cpython\",\n", | |
" \"facebook/react\",\n", | |
" \"django/django\",\n", | |
" \"rails/rails\",\n", | |
" \"bitcoin/bitcoin\",\n", | |
" \"nodejs/node\",\n", | |
" \"ocaml/opam-repository\",\n", | |
" \"apache/airflow\",\n", | |
" \"scipy/scipy\",\n", | |
" \"vercel/next.js\",\n", | |
"]\n", | |
"\n", | |
"resp = one_step_agent(query, repos)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 141, | |
"id": "7559f9dd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[SearchSummaries(query='endpoint connectivity pods kubernetes', repo='kubernetes/kubernetes')]\n" | |
] | |
} | |
], | |
"source": [ | |
"conn = await get_conn()\n", | |
"limit = 10\n", | |
"\n", | |
"tools = [tool for tool in resp]\n", | |
"print(tools)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 142, | |
"id": "2c1fdead", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The issues reviewed do not specifically highlight endpoint connectivity problems between different pods in Kubernetes. Commonly faced endpoint connectivity challenges could include networking plugin misconfigurations, firewall settings, missing network interfaces, or issues with DNS resolution.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Only the first tool call is executed; any further entries in `tools` are ignored,\n", | |
"# and an empty `tools` list raises IndexError here.\n", | |
"result = await tools[0].execute(conn, limit)\n", | |
"\n", | |
"# Summarize what the tool returned, focused on the original question.\n", | |
"summary = summarize_content(result, query)\n", | |
"print(summary.summary)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "2260fe95", | |
"metadata": {}, | |
"source": [ | |
"# fin" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment