Last active
July 4, 2024 15:29
-
-
Save tspannhw/93fbc60c03a2933ba814118503636698 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "eb201b96-3bd6-4f54-9f3e-0de2aa1a5b2b", | |
| "metadata": {}, | |
| "source": [ | |
| "## 03-July-2024 == BM25 with Tim slides\n", | |
| "\n", | |
| "#### Tim Spann @PaaSDev\n", | |
| "\n", | |
| "### Milvus - Attu\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "### CODE + COMMUNITY\n", | |
| "\n", | |
| "Please join my meetup group NJ/NYC/Philly/Virtual. \n", | |
| "\n", | |
| "[https://www.meetup.com/unstructured-data-meetup-new-york/](https://www.meetup.com/unstructured-data-meetup-new-york/)\n", | |
| "\n", | |
| "\n", | |
| "#### Contact Us\n", | |
| "\n", | |
| "Get Milvused! [https://milvus.io/](https://milvus.io/)\n", | |
| "\n", | |
| "Read my Newsletter every week! [https://github.com/tspannhw/FLiPStackWeekly/blob/main/142-17June2024.md](https://github.com/tspannhw/FLiPStackWeekly/blob/main/142-17June2024.md)\n", | |
| "\n", | |
| "For more cool Unstructured Data, AI and Vector Database videos check out the Milvus vector database videos here\n", | |
| "[https://www.youtube.com/@MilvusVectorDatabase/videos](https://www.youtube.com/@MilvusVectorDatabase/videos)\n", | |
| "\n", | |
| "#### Unstructured Data Meetups \n", | |
| "\n", | |
| "[https://www.meetup.com/pro/unstructureddata/](https://www.meetup.com/pro/unstructureddata/)\n", | |
| "[https://zilliz.com/community/unstructured-data-meetup](https://zilliz.com/community/unstructured-data-meetup)\n", | |
| "[https://zilliz.com/event](https://zilliz.com/event)\n", | |
| "\n", | |
| "#### [Twitter/X](https://x.com/milvusio) \n", | |
| "\n", | |
| "#### [LinkedIn](https://www.linkedin.com/company/zilliz/)\n", | |
| "\n", | |
| "#### [Discord](https://discord.com/invite/FjCMmaJng6)\n", | |
| "\n", | |
| "#### [Blog](https://milvusio.medium.com/)\n", | |
| "\n", | |
| "#### Please star our [GitHub](https://github.com/milvus-io/milvus)\n", | |
| "\n", | |
| "#### [YouTube](https://www.youtube.com/@FLaNK-Stack)\n", | |
| "\n", | |
| "#### [Blog](https://medium.com/@tspann/subscribe)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 77, | |
| "id": "4deca7a7-2817-4f18-b6ab-d9397b3735cb", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "from pymilvus import MilvusClient\n", | |
| "from pymilvus import (\n", | |
| " utility,\n", | |
| " FieldSchema, CollectionSchema, DataType,\n", | |
| " Collection, AnnSearchRequest, RRFRanker, connections,\n", | |
| ")\n", | |
| "from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer\n", | |
| "from pymilvus.model.sparse import BM25EmbeddingFunction\n", | |
| "from pymilvus import model\n", | |
| "\n", | |
| "DIMENSION = 64 \n", | |
| "MILVUS_URL = \"http://192.168.1.163:19530\" \n", | |
| "COLLECTION_NAME = \"traveladvisories\"\n", | |
| "TRAVEL_URL = \"https://travel.state.gov/_res/rss/TAsTWs.xml\"\n", | |
| "\n", | |
| "# -----------------------------------------------------------------------------\n", | |
| "# Connect to Milvus\n", | |
| "\n", | |
| "# Local Docker Server\n", | |
| "milvus_client = MilvusClient( uri=MILVUS_URL )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 78, | |
| "id": "e4cd2db0-3d15-4980-a3b7-40634755402e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "{'state': <LoadState: NotLoad>}\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from pymilvus import connections\n", | |
| "from pymilvus import utility\n", | |
| "from pymilvus import FieldSchema, CollectionSchema, DataType, Collection\n", | |
| "import pprint\n", | |
| "\n", | |
| "## schema\n", | |
| "schema = milvus_client.create_schema(auto_id=True, enable_dynamic_fields=True)\n", | |
| "schema.add_field(field_name=\"pk\", datatype=DataType.VARCHAR, is_primary=True, max_length=100)\n", | |
| "schema.add_field(field_name=\"title\", datatype=DataType.VARCHAR, max_length=512)\n", | |
| "schema.add_field(field_name=\"link\", datatype=DataType.VARCHAR, max_length=512)\n", | |
| "schema.add_field(field_name=\"summary\", datatype=DataType.VARCHAR, max_length=50000)\n", | |
| "schema.add_field(field_name=\"publisheddate\", datatype=DataType.VARCHAR, max_length=100)\n", | |
| "schema.add_field(field_name=\"sparse_vector\", datatype=DataType.SPARSE_FLOAT_VECTOR)\n", | |
| "\n", | |
| "## index\n", | |
| "##index_params = milvus_client.prepare_index_params()\n", | |
| "\n", | |
| "##index_params.add_index(\n", | |
| "## field_name=\"summaryvector\",\n", | |
| "## index_type=\"SPARSE_INVERTED_INDEX\",\n", | |
| "## metric_type=\"IP\"\n", | |
| "##)\n", | |
| "# index_params=index_params\n", | |
| "## create collection\n", | |
| "milvus_client.create_collection(\n", | |
| " collection_name = COLLECTION_NAME,\n", | |
| " schema=schema\n", | |
| ")\n", | |
| "\n", | |
| "res = milvus_client.get_load_state(\n", | |
| " collection_name = COLLECTION_NAME\n", | |
| ")\n", | |
| "\n", | |
| "print(res)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 79, | |
| "id": "763d98dd-cc2b-4d56-9461-bbfec7b39194", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Build corpus\n", | |
| "\n", | |
| "analyzer = build_default_analyzer(language=\"en\") \n", | |
| "\n", | |
| "# Sample passages from the advisory documents, used only to fit the BM25 model below\n", | |
| "corpus = [\n", | |
| " \"Reissued after periodic review without changes. Exercise normal precautions in Bhutan. Read the country information page for additional information on travel to Bhutan. If you decide to travel to Bhutan: Enroll in the Smart Traveler Enrollment Program ( STEP ) to receive Alerts and make it easier to locate you in an emergency. Follow the Department of State on Facebook and Twitter . Review the Country Security Report for Bhutan. Visit the CDC page for the latest Travel Health Information related to your travel. Prepare a contingency plan for emergency situations. Review the Traveler’s Checklist\",\n", | |
| " \"Reissued with obsolete COVID-19 page links removed\",\n", | |
| " \"Exercise increased caution in Tajikistan due to terrorism, unexploded landmines, and occasional violence near the border with Kyrgyzstan\",\n", | |
| " \"Prepare a contingency plan for emergency situations. Review the Traveler’s Checklist\",\n", | |
| " \"If you decide to travel to Zambia: Enroll in the Smart Traveler Enrollment Program ( STEP ) to receive Alerts and make it easier to locate you in an emergency. Follow the Department of State on Facebook and Twitter . Follow the U.S. Embassy in Zambia on Facebook and Twitter . Review the Country Security Report for Zambia. Prepare a contingency plan for emergency situations. Review the Traveler’s Checklist . Visit the CDC page for the latest Travel Health Information related to your travel\",\n", | |
| " \"Exercise normal precautions in Barbados. Read the country information page for additional information on travel to Barbados. If you decide to travel to Barbados\",\n", | |
| "]\n", | |
| "\n", | |
| "# Use the analyzer to instantiate the BM25EmbeddingFunction\n", | |
| "bm25_ef = BM25EmbeddingFunction(analyzer)\n", | |
| "\n", | |
| "# Fit the model on the corpus to get the statistics of the corpus\n", | |
| "bm25_ef.fit(corpus)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 87, | |
| "id": "a7a384ba-86fe-4df7-8086-d50a4158f271", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Bhutan - Level 1: Exercise Normal Precautions\n", | |
| "Senegal - Level 1: Exercise Normal Precautions\n", | |
| "Tajikistan - Level 2: Exercise Increased Caution\n", | |
| "Iceland - Level 1: Exercise Normal Precautions\n", | |
| "Mainland China, Hong Kong & Macau - See Summaries - Level 3: Reconsider Travel\n", | |
| "Zambia - Level 1: Exercise Normal Precautions\n", | |
| "Armenia - Level 2: Exercise Increased Caution\n", | |
| "Syria - Level 4: Do Not Travel\n", | |
| "Bolivia - Level 2: Exercise Increased Caution\n", | |
| "Taiwan - Level 1: Exercise Normal Precautions\n", | |
| "Rwanda - Level 1: Exercise Normal Precautions\n", | |
| "Uruguay - Level 2: Exercise Increased Caution\n", | |
| "Barbados - Level 1: Exercise Normal Precautions\n", | |
| "North Korea - Level 4: Do Not Travel\n", | |
| "Chad - Level 3: Reconsider Travel\n", | |
| "Hungary - Level 1: Exercise Normal Precautions\n", | |
| "Malta - Level 1: Exercise Normal Precautions\n", | |
| "Burma (Myanmar) - Level 4: Do Not Travel\n", | |
| "Qatar - Level 1: Exercise Normal Precautions\n", | |
| "Saudi Arabia - Level 3: Reconsider Travel\n", | |
| "Estonia - Level 1: Exercise Normal Precautions\n", | |
| "British Virgin Islands - Level 1: Exercise Normal Precautions\n", | |
| "The Bahamas - Level 2: Exercise Increased Caution\n", | |
| "Montenegro - Level 1: Exercise Normal Precautions\n", | |
| "Argentina - Level 1: Exercise Normal Precautions\n", | |
| "Belgium - Level 2: Exercise Increased Caution\n", | |
| "Jordan - Level 2: Exercise Increased Caution\n", | |
| "Sudan - Level 4: Do Not Travel\n", | |
| "Cuba - Level 2: Exercise Increased Caution\n", | |
| "Slovakia - Level 1: Exercise Normal Precautions\n", | |
| "Guinea-Bissau - Level 3: Reconsider Travel\n", | |
| "Cyprus - Level 1: Exercise Normal Precautions\n", | |
| "Eritrea - Level 2: Exercise Increased Caution\n", | |
| "Morocco - Level 2: Exercise Increased Caution\n", | |
| "Germany - Level 2: Exercise Increased Caution\n", | |
| "Jamaica - Level 3: Reconsider Travel\n", | |
| "Algeria - Level 2: Exercise Increased Caution\n", | |
| "Paraguay - Level 1: Exercise Normal Precautions\n", | |
| "Andorra - Level 1: Exercise Normal Precautions\n", | |
| "Lesotho - Level 1: Exercise Normal Precautions\n", | |
| "Dominica - Level 1: Exercise Normal Precautions\n", | |
| "French West Indies - Level 1: Exercise Normal Precautions\n", | |
| "Turkey - Level 2: Exercise Increased Caution\n", | |
| "Czech Republic - Level 1: Exercise Normal Precautions\n", | |
| "Portugal - Level 1: Exercise Normal Precautions\n", | |
| "Afghanistan - Level 4: Do Not Travel\n", | |
| "Suriname - Level 1: Exercise Normal Precautions\n", | |
| "Guatemala - Level 3: Reconsider Travel\n", | |
| "Curaçao - Level 1: Exercise Normal Precautions\n", | |
| "Oman - Level 2: Exercise Increased Caution\n", | |
| "Brazil - Level 2: Exercise Increased Caution\n", | |
| "Austria - Level 1: Exercise Normal Precautions\n", | |
| "Azerbaijan - Level 2: Exercise Increased Caution\n", | |
| "Cameroon - Level 2: Exercise Increased Caution\n", | |
| "Colombia - Level 3: Reconsider Travel\n", | |
| "Georgia - Level 1: Exercise Normal Precautions\n", | |
| "Saint Vincent and the Grenadines - Level 1: Exercise Normal Precautions\n", | |
| "Chile - Level 2: Exercise Increased Caution\n", | |
| "Canada - Level 1: Exercise Normal Precautions\n", | |
| "Belarus - Level 4: Do Not Travel\n", | |
| "Angola - Level 2: Exercise Increased Caution\n", | |
| "Luxembourg - Level 1: Exercise Normal Precautions\n", | |
| "Tuvalu - Level 1: Exercise Normal Precautions\n", | |
| "Kiribati - Level 1: Exercise Normal Precautions\n", | |
| "Zimbabwe - Level 2: Exercise Increased Caution\n", | |
| "Anguilla - Level 1: Exercise Normal Precautions\n", | |
| "North Macedonia - Level 1: Exercise Normal Precautions\n", | |
| "Japan - Level 1: Exercise Normal Precautions\n", | |
| "Bangladesh - Level 2: Exercise Increased Caution\n", | |
| "Ghana - Level 2: Exercise Increased Caution\n", | |
| "Aruba - Level 1: Exercise Normal Precautions\n", | |
| "Sweden - Level 2: Exercise Increased Caution\n", | |
| "French Guiana - Level 1: Exercise Normal Precautions\n", | |
| "Saint Kitts and Nevis - Level 1: Exercise Normal Precautions\n", | |
| "Gabon - Level 2: Exercise Increased Caution\n", | |
| "Mongolia - Level 1: Exercise Normal Precautions\n", | |
| "El Salvador - Level 3: Reconsider Travel\n", | |
| "Madagascar - Level 2: Exercise Increased Caution\n", | |
| "Poland - Level 1: Exercise Normal Precautions\n", | |
| "Mauritius - Level 1: Exercise Normal Precautions\n", | |
| "Moldova - Level 2: Exercise Increased Caution\n", | |
| "Namibia - Level 2: Exercise Increased Caution\n", | |
| "Nigeria - Level 3: Reconsider Travel\n", | |
| "Tunisia - Level 2: Exercise Increased Caution\n", | |
| "Maldives - Level 2: Exercise Increased Caution\n", | |
| "Greece - Level 1: Exercise Normal Precautions\n", | |
| "Central African Republic - Level 4: Do Not Travel\n", | |
| "Somalia - Level 4: Do Not Travel\n", | |
| "Grenada - Level 1: Exercise Normal Precautions\n", | |
| "Norway - Level 1: Exercise Normal Precautions\n", | |
| "Tanzania - Level 2: Exercise Increased Caution\n", | |
| "Tonga - Level 1: Exercise Normal Precautions\n", | |
| "South Sudan - Level 4: Do Not Travel\n", | |
| "Ukraine - Level 4: Do Not Travel\n", | |
| "Ireland - Level 1: Exercise Normal Precautions\n", | |
| "Palau - Level 1: Exercise Normal Precautions\n", | |
| "Russia - Level 4: Do Not Travel\n", | |
| "Sao Tome and Principe - Level 1: Exercise Normal Precautions\n", | |
| "Antarctica - Level 2: Exercise Increased Caution\n", | |
| "Democratic Republic of the Congo - Level 3: Reconsider Travel\n", | |
| "Ecuador - Level 2: Exercise Increased Caution\n", | |
| "Indonesia - Level 2: Exercise Increased Caution\n", | |
| "Latvia - Level 1: Exercise Normal Precautions\n", | |
| "Philippines - Level 2: Exercise Increased Caution\n", | |
| "Marshall Islands - Level 1: Exercise Normal Precautions\n", | |
| "Togo - Level 1: Exercise Normal Precautions\n", | |
| "Uganda - Level 3: Reconsider Travel\n", | |
| "Finland - Level 1: Exercise Normal Precautions\n", | |
| "Croatia - Level 1: Exercise Normal Precautions\n", | |
| "United Kingdom - Level 2: Exercise Increased Caution\n", | |
| "Turks and Caicos Islands - Level 2: Exercise Increased Caution\n", | |
| "Nicaragua - Level 3: Reconsider Travel\n", | |
| "Cambodia - Level 1: Exercise Normal Precautions\n", | |
| "Bosnia and Herzegovina - Level 2: Exercise Increased Caution\n", | |
| "Dominican Republic - Level 2: Exercise Increased Caution\n", | |
| "Spain - Level 2: Exercise Increased Caution\n", | |
| "Liechtenstein - Level 1: Exercise Normal Precautions\n", | |
| "Brunei - Level 1: Exercise Normal Precautions\n", | |
| "Uzbekistan - Level 1: Exercise Normal Precautions\n", | |
| "Kenya - Level 2: Exercise Increased Caution\n", | |
| "Saint Lucia - Level 1: Exercise Normal Precautions\n", | |
| "Benin - Level 2: Exercise Increased Caution\n", | |
| "Papua New Guinea - Level 3: Reconsider Travel\n", | |
| "Slovenia - Level 1: Exercise Normal Precautions\n", | |
| "Sri Lanka - Level 2: Exercise Increased Caution\n", | |
| "French Polynesia - Level 1: Exercise Normal Precautions\n", | |
| "Vietnam - Level 1: Exercise Normal Precautions\n", | |
| "Antigua and Barbuda - Level 1: Exercise Normal Precautions\n", | |
| "Burkina Faso - Level 4: Do Not Travel\n", | |
| "Switzerland - Level 1: Exercise Normal Precautions\n", | |
| "See Individual Summaries -\n", | |
| "Singapore - Level 1: Exercise Normal Precautions\n", | |
| "Iraq - Level 4: Do Not Travel\n", | |
| "Albania - Level 2: Exercise Increased Caution\n", | |
| "The Gambia - Level 2: Exercise Increased Caution\n", | |
| "Netherlands - Level 2: Exercise Increased Caution\n", | |
| "Nauru - Level 1: Exercise Normal Precautions\n", | |
| "Seychelles - Level 1: Exercise Normal Precautions\n", | |
| "Yemen - Level 4: Do Not Travel\n", | |
| "Comoros - Level 2: Exercise Increased Caution\n", | |
| "Trinidad and Tobago - Level 3: Reconsider Travel\n", | |
| "Mozambique - Level 2: Exercise Increased Caution\n", | |
| "Micronesia - Level 1: Exercise Normal Precautions\n", | |
| "Kingdom of Denmark - Level 2: Exercise Increased Caution\n", | |
| "Nepal - Level 2: Exercise Increased Caution\n", | |
| "Honduras - Level 3: Reconsider Travel\n", | |
| "Mali - Level 4: Do Not Travel\n", | |
| "Equatorial Guinea - Level 2: Exercise Increased Caution\n", | |
| "Kazakhstan - Level 1: Exercise Normal Precautions\n", | |
| "Laos - Level 2: Exercise Increased Caution\n", | |
| "Djibouti - Level 2: Exercise Increased Caution\n", | |
| "Sierra Leone - Level 2: Exercise Increased Caution\n", | |
| "Peru - Level 2: Exercise Increased Caution\n", | |
| "Lithuania - Level 1: Exercise Normal Precautions\n", | |
| "Fiji - Level 1: Exercise Normal Precautions\n", | |
| "Egypt - Level 3: Reconsider Travel\n", | |
| "Serbia - Level 2: Exercise Increased Caution\n", | |
| "Cabo Verde - Level 1: Exercise Normal Precautions\n", | |
| "Haiti - Level 4: Do Not Travel\n", | |
| "Thailand - Level 1: Exercise Normal Precautions\n", | |
| "New Caledonia - Level 3: Reconsider Travel\n", | |
| "Burundi - Level 3: Reconsider Travel\n", | |
| "Costa Rica - Level 2: Exercise Increased Caution\n", | |
| "South Korea - Level 1: Exercise Normal Precautions\n", | |
| "Pakistan - Level 3: Reconsider Travel\n", | |
| "Venezuela - Level 4: Do Not Travel\n", | |
| "Solomon Islands - Level 2: Exercise Increased Caution\n", | |
| "Belize - Level 2: Exercise Increased Caution\n", | |
| "Malaysia - Level 1: Exercise Normal Precautions\n", | |
| "United Arab Emirates - Level 2: Exercise Increased Caution\n", | |
| "Liberia - Level 2: Exercise Increased Caution\n", | |
| "Timor-Leste - Level 2: Exercise Increased Caution\n", | |
| "Iran - Level 4: Do Not Travel\n", | |
| "Mauritania - Level 3: Reconsider Travel\n", | |
| "Guinea - Level 3: Reconsider Travel\n", | |
| "Guyana - Level 3: Reconsider Travel\n", | |
| "Niger - Level 3: Reconsider Travel\n", | |
| "Republic of the Congo - Level 2: Exercise Increased Caution\n", | |
| "Botswana - Level 2: Exercise Increased Caution\n", | |
| "Samoa - Level 1: Exercise Normal Precautions\n", | |
| "Bahrain - Level 2: Exercise Increased Caution\n", | |
| "Kosovo - Level 2: Exercise Increased Caution\n", | |
| "Cayman Islands - Level 1: Exercise Normal Precautions\n", | |
| "Malawi - Level 2: Exercise Increased Caution\n", | |
| "Kyrgyzstan - Level 1: Exercise Normal Precautions\n", | |
| "Bulgaria - Level 1: Exercise Normal Precautions\n", | |
| "Eswatini - Level 2: Exercise Increased Caution\n", | |
| "Turkmenistan - Level 1: Exercise Normal Precautions\n", | |
| "Australia - Level 1: Exercise Normal Precautions\n", | |
| "Bermuda - Level 1: Exercise Normal Precautions\n", | |
| "France - Level 2: Exercise Increased Caution\n", | |
| "Sint Maarten - Level 1: Exercise Normal Precautions\n", | |
| "Kuwait - Level 1: Exercise Normal Precautions\n", | |
| "Libya - Level 4: Do Not Travel\n", | |
| "New Zealand - Level 1: Exercise Normal Precautions\n", | |
| "Ethiopia - Level 3: Reconsider Travel\n", | |
| "South Africa - Level 2: Exercise Increased Caution\n", | |
| "Panama - Level 2: Exercise Increased Caution\n", | |
| "Romania - Level 1: Exercise Normal Precautions\n", | |
| "Italy - Level 2: Exercise Increased Caution\n", | |
| "Cote d Ivoire - Level 2: Exercise Increased Caution\n", | |
| "Montserrat - Level 1: Exercise Normal Precautions\n", | |
| "Lebanon - Level 3: Reconsider Travel\n", | |
| "Sint Eustatius - Level 1: Exercise Normal Precautions\n", | |
| "Bonaire - Level 1: Exercise Normal Precautions\n", | |
| "Saba - Level 1: Exercise Normal Precautions\n", | |
| "Mexico - See State Summaries - Level 3: Reconsider Travel\n", | |
| "India - Level 2: Exercise Increased Caution\n", | |
| "Worldwide Caution - Caution\n", | |
| "Vanuatu - Level 1: Exercise Normal Precautions\n", | |
| "Hong Kong - Level 2: Exercise Increased Caution\n", | |
| "Macau - Level 3: Reconsider Travel\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "RPC error: [insert_rows], <MilvusException: (code=1100, message=the length (5479) of 7th string exceeds max length (4096): invalid parameter[expected=valid length string][actual=string length exceeds max length])>, <Time:{'RPC start': '2024-07-04 11:28:36.813465', 'RPC error': '2024-07-04 11:28:36.870016'}>\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<MilvusException: (code=1100, message=the length (5479) of 7th string exceeds max length (4096): invalid parameter[expected=valid length string][actual=string length exceeds max length])>\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import feedparser\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "\n", | |
| "# Function to remove tags\n", | |
| "def remove_tags(html):\n", | |
| "\n", | |
| " # parse html content\n", | |
| " soup = BeautifulSoup(html, \"html.parser\")\n", | |
| "\n", | |
| " for data in soup(['style', 'script']):\n", | |
| " # Remove tags\n", | |
| " data.decompose()\n", | |
| "\n", | |
| " # return data by retrieving the tag content\n", | |
| " return ' '.join(soup.stripped_strings)\n", | |
| "\n", | |
| "# Lets Explore What our Data Looks like\n", | |
| "# Travel Advisories\n", | |
| "feed = feedparser.parse(TRAVEL_URL)\n", | |
| "\n", | |
| "# print(feed)\n", | |
| "\n", | |
| "summaries = [] \n", | |
| "titles = []\n", | |
| "links = []\n", | |
| "publisheddates = []\n", | |
| "\n", | |
| "for post in feed.entries:\n", | |
| " try:\n", | |
| " print(post.title)\n", | |
| " summary = remove_tags(str(post.summary))\n", | |
| " summaries.append(summary)\n", | |
| " titles.append(str(post.title))\n", | |
| " links.append(str(post.link))\n", | |
| " publisheddates.append(str(post.published))\n", | |
| " except Exception as e: \n", | |
| " print(e)\n", | |
| "\n", | |
| "try:\n", | |
| " summaryvector = bm25_ef.encode_documents(summaries)\n", | |
| " i = 0\n", | |
| " \n", | |
| " #entities = [\n", | |
| " # {\n", | |
| " # \"title\": titles[i],\n", | |
| " # \"link\": links[i],\n", | |
| " # \"summary\": summaries[i],\n", | |
| " # \"publisheddate\": publisheddates[i],\n", | |
| " # \"sparse_vector\": embedding.todok(),\n", | |
| " # }\n", | |
| " # for embedding in summaryvector\n", | |
| " # ]\n", | |
| "\n", | |
| " entities = []\n", | |
| " i = 0\n", | |
| " for embedding in summaryvector:\n", | |
| " entities.append({\n", | |
| " \"title\": titles[i],\n", | |
| " \"link\": links[i],\n", | |
| " \"summary\": summaries[i],\n", | |
| " \"publisheddate\": publisheddates[i],\n", | |
| " \"sparse_vector\": embedding.todok(),\n", | |
| " })\n", | |
| " i += 1\n", | |
| "\n", | |
| " # for embedding in summaryvector\n", | |
| " # for i in range(summaryvector.shape[0])\n", | |
| " # Insert entities\n", | |
| " res = milvus_client.insert(collection_name=COLLECTION_NAME, data=entities)\n", | |
| " # print(res)\n", | |
| "except Exception as e: \n", | |
| " print(e)\n", | |
| "\n", | |
| "index_params = milvus_client.prepare_index_params()\n", | |
| "index_params.add_index(\n", | |
| " field_name=\"sparse_vector\",\n", | |
| " index_name=\"sparse_inverted_index\",\n", | |
| " index_type=\"SPARSE_INVERTED_INDEX\",\n", | |
| " metric_type=\"IP\",\n", | |
| " params={\"drop_ratio_build\": 0.2},\n", | |
| ")\n", | |
| "\n", | |
| "milvus_client.create_index(collection_name=COLLECTION_NAME, index_params=index_params)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "2e796aaf-e5dc-4ca1-bbb3-42155be4e6f1", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Collecting feedparser\n", | |
| " Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)\n", | |
| "Collecting sgmllib3k (from feedparser)\n", | |
| " Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)\n", | |
| " Preparing metadata (setup.py) ... \u001b[?25ldone\n", | |
| "\u001b[?25hDownloading feedparser-6.0.11-py3-none-any.whl (81 kB)\n", | |
| "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.3/81.3 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hBuilding wheels for collected packages: sgmllib3k\n", | |
| " Building wheel for sgmllib3k (setup.py) ... \u001b[?25ldone\n", | |
| "\u001b[?25h Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6049 sha256=ee9f7c228edf44b1ad3eaec8ee7c7422c44b552d3a2c0bbe75c69b9f39274883\n", | |
| " Stored in directory: /Users/timothyspann/Library/Caches/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246\n", | |
| "Successfully built sgmllib3k\n", | |
| "Installing collected packages: sgmllib3k, feedparser\n", | |
| "Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip3 install feedparser" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 73, | |
| "id": "1906f66f-bf10-40e8-bcb4-a44757b27de7", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: bs4 in ./milvusvenv/lib/python3.12/site-packages (0.0.2)\n", | |
| "Requirement already satisfied: beautifulsoup4 in ./milvusvenv/lib/python3.12/site-packages (from bs4) (4.12.3)\n", | |
| "Requirement already satisfied: soupsieve>1.2 in ./milvusvenv/lib/python3.12/site-packages (from beautifulsoup4->bs4) (2.5)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip3 install bs4" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "a151f840-91c7-41ab-896f-55143c262a78", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\n", | |
| "\n", | |
| "search_results = milvus_client.search(\n", | |
| " collection_name=COLLECTION_NAME, # Collection name\n", | |
| " data=query_vector, # Replace with your query vector\n", | |
| " search_params={\n", | |
| " \"metric_type\": \"IP\"\n", | |
| " }, # Search parameters\n", | |
| " limit=10, # Max. number of search results to return\n", | |
| " output_fields=[\"pk\",\"title\",\"link\",\"summary\",\"publisheddate\"], # Fields to return in the search results\n", | |
| " consistency_level=\"Eventually\"\n", | |
| ")\n", | |
| "\n", | |
| "# Print search results\n", | |
| "for hits in search_results:\n", | |
| " for hit in hits:\n", | |
| " print(f\"Hit: {hit}\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment