Created
February 18, 2023 14:28
-
-
Save hsm207/ffb2122acade111c081b120b1db367e3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'hostname': 'http://[::]:8080',\n", | |
" 'modules': {'ref2vec-centroid': {},\n", | |
" 'text2vec-transformers': {'model': {'_name_or_path': 'sentence-transformers/msmarco-distilroberta-base-v2',\n", | |
" 'add_cross_attention': False,\n", | |
" 'architectures': ['RobertaModel'],\n", | |
" 'attention_probs_dropout_prob': 0.1,\n", | |
" 'bad_words_ids': None,\n", | |
" 'bos_token_id': 0,\n", | |
" 'chunk_size_feed_forward': 0,\n", | |
" 'decoder_start_token_id': None,\n", | |
" 'diversity_penalty': 0,\n", | |
" 'do_sample': False,\n", | |
" 'early_stopping': False,\n", | |
" 'encoder_no_repeat_ngram_size': 0,\n", | |
" 'eos_token_id': 2,\n", | |
" 'finetuning_task': None,\n", | |
" 'forced_bos_token_id': None,\n", | |
" 'forced_eos_token_id': None,\n", | |
" 'gradient_checkpointing': False,\n", | |
" 'hidden_act': 'gelu',\n", | |
" 'hidden_dropout_prob': 0.1,\n", | |
" 'hidden_size': 768,\n", | |
" 'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},\n", | |
" 'initializer_range': 0.02,\n", | |
" 'intermediate_size': 3072,\n", | |
" 'is_decoder': False,\n", | |
" 'is_encoder_decoder': False,\n", | |
" 'label2id': {'LABEL_0': 0, 'LABEL_1': 1},\n", | |
" 'layer_norm_eps': 1e-05,\n", | |
" 'length_penalty': 1,\n", | |
" 'max_length': 20,\n", | |
" 'max_position_embeddings': 514,\n", | |
" 'min_length': 0,\n", | |
" 'model_type': 'roberta',\n", | |
" 'no_repeat_ngram_size': 0,\n", | |
" 'num_attention_heads': 12,\n", | |
" 'num_beam_groups': 1,\n", | |
" 'num_beams': 1,\n", | |
" 'num_hidden_layers': 6,\n", | |
" 'num_return_sequences': 1,\n", | |
" 'output_attentions': False,\n", | |
" 'output_hidden_states': False,\n", | |
" 'output_scores': False,\n", | |
" 'pad_token_id': 1,\n", | |
" 'position_embedding_type': 'absolute',\n", | |
" 'prefix': None,\n", | |
" 'problem_type': None,\n", | |
" 'pruned_heads': {},\n", | |
" 'remove_invalid_values': False,\n", | |
" 'repetition_penalty': 1,\n", | |
" 'return_dict': True,\n", | |
" 'return_dict_in_generate': False,\n", | |
" 'sep_token_id': None,\n", | |
" 'task_specific_params': None,\n", | |
" 'temperature': 1,\n", | |
" 'tie_encoder_decoder': False,\n", | |
" 'tie_word_embeddings': True,\n", | |
" 'tokenizer_class': None,\n", | |
" 'top_k': 50,\n", | |
" 'top_p': 1,\n", | |
" 'torchscript': False,\n", | |
" 'transformers_version': '4.6.1',\n", | |
" 'type_vocab_size': 1,\n", | |
" 'use_bfloat16': False,\n", | |
" 'use_cache': True,\n", | |
" 'vocab_size': 50265}}},\n", | |
" 'version': '1.17.2'}" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from weaviate import Client\n", | |
"\n", | |
"# Connect to the Weaviate instance at weaviate:8080\n", | |
"client = Client(\"http://weaviate:8080\")\n", | |
"\n", | |
"client.batch.configure(batch_size=1)\n", | |
"\n", | |
"# call the meta endpoint\n", | |
"client.get_meta()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# delete the Article and Paragraph classes\n", | |
"client.schema.delete_class(\"Article\")\n", | |
"client.schema.delete_class(\"Paragraph\")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create a class called Paragraph with text property named content\n", | |
"client.schema.create_class(\n", | |
" {\n", | |
" \"class\": \"Paragraph\",\n", | |
" \"description\": \"A paragraph of text\",\n", | |
" \"properties\": [\n", | |
" {\n", | |
" \"name\": \"content\",\n", | |
" \"dataType\": [\"text\"],\n", | |
" \"description\": \"The text of the paragraph\",\n", | |
" }\n", | |
" ],\n", | |
" \"vectorizer\": \"text2vec-transformers\",\n", | |
" }\n", | |
")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create a class called Article with string property named title and cross reference to Paragraph\n", | |
"# vectorize using ref2vec\n", | |
"client.schema.create_class(\n", | |
" {\n", | |
" \"class\": \"Article\",\n", | |
" \"description\": \"An article\",\n", | |
" \"properties\": [\n", | |
" {\n", | |
" \"name\": \"title\",\n", | |
" \"dataType\": [\"string\"],\n", | |
" \"description\": \"The title of the article\",\n", | |
" },\n", | |
" {\n", | |
" \"name\": \"paragraphs\",\n", | |
" \"dataType\": [\"Paragraph\"],\n", | |
" \"description\": \"The paragraphs of the article\",\n", | |
" }\n", | |
" ],\n", | |
" \"moduleConfig\": {\n", | |
" \"ref2vec-centroid\": {\n", | |
" \"referenceProperties\": [\"paragraphs\"],\n", | |
" \"method\": \"mean\"\n", | |
" }\n", | |
" },\n", | |
" \"vectorizer\": \"ref2vec-centroid\"\n", | |
" }\n", | |
")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create two articles\n", | |
"# both articles have 2 paragraphs\n", | |
"# the title for the first article is \"A\"\n", | |
"# the title for the second article is \"B\"\n", | |
"# the paragraphs for the first article are [\"lorem ipsum\", \"dolor sit amet\"]\n", | |
"# the paragraphs for the second article are [\"the quick brown fox\", \"jumps over the lazy dog\"]\n", | |
"\n", | |
"articles = {\n", | |
" \"A\": [\"lorem ipsum\", \"dolor sit amet\"],\n", | |
" \"B\": [\"the quick brown fox\", \"jumps over the lazy dog\"]\n", | |
"}\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create article A in weaviate\n", | |
"article_title = \"A\"\n", | |
"article_paragraphs = articles[article_title]\n", | |
"\n", | |
"with client.batch as batch:\n", | |
" article_uuid = batch.add_data_object(\n", | |
" {\n", | |
" \"title\": article_title,\n", | |
" },\n", | |
" class_name=\"Article\",\n", | |
" )\n", | |
"\n", | |
" for paragraph in article_paragraphs:\n", | |
" paragraph_uuid = batch.add_data_object(\n", | |
" {\n", | |
" \"content\": paragraph,\n", | |
" },\n", | |
" class_name=\"Paragraph\",\n", | |
"\n", | |
" )\n", | |
"\n", | |
" batch.add_reference(\n", | |
" from_object_uuid=article_uuid,\n", | |
" from_object_class_name=\"Article\",\n", | |
" from_property_name=\"paragraphs\",\n", | |
" to_object_uuid=paragraph_uuid,\n", | |
" to_object_class_name=\"Paragraph\",\n", | |
" )\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create article B in weaviate\n", | |
"article_title = \"B\"\n", | |
"article_paragraphs = articles[article_title]\n", | |
"\n", | |
"article_uuid = client.data_object.create(\n", | |
" {\n", | |
" \"title\": article_title,\n", | |
" },\n", | |
" class_name=\"Article\",\n", | |
")\n", | |
"\n", | |
"for paragraph in article_paragraphs:\n", | |
" paragraph_uuid = client.data_object.create(\n", | |
" {\n", | |
" \"content\": paragraph,\n", | |
" },\n", | |
" class_name=\"Paragraph\",\n", | |
" )\n", | |
"\n", | |
" client.data_object.reference.add(\n", | |
" from_uuid=article_uuid,\n", | |
" from_property_name=\"paragraphs\",\n", | |
" to_uuid=paragraph_uuid,\n", | |
" from_class_name=\"Article\",\n", | |
" to_class_name=\"Paragraph\",\n", | |
" )\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# get the title and vectors of the articles\n", | |
"article_A = client.query.get(\"Article\", \"title\")\\\n", | |
" .with_additional(\"vector\")\\\n", | |
" .with_where({\n", | |
" \"path\": [\"title\"],\n", | |
" \"operator\": \"Equal\",\n", | |
" \"valueString\": \"A\"\n", | |
" })\\\n", | |
" .do()[\"data\"][\"Get\"][\"Article\"]\n", | |
"\n", | |
"article_B = client.query.get(\"Article\", \"title\")\\\n", | |
" .with_additional(\"vector\")\\\n", | |
" .with_where({\n", | |
" \"path\": [\"title\"],\n", | |
" \"operator\": \"Equal\",\n", | |
" \"valueString\": \"B\"\n", | |
" })\\\n", | |
" .do()[\"data\"][\"Get\"][\"Article\"]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# assert that article A has no vector\n", | |
"assert not article_A[0][\"_additional\"][\"vector\"]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# assert that article B has a vector\n", | |
"assert article_B[0][\"_additional\"][\"vector\"]\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.9" | |
}, | |
"orig_nbformat": 4, | |
"vscode": { | |
"interpreter": { | |
"hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment