JonathanLoscalzo · July 1, 2024 18:51
diff --git a/docker-compose.yml b/docker-compose.yml
 version: '3.8'

 services:
  # Single-node Elasticsearch with security disabled; HTTP API published on host port 9200.
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.14.1
    container_name: elasticsearch
    environment:
      - bootstrap.memory_lock=true
      - discovery.type=single-node
      - xpack.security.enabled=false
      # Fixed 1 GiB JVM heap (initial size == maximum size).
      - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m"
    # bootstrap.memory_lock=true can only lock the JVM memory when the
    # container is allowed an unlimited memlock; without this the lock fails.
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9200:9200"
      - "9300:9300"

  # Ollama server; /root/.ollama lives in a named volume so its contents
  # survive container re-creation.
  ollama:
    image: ollama/ollama
    container_name: ollama
    volumes:
      - ollama:/root/.ollama
    ports:
      - "11434:11434"

 volumes:
  # Named volume with default driver settings (explicit empty mapping).
  ollama: {}
diff --git a/ex1_insert.py b/ex1_insert.py
 import requests

 docs_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1"
 # Bound the request time so a stalled connection cannot hang the script.
 docs_response = requests.get(docs_url, timeout=30)
 # Fail on an HTTP error status instead of trying to parse an error page as JSON.
 docs_response.raise_for_status()
 documents_raw = docs_response.json()

 # Flatten the per-course groups into one list, tagging every document
 # with the name of the course it belongs to.
 documents = []

 for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)

 from elasticsearch import Elasticsearch

 index_name = "course-questions"

 # Field types: the three free-text fields are "text", the course id is "keyword".
 field_mappings = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
 }
 index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
 }

 es_client = Elasticsearch("http://localhost:9200")
 es_client.indices.create(index=index_name, body=index_settings)

 # Q1: `index` is the client method used to add each document to the index.
 for document in documents:
    es_client.index(index=index_name, document=document)
diff --git a/search.py b/search.py
 from elasticsearch import Elasticsearch

 # Index queried by every search in this script.
 index_name = "course-questions"
 # Client for the Elasticsearch node listening on localhost:9200.
 es_client = Elasticsearch("http://localhost:9200")


 def elastic_search(query, course=None):
    """Search the FAQ index and return the matching documents.

    Args:
        query: Free-text question matched against the ``question`` and
            ``text`` fields.
        course: Optional course identifier. When it is not ``None`` a
            ``term`` filter on the ``course`` field is added to the query.

    Returns:
        A ``(result_docs, response)`` tuple: the ``_source`` payload of each
        hit (at most 5, per the ``size`` setting) and the raw client response.
    """
    bool_clause = {
        "must": {
            "multi_match": {
                "query": query,
                # "^4" weights matches in ``question`` above those in ``text``.
                "fields": ["question^4", "text"],
                "type": "best_fields",
            }
        },
    }
    if course is not None:
        bool_clause["filter"] = {"term": {"course": course}}

    search_query = {"size": 5, "query": {"bool": bool_clause}}
    response = es_client.search(index=index_name, body=search_query)

    result_docs = [hit["_source"] for hit in response["hits"]["hits"]]
    return result_docs, response


 query = "How do I execute a command in a running docker container?"

 # Unfiltered search: only the raw response is needed, for the top relevance score.
 _, unfiltered = elastic_search(query)
 print(unfiltered.body["hits"]["max_score"])  # Q3: 84.05

 # Same query restricted to a single course; print the question of the third hit.
 docs, result = elastic_search(query, course="machine-learning-zoomcamp")
 third_hit = docs[2]
 print(third_hit["question"])
 # Q4: How do I copy files from a different folder into docker container’s working directory?


 def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and retrieved FAQ entries.

    Args:
        query: The user's question, substituted for ``{question}``.
        search_results: Iterable of dicts with ``"question"`` and ``"text"``
            keys; each one becomes a ``Q:``/``A:`` entry in the context.

    Returns:
        The filled-in prompt with surrounding whitespace stripped. Context
        entries are separated from each other by one blank line.
    """
    prompt_template = """
 You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
 Use only the facts from the CONTEXT when answering the QUESTION.

 QUESTION: {question}

 CONTEXT: 
 {context}
 """.strip()

    context_template = """
 Q: {question}
 A: {text}
 """.strip()

    # Join the entries with a blank line between them. Stripping
    # ``entry + "\n\n"`` removed the separator itself, which glued
    # consecutive entries together ("...answerQ: next question").
    context = "\n\n".join(
        context_template.format(
            question=doc["question"],
            text=doc["text"],
        ).strip()
        for doc in search_results
    )

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


 # Build the prompt from the three best filtered hits and report its length in characters.
 prompt = build_prompt(query, docs[:3])
 prompt_length = len(prompt)
 print(prompt_length)  # Q5: 1459 (answer recorded by the author)

 import tiktoken

 # Tokenize the prompt with the encoding tiktoken selects for gpt-4o and report the token count.
 encoding = tiktoken.encoding_for_model("gpt-4o")
 query_encoded = encoding.encode(prompt)
 token_count = len(query_encoded)
 print(token_count)  # Q6: 322 (answer recorded by the author)
	version: '3.8'

	services:
	  # Single-node Elasticsearch with security disabled; HTTP API published on host port 9200.
	  elasticsearch:
	    image: docker.elastic.co/elasticsearch/elasticsearch:8.14.1
	    container_name: elasticsearch
	    environment:
	      - bootstrap.memory_lock=true
	      - discovery.type=single-node
	      - xpack.security.enabled=false
	      - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m"
	    # bootstrap.memory_lock=true can only lock the JVM memory when the
	    # container is allowed an unlimited memlock; without this the lock fails.
	    ulimits:
	      memlock:
	        soft: -1
	        hard: -1
	    ports:
	      - "9200:9200"
	      - "9300:9300"

	  # Ollama server; /root/.ollama lives in a named volume so its contents
	  # survive container re-creation.
	  ollama:
	    image: ollama/ollama
	    container_name: ollama
	    volumes:
	      - ollama:/root/.ollama
	    ports:
	      - "11434:11434"

	volumes:
	  # Named volume with default driver settings (explicit empty mapping).
	  ollama: {}
	import requests

	docs_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1"
	# Bound the request time so a stalled connection cannot hang the script.
	docs_response = requests.get(docs_url, timeout=30)
	# Fail on an HTTP error status instead of trying to parse an error page as JSON.
	docs_response.raise_for_status()
	documents_raw = docs_response.json()

	# Flatten the per-course groups into one list, tagging every document
	# with the name of the course it belongs to.
	documents = []

	for course in documents_raw:
	    course_name = course["course"]

	    for doc in course["documents"]:
	        doc["course"] = course_name
	        documents.append(doc)

	from elasticsearch import Elasticsearch

	es_client = Elasticsearch("http://localhost:9200")

	# Single-shard, zero-replica index; the three free-text fields are "text",
	# the course id is "keyword".
	index_settings = {
	    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
	    "mappings": {
	        "properties": {
	            "text": {"type": "text"},
	            "section": {"type": "text"},
	            "question": {"type": "text"},
	            "course": {"type": "keyword"},
	        }
	    },
	}

	index_name = "course-questions"

	es_client.indices.create(index=index_name, body=index_settings)

	# The loop body must be indented under the `for`; flattened it is a syntax error.
	for doc in documents:
	    es_client.index(index=index_name, document=doc) #Q1: index