How to create a dataset similar to NQ from a bunch of links assigned to one or more topics

Integrating Contextual Retrieval into your Q&A generation process can raise the quality and relevance of the generated question and answer (Q&A) pairs for a dataset modeled on Natural Questions (NQ). Since you already have a Clojure pipeline that scrapes websites and produces high-quality semantic chunks, and you plan to generate Q&A pairs with a prompt, Contextual Retrieval adds value by attaching contextual metadata or summaries to each chunk, so that questions are more precise and answers are more grounded in the content. This complements your existing setup and aligns with your earlier interest in advanced retrieval techniques (e.g., RAG systems, Qdrant integration, and Contextual Retrieval vs. Late Chunking, as discussed on April 9, 2025).

Below, I’ll elaborate on how to use the output of Contextual Retrieval prompts to enhance your Q&A generation, modify the existing prompt to incorporate Contextual Retrieval outputs, and provide an updated Clojure workflow that integrates this approach. I’ll assume Contextual Retrieval, as described in Anthropic’s method, prepends a contextual summary (e.g., a high-level explanation or metadata) to each chunk to improve retrieval relevance, and we’ll adapt this to guide Q&A generation.

What is Contextual Retrieval?

Contextual Retrieval enhances document retrieval by augmenting each chunk with a contextual summary or metadata that captures its broader meaning or intent. For example:

  • Chunk: "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion."
  • Contextual Summary: "This chunk discusses the impacts of climate change on coastal regions, focusing on sea level rise and its consequences."

This summary is typically generated by a language model (e.g., Grok 3) and helps retrieval systems (e.g., Qdrant, Meilisearch) better match chunks to queries by providing additional semantic context.

In your case, instead of using Contextual Retrieval for retrieval, we’ll use its output (the contextual summary) as an input to the Q&A generation prompt. This ensures that questions are informed by both the chunk’s specific content and its broader context, leading to more relevant and diverse Q&A pairs.

Benefits of Using Contextual Retrieval Outputs for Q&A Generation

  1. Improved Question Relevance:
    • The contextual summary provides a high-level understanding of the chunk’s role within the document or topic, enabling the model to generate questions that align with the chunk’s broader significance (e.g., asking about impacts of climate change rather than generic weather questions).
  2. Enhanced Answer Grounding:
    • The summary helps the model focus on key themes or entities in the chunk, ensuring short and long answers are precise and faithful to the content.
  3. Diverse Question Types:
    • By understanding the chunk’s context, the model can generate varied questions (e.g., factual, why, how) that reflect the chunk’s role in the document, similar to NQ’s diverse query types.
  4. Topic Alignment:
    • The summary reinforces topic relevance, helping the model select the most appropriate topic from your provided list (e.g., choosing “Climate Change” over “Urban Planning” for sea level rise).
  5. Compatibility with Your Setup:
    • Since you have high-quality semantic chunks and a Clojure pipeline, adding Contextual Retrieval is a natural extension, leveraging your existing LLM calls (e.g., Grok 3 API) to generate summaries.

Step-by-Step Plan to Integrate Contextual Retrieval

Here’s how to incorporate Contextual Retrieval outputs into your Q&A generation process:

Step 1: Generate Contextual Summaries for Chunks

  • Task: For each semantic chunk, generate a contextual summary that captures its main idea, purpose, or role within the document.
  • Prompt for Contextual Summary:
    You are an expert at summarizing text for contextual understanding. Given a chunk of text, generate a concise summary (1–2 sentences) that captures its main idea, purpose, or role within the broader document. The summary should provide high-level context without repeating the chunk verbatim.
    
    **Input**:
    - **Chunk**: {chunk_text}
    
    **Output**:
    A summary of the chunk in 1–2 sentences.
    
    **Example**:
    - **Chunk**: "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion."
    - **Output**: "This chunk discusses the impacts of climate change on coastal regions, focusing on sea level rise and its consequences."
    
  • Implementation:
    • Use your existing Clojure setup to call the Grok 3 API (or local model) with this prompt for each chunk.
    • Store the summary alongside the chunk and its topics (e.g., in a map like {:chunk "..." :summary "..." :topics [...]}).

Step 2: Modify the Q&A Generation Prompt

Update the original Q&A prompt to include the contextual summary as an additional input. This ensures the model uses both the chunk and its summary to generate questions and answers, improving specificity and alignment with NQ’s structure.

You are an expert at generating high-quality question and answer pairs for a dataset similar to the Natural Questions dataset. Given a semantic chunk of text, its contextual summary, and a list of possible topics, your task is to:

1. Generate one natural, concise, and relevant question based on the content of the chunk and informed by its contextual summary.
2. Ensure the question aligns with one of the provided topics, selecting the most appropriate topic.
3. Provide a short answer (a concise phrase or sentence within the chunk) that directly answers the question.
4. Provide a long answer (a sentence or paragraph from the chunk or slightly rephrased for clarity) that fully answers the question.
5. Specify the selected topic and the answer type (span, yes/no, or null).

**Input**:
- **Chunk**: {chunk_text}
- **Contextual Summary**: {contextual_summary}
- **Possible Topics**: {topics_list}

**Output Format** (JSON):
```json
{
  "question": "<generated question>",
  "short_answer": "<concise answer>",
  "long_answer": "<detailed answer>",
  "selected_topic": "<chosen topic>",
  "answer_type": "<span|yes/no|null>"
}
```

Guidelines:

  • Use the contextual summary to understand the chunk’s broader significance and generate a question that reflects both the chunk’s details and its role in the document.
  • The question should be natural, as if asked by a curious user, and directly answerable using the chunk’s content.
  • The short answer must be a verbatim or near-verbatim excerpt from the chunk.
  • The long answer can be a direct excerpt or a slight rephrasing for clarity, but must remain faithful to the chunk.
  • Choose the most relevant topic from the provided list, guided by the summary and chunk content.
  • If the question is unanswerable within the chunk, set answer_type to "null" and provide empty answers.
  • For yes/no questions, ensure the short answer is "Yes" or "No," and the long answer explains why.
  • Avoid generic or overly broad questions; focus on specific details in the chunk and its context.
  • Ensure the question is answerable using only the chunk’s content.

Example:

  • Chunk: "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion."
  • Contextual Summary: "This chunk discusses the impacts of climate change on coastal regions, focusing on sea level rise and its consequences."
  • Possible Topics: ["Climate Change", "Sustainability", "Urban Planning"]
  • Output:
{
  "question": "How does climate change contribute to flooding in coastal cities?",
  "short_answer": "Rising sea levels",
  "long_answer": "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion.",
  "selected_topic": "Climate Change",
  "answer_type": "span"
}

Changes from Original Prompt:

  • Added Contextual Summary as an input field.
  • Instructed the model to use the summary to inform question generation, ensuring questions reflect the chunk’s broader context.
  • Updated the example to show a question (“How does climate change contribute to flooding...”) that leverages the summary’s focus on climate change impacts, making it more specific than the original (“What threatens coastal cities...”).

Step 3: Update the Clojure Workflow
Modify your Clojure script to:
1. Generate contextual summaries for each chunk.
2. Pass both the chunk and its summary to the updated Q&A prompt.
3. Store the results in your dataset.

```clojure
(ns dataset.qageneration
  (:require [clj-http.client :as http]
            [cheshire.core :as json]
            [clojure.string :as str]))

;; Function to call a language model API (e.g., Grok 3)
(defn call-llm-api [prompt]
  (let [api-url "https://api.x.ai/grok3" ;; Replace with actual API endpoint
        api-key "your-api-key" ;; Replace with your API key
        request-body (json/generate-string {:prompt prompt :max_tokens 500})
        response (http/post api-url
                           {:headers {"Authorization" (str "Bearer " api-key)
                                      "Content-Type" "application/json"}
                            :body request-body})]
    (json/parse-string (:body response) true)))

;; Prompt for generating contextual summary
(def contextual-summary-prompt
  (str "You are an expert at summarizing text for contextual understanding. Given a chunk of text, generate a concise summary (1–2 sentences) that captures its main idea, purpose, or role within the broader document. The summary should provide high-level context without repeating the chunk verbatim.\n\n"
       "**Input**:\n- **Chunk**: %s\n\n"
       "**Output**:\nA summary of the chunk in 1–2 sentences.\n\n"
       "**Example**:\n- **Chunk**: \"Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion.\"\n- **Output**: \"This chunk discusses the impacts of climate change on coastal regions, focusing on sea level rise and its consequences.\""))

;; Prompt for Q&A generation with contextual retrieval (loaded from artifact)
(def qa-prompt-template
  (str "You are an expert at generating high-quality question and answer pairs for a dataset similar to the Natural Questions dataset. Given a semantic chunk of text, its contextual summary, and a list of possible topics, your task is to:\n\n"
       "1. Generate one natural, concise, and relevant question based on the content of the chunk and informed by its contextual summary.\n"
       "2. Ensure the question aligns with one of the provided topics, selecting the most appropriate topic.\n"
       "3. Provide a short answer (a concise phrase or sentence within the chunk) that directly answers the question.\n"
       "4. Provide a long answer (a sentence or paragraph from the chunk or slightly rephrased for clarity) that fully answers the question.\n"
       "5. Specify the selected topic and the answer type (span, yes/no, or null).\n\n"
       "**Input**:\n- **Chunk**: %s\n- **Contextual Summary**: %s\n- **Possible Topics**: %s\n\n"
       "**Output Format** (JSON):\n```json\n{\n  \"question\": \"<generated question>\",\n  \"short_answer\": \"<concise answer>\",\n  \"long_answer\": \"<detailed answer>\",\n  \"selected_topic\": \"<chosen topic>\",\n  \"answer_type\": \"<span|yes/no|null>\"\n}\n```\n\n"
       "**Guidelines**:\n"
       "- Use the contextual summary to understand the chunk’s broader significance and generate a question that reflects both the chunk’s details and its role in the document.\n"
       "- The question should be natural, as if asked by a curious user, and directly answerable using the chunk’s content.\n"
       "- The short answer must be a verbatim or near-verbatim excerpt from the chunk.\n"
       "- The long answer can be a direct excerpt or a slight rephrasing for clarity, but must remain faithful to the chunk.\n"
       "- Choose the most relevant topic from the provided list, guided by the summary and chunk content.\n"
       "- If the question is unanswerable within the chunk, set `answer_type` to \"null\" and provide empty answers.\n"
       "- For yes/no questions, ensure the short answer is \"Yes\" or \"No,\" and the long answer explains why.\n"
       "- Avoid generic or overly broad questions; focus on specific details in the chunk and its context.\n"
       "- Ensure the question is answerable using only the chunk’s content.\n\n"
       "**Example**:\n"
       "- **Chunk**: \"Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion.\"\n"
       "- **Contextual Summary**: \"This chunk discusses the impacts of climate change on coastal regions, focusing on sea level rise and its consequences.\"\n"
       "- **Possible Topics**: [\"Climate Change\", \"Sustainability\", \"Urban Planning\"]\n"
       "- **Output**:\n"
       "```json\n{\n  \"question\": \"How does climate change contribute to flooding in coastal cities?\",\n  \"short_answer\": \"Rising sea levels\",\n  \"long_answer\": \"Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion.\",\n  \"selected_topic\": \"Climate Change\",\n  \"answer_type\": \"span\"\n}\n```"))

;; Function to generate contextual summary for a chunk
(defn generate-contextual-summary [chunk]
  (let [prompt (format contextual-summary-prompt chunk)
        response (call-llm-api prompt)]
    ;; call-llm-api already returns the parsed JSON body (not a raw HTTP
    ;; response map), so return it directly; extract the generated text
    ;; according to your API's response schema.
    response))

;; Function to generate Q&A pair for a chunk with contextual summary
(defn generate-qa-pair [chunk summary topics]
  (let [topics-str (str/join ", " topics)
        prompt (format qa-prompt-template chunk summary topics-str)
        response (call-llm-api prompt)]
    ;; Same note as above: the parsed body is returned as-is.
    response))

;; Example usage
(def sample-chunk
  "Machine learning models require large datasets for training. Overfitting occurs when a model learns noise instead of patterns, leading to poor generalization.")
(def sample-topics ["Machine Learning", "Data Science", "AI Ethics"])

(defn -main []
  (let [summary (generate-contextual-summary sample-chunk)
        qa-pair (generate-qa-pair sample-chunk summary sample-topics)]
    (println "Contextual Summary:" summary)
    (println "Generated Q&A Pair:" qa-pair)))

;; Store Q&A pairs in a dataset
(defn store-qa-pairs [qa-pairs output-file]
  (spit output-file (json/generate-string qa-pairs {:pretty true})))

;; Process multiple chunks
(defn process-chunks [chunks]
  (map (fn [{:keys [chunk topics]}]
         (let [summary (generate-contextual-summary chunk)
               qa-pair (generate-qa-pair chunk summary topics)]
           (merge qa-pair {:chunk chunk :contextual_summary summary})))
       chunks))

;; Example: Process multiple chunks
(def chunks-with-topics
  [{:chunk "Machine learning models require large datasets for training..."
     :topics ["Machine Learning", "Data Science"]}
   {:chunk "Sustainable practices can reduce carbon emissions..."
     :topics ["Sustainability", "Climate Change"]}])

(comment
  ;; Run example
  (-main)
  ;; Process and store multiple chunks
  (let [qa-pairs (process-chunks chunks-with-topics)]
    (store-qa-pairs qa-pairs "qa_dataset.json")))
```

How It Works

  1. Contextual Summary Generation:
    • The generate-contextual-summary function sends each chunk to the LLM with the summary prompt, producing a 1–2 sentence summary (e.g., “This chunk explains overfitting in machine learning...”).
  2. Q&A Generation:
    • The generate-qa-pair function uses the updated prompt, passing the chunk, its summary, and topics to the LLM.
    • The LLM generates a JSON output with a question, short answer, long answer, selected topic, and answer type, informed by the summary for better context.
  3. Processing Chunks:
    • The process-chunks function iterates over your chunks, generating summaries and Q&A pairs, and stores them with metadata (chunk and summary).
  4. Storage:
    • Results are saved to a JSON file, including the contextual summary for traceability.

Example Output:

{
  "question": "What causes poor generalization in machine learning models?",
  "short_answer": "Overfitting",
  "long_answer": "Overfitting occurs when a model learns noise instead of patterns, leading to poor generalization.",
  "selected_topic": "Machine Learning",
  "answer_type": "span",
  "chunk": "Machine learning models require large datasets for training...",
  "contextual_summary": "This chunk explains the concept of overfitting in machine learning and its impact on model performance."
}

Integration with Your Pipeline

  • Scraping and Chunking: Your existing Clojure code scrapes websites and produces semantic chunks. Ensure chunks are stored with their source URL and topics (e.g., {:chunk "..." :topics [...] :url "..."}).
  • API Calls: Use your Grok 3 API setup (from https://x.ai/api) or local model. Update call-llm-api if needed (e.g., adjust max_tokens for summaries vs. Q&A).
  • Scalability: Parallelize process-chunks with pmap for large datasets:
    (defn process-chunks [chunks]
      (doall (pmap (fn [{:keys [chunk topics]}] ...) chunks)))
  • Error Handling: Add try-catch to handle API failures (a retry sketch follows this list):
    (defn call-llm-api [prompt]
      (try
        (let [response (http/post ...)]
          (json/parse-string (:body response) true))
        (catch Exception e
          (println "API error:" (.getMessage e))
          {})))
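
For rate limits and transient failures, a retry wrapper helps. A minimal sketch, assuming the call-llm-api function above (call-llm-api-with-retry is a hypothetical name):

```clojure
;; Retry with simple exponential backoff; returns nil after max-attempts.
(defn call-llm-api-with-retry [prompt max-attempts]
  (loop [attempt 1]
    (let [result (try
                   (call-llm-api prompt)
                   (catch Exception e
                     (println "API error (attempt" attempt "):" (.getMessage e))
                     ::failed))]
      (cond
        (not= result ::failed) result
        (< attempt max-attempts) (do (Thread/sleep (* 1000 (long (Math/pow 2 attempt))))
                                     (recur (inc attempt)))
        :else nil))))
```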

Addressing Your Prior Interests

  • RAG and Retrieval (April 8, 2025): Contextual Retrieval aligns with your RAG work, as it enhances chunk relevance, similar to your Qdrant integration. Here, it’s repurposed for Q&A generation rather than retrieval.
  • Semantic Chunking (April 9, 2025): Your high-quality chunking routine is ideal for Contextual Retrieval, as it ensures chunks are semantically coherent, making summaries more accurate.
  • Question Generation (March 30, 2025): The updated prompt builds on your interest in diverse questions, using the summary to generate questions that reflect the chunk’s role, akin to your perspective-based question generation.

Potential Challenges and Solutions

  • Summary Quality: If summaries are too generic, refine the summary prompt to focus on specific entities or themes (e.g., “Highlight key concepts like overfitting or sea level rise”).
  • Compute Cost: Generating summaries doubles LLM calls. Cache summaries for reused chunks or batch API requests to reduce costs (a caching sketch follows this list).
  • Topic Misalignment: If the model selects incorrect topics, add a guideline to prioritize topics matching keywords in the summary (e.g., “climate” → “Climate Change”).
  • Null Answers: If chunks lack answerable content, ensure the prompt’s null handling is robust (e.g., return {:question "..." :short_answer "" :long_answer "" :answer_type "null"}).
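
Since chunks can recur across scraped pages, a small cache avoids paying for the same summary twice. A minimal sketch, assuming the generate-contextual-summary function from the workflow above:

```clojure
;; Cache summaries keyed by chunk text so repeated chunks cost one API call.
(defonce summary-cache (atom {}))

(defn cached-summary [chunk]
  (if-let [hit (get @summary-cache chunk)]
    hit
    (let [summary (generate-contextual-summary chunk)]
      (swap! summary-cache assoc chunk summary)
      summary)))
```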

Testing and Validation

  • Pilot Test:
    • Run process-chunks on 5–10 chunks with topics.
    • Check if summaries capture the chunk’s intent and if Q&A pairs are specific and topic-aligned.
    • Example:
      (def test-chunk {:chunk "Sustainable practices can reduce carbon emissions..." :topics ["Sustainability", "Climate Change"]})
      (process-chunks [test-chunk])
  • Quality Metrics (a validation sketch follows this list):
    • Question Specificity: Are questions tied to the chunk’s details (e.g., “How do sustainable practices reduce emissions?” vs. “What is sustainability?”)?
    • Answer Accuracy: Do short and long answers match the chunk’s content?
    • Topic Fit: Does the selected topic align with the summary and chunk?
  • Manual Review: Sample 10% of outputs to ensure quality, focusing on summary relevance and question diversity.
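
Part of this check can be automated. A rough validation sketch, assuming the str alias from the ns declaration above and the JSON keys from the prompt’s output format:

```clojure
;; Returns a vector of flags; empty means the pair passed these checks.
(defn validate-qa-pair
  [{:keys [chunk topics]} {:keys [short_answer selected_topic answer_type]}]
  (cond-> []
    ;; span answers should appear (near-)verbatim in the chunk
    (and (= answer_type "span")
         (not (str/includes? chunk (or short_answer ""))))
    (conj :short-answer-not-in-chunk)
    ;; the selected topic must come from the provided list
    (not (some #{selected_topic} topics))
    (conj :topic-not-in-list)))

;; e.g. (validate-qa-pair {:chunk sample-chunk :topics sample-topics} parsed-pair)
```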

Example with Your Sample Data

Using your sample chunk and topics:

(def sample-chunk
  "Machine learning models require large datasets for training. Overfitting occurs when a model learns noise instead of patterns, leading to poor generalization.")
(def sample-topics ["Machine Learning", "Data Science", "AI Ethics"])
  1. Summary:
    • Prompt output: “This chunk explains the concept of overfitting in machine learning and its impact on model performance.”
  2. Q&A Pair:
    • JSON output (as above): Question about overfitting, answers grounded in the chunk, topic “Machine Learning.”

Next Steps

  • Test the Workflow:
    • Run the updated script with a few chunks from your dataset.
    • Share a sample chunk and topics for a specific example, and I can simulate the output.
  • Refine the Prompt:
    • If you want more diverse questions (e.g., why/how, as in your March 30, 2025 conversation), add a guideline like: “Generate 50% factual, 25% why, 25% how questions.”
    • Example: {:question "Why does overfitting lead to poor generalization?" :short_answer "Learning noise" :long_answer "..."}.
  • Optimize Performance:
    • Add batch processing for API calls to handle large datasets (see the sketch after this list).
    • Integrate with Qdrant (per your April 5, 2025 interest) to store summaries as metadata for future retrieval tasks.
  • Feedback:
    • Do you want to see a specific chunk processed with this workflow?
    • Should I tweak the prompt for specific question types or add features (e.g., multi-shot prompting, per your April 10, 2025 interest)?
    • Need help with Qdrant integration for storing summaries?
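
A minimal batching sketch for the per-chunk logic above: fixed-size batches, parallelized within each batch with pmap, so rate limits and transient failures are easier to contain (process-in-batches is a hypothetical name):

```clojure
(defn process-in-batches [chunks batch-size]
  (->> chunks
       (partition-all batch-size)
       (mapcat (fn [batch]
                 (doall
                  (pmap (fn [{:keys [chunk topics]}]
                          (let [summary (generate-contextual-summary chunk)
                                qa-pair (generate-qa-pair chunk summary topics)]
                            (merge qa-pair {:chunk chunk :contextual_summary summary})))
                        batch))))
       vec))
```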

Let me know how you’d like to proceed!


To generate high-quality question and answer (Q&A) pairs for your dataset, you need a prompt that leverages the semantic chunk from your scraped content and the list of possible topics to create natural, relevant, and diverse Q&A pairs, similar to the Natural Questions (NQ) dataset. The prompt should guide the language model (e.g., Grok 3) to produce a question grounded in the chunk, select an appropriate topic, and provide both a short and long answer, mimicking NQ’s structure. Below is a tailored prompt for this purpose, designed to work with your existing Clojure-based scraping and semantic chunking setup.

Given your prior conversations, you’ve worked with Clojure for semantic chunking, search engine integration (e.g., Meilisearch, Qdrant), and question generation for RAG systems. This prompt builds on those interests, focusing on generating Q&A pairs for your dataset. The prompt is structured to ensure the model uses the chunk’s content, aligns with one of the provided topics, and produces answers in NQ’s format (short and long answers). I’ll also include a Clojure snippet to integrate this prompt with your workflow, assuming you’re using a model like Grok 3 via an API or local setup.

Prompt for Generating Q&A Pairs

You are an expert at generating high-quality question and answer pairs for a dataset similar to the Natural Questions dataset. Given a semantic chunk of text and a list of possible topics, your task is to:

1. Generate one natural, concise, and relevant question based on the content of the chunk.
2. Ensure the question aligns with one of the provided topics, selecting the most appropriate topic.
3. Provide a short answer (a concise phrase or sentence within the chunk) that directly answers the question.
4. Provide a long answer (a sentence or paragraph from the chunk or slightly rephrased for clarity) that fully answers the question.
5. Specify the selected topic and the answer type (span, yes/no, or null).

**Input**:
- **Chunk**: {chunk_text}
- **Possible Topics**: {topics_list}

**Output Format** (JSON):
```json
{
  "question": "<generated question>",
  "short_answer": "<concise answer>",
  "long_answer": "<detailed answer>",
  "selected_topic": "<chosen topic>",
  "answer_type": "<span|yes/no|null>"
}
```

Guidelines:

  • The question should be natural, as if asked by a curious user, and directly related to the chunk’s content.
  • The short answer must be a verbatim or near-verbatim excerpt from the chunk.
  • The long answer can be a direct excerpt or a slight rephrasing for clarity, but must remain faithful to the chunk.
  • Choose the most relevant topic from the provided list based on the chunk’s content.
  • If the question is unanswerable within the chunk, set answer_type to "null" and provide empty answers.
  • For yes/no questions, ensure the short answer is "Yes" or "No," and the long answer explains why.
  • Avoid generic or overly broad questions; focus on specific details in the chunk.
  • Ensure the question is answerable using only the chunk’s content.

Example:

  • Chunk: "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion. Sustainable practices can mitigate these effects."
  • Possible Topics: ["Climate Change", "Sustainability", "Urban Planning"]
  • Output:
{
  "question": "What threatens coastal cities due to climate change?",
  "short_answer": "Rising sea levels",
  "long_answer": "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion.",
  "selected_topic": "Climate Change",
  "answer_type": "span"
}

Explanation of the Prompt

  • Structure: The prompt clearly defines the task, input (chunk and topics), and output format (JSON), making it easy to parse in your Clojure pipeline.
  • Guidelines: It ensures questions are natural, topic-aligned, and grounded in the chunk, with answers formatted like NQ (short and long).
  • Flexibility: It handles different answer types (span, yes/no, null) and allows for slight rephrasing in long answers for clarity.
  • Topic Selection: The model must choose one topic, ensuring alignment with your dataset’s topic-based organization.
  • Example: The example illustrates a factual question with a span answer, reinforcing the expected output.

Integrating with Your Clojure Workflow

Assuming you have:

  • A Clojure script that scrapes websites and stores content.
  • A semantic chunking routine producing high-quality chunks (e.g., using SentenceTransformer via libpython-clj or a custom approach).
  • Access to a language model (e.g., Grok 3 via xAI’s API or a local model like LLaMA).

Here’s a Clojure snippet to apply the prompt to your chunks and generate Q&A pairs, storing the results in a dataset. The snippet assumes you have chunks stored in a collection and topics associated with each chunk’s source URL.

```clojure
(ns dataset.qageneration
  (:require [clj-http.client :as http]
            [cheshire.core :as json]
            [clojure.string :as str]))

;; Function to call a language model API (e.g., Grok 3)
(defn call-llm-api [prompt]
  (let [api-url "https://api.x.ai/grok3" ;; Replace with actual API endpoint
        api-key "your-api-key" ;; Replace with your API key
        request-body (json/generate-string {:prompt prompt :max_tokens 500})
        response (http/post api-url
                           {:headers {"Authorization" (str "Bearer " api-key)
                                      "Content-Type" "application/json"}
                            :body request-body})]
    (json/parse-string (:body response) true)))

;; Prompt template (loaded from artifact)
(def qa-prompt-template
  (str "You are an expert at generating high-quality question and answer pairs for a dataset similar to the Natural Questions dataset. Given a semantic chunk of text and a list of possible topics, your task is:\n\n"
       "1. Generate one natural, concise, and relevant question based on the content of the chunk.\n"
       "2. Ensure the question aligns with one of the provided topics, selecting the most appropriate topic.\n"
       "3. Provide a short answer (a concise phrase or sentence within the chunk) that directly answers the question.\n"
       "4. Provide a long answer (a sentence or paragraph from the chunk or slightly rephrased for clarity) that fully answers the question.\n"
       "5. Specify the selected topic and the answer type (span, yes/no, or null).\n\n"
       "**Input**:\n- **Chunk**: %s\n- **Possible Topics**: %s\n\n"
       "**Output Format** (JSON):\n```json\n{\n  \"question\": \"<generated question>\",\n  \"short_answer\": \"<concise answer>\",\n  \"long_answer\": \"<detailed answer>\",\n  \"selected_topic\": \"<chosen topic>\",\n  \"answer_type\": \"<span|yes/no|null>\"\n}\n```\n\n"
       "**Guidelines**:\n"
       "- The question should be natural, as if asked by a curious user, and directly related to the chunk’s content.\n"
       "- The short answer must be a verbatim or near-verbatim excerpt from the chunk.\n"
       "- The long answer can be a direct excerpt or a slight rephrasing for clarity, but must remain faithful to the chunk.\n"
       "- Choose the most relevant topic from the provided list based on the chunk’s content.\n"
       "- If the question is unanswerable within the chunk, set `answer_type` to \"null\" and provide empty answers.\n"
       "- For yes/no questions, ensure the short answer is \"Yes\" or \"No,\" and the long answer explains why.\n"
       "- Avoid generic or overly broad questions; focus on specific details in the chunk.\n"
       "- Ensure the question is answerable using only the chunk’s content.\n\n"
       "**Example**:\n"
       "- **Chunk**: \"Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion. Sustainable practices can mitigate these effects.\"\n"
       "- **Possible Topics**: [\"Climate Change\", \"Sustainability\", \"Urban Planning\"]\n"
       "- **Output**:\n"
       "```json\n{\n  \"question\": \"What threatens coastal cities due to climate change?\",\n  \"short_answer\": \"Rising sea levels\",\n  \"long_answer\": \"Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion.\",\n  \"selected_topic\": \"Climate Change\",\n  \"answer_type\": \"span\"\n}\n```"))

;; Function to generate Q&A pair for a chunk
(defn generate-qa-pair [chunk topics]
  (let [topics-str (str/join ", " topics)
        prompt (format qa-prompt-template chunk topics-str)
        response (call-llm-api prompt)]
    ;; call-llm-api already returns the parsed JSON body, so return it
    ;; directly; extract the generated text per your API's response schema.
    response))

;; Example usage
(def sample-chunk
  "Machine learning models require large datasets for training. Overfitting occurs when a model learns noise instead of patterns, leading to poor generalization.")
(def sample-topics ["Machine Learning", "Data Science", "AI Ethics"])

(defn -main []
  (let [qa-pair (generate-qa-pair sample-chunk sample-topics)]
    (println "Generated Q&A Pair:" qa-pair)))

;; Store Q&A pairs in a dataset
(defn store-qa-pairs [qa-pairs output-file]
  (spit output-file (json/generate-string qa-pairs {:pretty true})))

;; Example: Process multiple chunks
(def chunks-with-topics
  [{:chunk "Machine learning models require large datasets for training..." 
    :topics ["Machine Learning", "Data Science"]}
   {:chunk "Sustainable practices can reduce carbon emissions..." 
    :topics ["Sustainability", "Climate Change"]}])

(defn process-chunks [chunks]
  (map (fn [{:keys [chunk topics]}]
         (merge (generate-qa-pair chunk topics) {:chunk chunk}))
       chunks))

(comment
  ;; Run example
  (-main)
  ;; Process and store multiple chunks
  (let [qa-pairs (process-chunks chunks-with-topics)]
    (store-qa-pairs qa-pairs "qa_dataset.json")))
```

How to Use the Clojure Code

  1. Dependencies:
    • Add to deps.edn:
      {:deps {clj-http/clj-http {:mvn/version "3.12.0"}
              cheshire/cheshire {:mvn/version "5.10.0"}}}
  2. API Setup:
    • Replace api-url and api-key with your Grok 3 API endpoint and key (see https://x.ai/api for details).
    • If using a local model, modify call-llm-api to interface with your model (e.g., via libpython-clj).
  3. Input Data:
    • Your chunks are from your semantic chunking routine (e.g., stored in a vector or database).
    • Topics are associated with each chunk’s source URL (e.g., from your scraping metadata).
  4. Running:
    • Load the script in your REPL or run via clj -M -m dataset.qageneration.
    • Use process-chunks to generate Q&A pairs for all chunks and store-qa-pairs to save to a JSON file.
  5. Output:
    • The script produces JSON entries like:
      {
        "question": "What causes overfitting in machine learning models?",
        "short_answer": "Learning noise instead of patterns",
        "long_answer": "Overfitting occurs when a model learns noise instead of patterns, leading to poor generalization.",
        "selected_topic": "Machine Learning",
        "answer_type": "span",
        "chunk": "Machine learning models require large datasets..."
      }

Notes

  • Prompt Quality: The prompt is designed to produce specific, chunk-grounded questions, avoiding generic outputs. Test with a few chunks and refine if needed (e.g., add constraints for question length).
  • Scalability: For large datasets, parallelize process-chunks using pmap or a task queue (e.g., via core.async).
  • Error Handling: Add try-catch blocks in call-llm-api to handle API failures or rate limits.
  • Topic Alignment: If topics are noisy, preprocess them (e.g., using keyword extraction) to ensure relevance; a keyword-overlap sketch follows this list.
  • Memory Context: Your prior work with question generation (e.g., March 30, 2025) focused on perspective-based questions. This prompt is more NQ-specific but can be adapted for diverse question types if needed.
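
One cheap preprocessing option, sketched here under the assumption that a topic is only worth offering to the model if at least one of its words appears in the chunk (uses the str alias from the ns declaration above):

```clojure
(defn relevant-topics [chunk topics]
  (let [chunk-words (set (map str/lower-case (re-seq #"\w+" chunk)))]
    (filter (fn [topic]
              (some chunk-words (map str/lower-case (re-seq #"\w+" topic))))
            topics)))

;; (relevant-topics "Machine learning models require large datasets..."
;;                  ["Machine Learning" "Climate Change"])
;; => ("Machine Learning")
```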

Next Steps

  • Test the Prompt: Run the script with 5–10 sample chunks and check if Q&A pairs meet your quality standards.
  • Refine: If questions are too broad or answers misaligned, tweak the prompt’s guidelines (e.g., enforce shorter questions).
  • Integrate: Combine with your scraping and chunking pipeline to process all links.
  • Feedback: Share a sample chunk and topics for a specific Q&A pair, or request tweaks to the prompt or code.

Would you like to:

  1. Test the prompt with a specific chunk and topics?
  2. Modify the prompt for additional question types (e.g., why/how)?
  3. Enhance the Clojure script (e.g., add error handling or parallel processing)?

To create a dataset similar to the Natural Questions (NQ) dataset, which consists of real user queries, Wikipedia pages as context, and annotated answers (short and long), you can leverage your collection of links assigned to topics. The goal is to synthesize a dataset with questions, relevant document contexts (from the links), and answers, mimicking NQ's structure. Below is a step-by-step plan to synthesize this dataset:

Step 1: Understand the NQ Dataset Structure

  • Input: A user query (question), a Wikipedia page (context).
  • Output:
    • Long Answer: A paragraph or section from the page answering the question.
    • Short Answer: A concise span (e.g., a phrase or sentence) within the long answer.
    • Annotations: Metadata indicating answer type (e.g., yes/no, span, none) and location.
  • Key Features:
    • Queries are natural, user-generated questions.
    • Answers are grounded in real document contexts.
    • Annotations include whether an answer exists and its format.

Your dataset will adapt this structure using your links (assumed to be web pages or documents) instead of Wikipedia pages and will require generating questions and answers based on the content.

Step 2: Analyze Your Input Data

  • Links and Topics:
    • You have a set of URLs, each associated with one or more topics (e.g., "climate change," "machine learning").
    • Clarify the format: Are these stored in a CSV (e.g., URL, Topic1, Topic2), JSON, or another structure?
    • Example:
      URL: https://example.com/climate, Topics: Climate Change, Sustainability
      URL: https://example.com/ai, Topics: Machine Learning, AI Ethics
      
  • Content Access:
    • Ensure you can access the content of the links (e.g., via web scraping or APIs).
    • Verify the content type: Are these articles, blog posts, academic papers, or mixed?
  • Volume: Estimate the number of links (e.g., 100, 1000) to gauge dataset size.

Action:

  • Create a script to crawl or fetch the content of each URL (use tools like BeautifulSoup or Scrapy for HTML, PyPDF2 for PDFs; a Clojure fetch sketch follows this list).
  • Store raw content in a database or folder structure, associating each document with its URL and topics.
  • Filter out inaccessible or low-quality links (e.g., broken URLs, paywalled content).
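
If you prefer to keep fetching inside your Clojure pipeline, here is a minimal sketch with clj-http (dataset.fetch is a hypothetical namespace; HTML-to-text extraction is left to whatever parser you already use):

```clojure
(ns dataset.fetch
  (:require [clj-http.client :as http]))

;; Fetch one URL; returns nil on failure so broken links can be filtered out.
(defn fetch-content [url]
  (try
    (let [{:keys [status body]} (http/get url {:throw-exceptions false
                                               :socket-timeout 10000
                                               :connection-timeout 10000})]
      (when (= 200 status) body))
    (catch Exception e
      (println "Fetch failed:" url (.getMessage e))
      nil)))

;; Keep URL and topic metadata alongside the raw content.
(defn fetch-all [links]
  (keep (fn [{:keys [url topics]}]
          (when-let [content (fetch-content url)]
            {:url url :topics topics :raw-content content}))
        links))
```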

Step 3: Generate Questions

Since you don’t have user queries like NQ, you’ll need to synthesize questions based on the content and topics. Options include:

Option 1: Manual Question Creation

  • For each document, read a sample and write 1–3 natural questions per topic.
  • Example:
    • Document: Article on climate change impacts.
    • Topics: Climate Change, Sustainability.
    • Questions:
      • "What are the main impacts of climate change on coastal regions?"
      • "How can sustainable practices mitigate climate change effects?"
  • Pros: High-quality, human-like questions.
  • Cons: Time-consuming; not scalable for large datasets.

Option 2: Automated Question Generation

  • Use a language model (e.g., Grok 3, T5, or BART) fine-tuned for question generation.
  • Process:
    • Extract key sentences or paragraphs from each document using summarization (e.g., transformers library).
    • Feed these into a question-generation model to produce questions.
    • Example Input: "Rising sea levels threaten coastal cities."
    • Example Output: "What threatens coastal cities due to climate change?"
  • Filter generated questions for relevance and clarity using heuristics (e.g., length, specificity) or manual review.
  • Pros: Scalable for many documents.
  • Cons: May produce generic or low-quality questions without fine-tuning.

Option 3: Hybrid Approach

  • Generate questions automatically, then have humans (e.g., via crowdsourcing platforms like MTurk) review and refine them.
  • Assign questions to topics based on the document’s topic tags.

Action:

  • Choose an approach based on resources (time, budget, compute).
  • For automation, use a pre-trained model such as valhalla/t5-base-qg-hl. Note that "question-generation" is not a built-in transformers pipeline task; the snippet below assumes the third-party question_generation package that wraps these models:
    # pipeline here comes from the question_generation repo, not stock transformers
    from pipelines import pipeline
    qg = pipeline("question-generation", model="valhalla/t5-base-qg-hl")
    text = "Rising sea levels threaten coastal cities."
    questions = qg(text)
  • Store questions with metadata: Document ID, Question, Topic.

Step 4: Annotate Answers

For each question, identify answers within the document, mimicking NQ’s long and short answers.

Long Answer

  • Definition: A paragraph or section containing the answer.
  • Process:
    • Use text similarity (e.g., cosine similarity with embeddings from sentence-transformers) to find the most relevant paragraph(s) for each question (a plain-Clojure cosine sketch follows this block).
    • Alternatively, use a reading comprehension model (e.g., RoBERTa fine-tuned on SQuAD) to extract relevant sections.
  • Example:
    • Question: "What threatens coastal cities due to climate change?"
    • Long Answer: "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion."
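
Once paragraphs are embedded, ranking reduces to cosine similarity. A plain-Clojure sketch, where embed stands in for whatever embedding call you use (sentence-transformers via libpython-clj, an embeddings API, etc.):

```clojure
(defn dot [a b] (reduce + (map * a b)))

(defn cosine [a b]
  (/ (dot a b)
     (* (Math/sqrt (dot a a)) (Math/sqrt (dot b b)))))

;; Pick the paragraph most similar to the question as the long-answer candidate.
(defn best-paragraph [embed question paragraphs]
  (let [q (embed question)]
    (apply max-key #(cosine q (embed %)) paragraphs)))
```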

Short Answer

  • Definition: A concise span (e.g., phrase, sentence) within the long answer.
  • Process:
    • Use a question-answering (QA) model to extract a precise span from the long answer.
    • Example:
      • Question: "What threatens coastal cities due to climate change?"
      • Short Answer: "Rising sea levels."
  • Handle cases with no answer (e.g., yes/no questions or unanswerable queries) by annotating as "null."

Action:

  • Implement a QA pipeline:
    from transformers import pipeline
    qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
    context = "Rising sea levels, driven by melting ice caps..."
    question = "What threatens coastal cities due to climate change?"
    result = qa(question=question, context=context)
    # result: {'score': ..., 'start': 0, 'end': 17, 'answer': 'Rising sea levels'}
  • Store annotations: Question ID, Long Answer, Short Answer, Answer Type (span, yes/no, null).

Step 5: Structure the Dataset

Organize the data in a format similar to NQ, using JSON or CSV. Example JSON structure:

{
  "example_id": "001",
  "document_url": "https://example.com/climate",
  "topics": ["Climate Change", "Sustainability"],
  "question": "What threatens coastal cities due to climate change?",
  "long_answer": {
    "text": "Rising sea levels, driven by melting ice caps and thermal expansion, threaten coastal cities with increased flooding and erosion.",
    "start_byte": 100,
    "end_byte": 250
  },
  "short_answer": {
    "text": "Rising sea levels",
    "start_byte": 100,
    "end_byte": 120
  },
  "answer_type": "span"
}
  • Fields:
    • example_id: Unique identifier.
    • document_url: Source link.
    • topics: Associated topics.
    • question: Generated question.
    • long_answer: Text and byte offsets.
    • short_answer: Text and byte offsets.
    • answer_type: Span, yes/no, or null.

Action:

  • Write a script to compile the dataset into this format.
  • Validate the dataset for completeness (e.g., no missing fields) and consistency (e.g., short answer contained in long answer).

Step 6: Quality Control

  • Manual Review:
    • Sample a subset (e.g., 10%) of examples and check question relevance, answer accuracy, and annotation correctness.
  • Automated Checks:
    • Ensure short answers are substrings of long answers.
    • Verify that byte offsets align with text.
  • Topic Alignment:
    • Confirm questions align with assigned topics (e.g., using keyword matching or topic modeling).

Action:

  • Use a crowdsourcing platform or internal team for manual review.
  • Write validation scripts to flag errors (e.g., misaligned byte offsets); a sketch follows.
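
A validation sketch for the Step 5 structure, assuming examples parsed into maps with those field names. Note that subs uses character offsets, which match byte offsets only for ASCII text; adjust if your documents contain multi-byte characters.

```clojure
(require '[clojure.string :as str])

;; Returns a vector of flags; empty means the example passed these checks.
(defn validate-example [{:keys [long_answer short_answer]} document-text]
  (cond-> []
    ;; the short answer must be contained in the long answer
    (not (str/includes? (:text long_answer) (:text short_answer)))
    (conj :short-answer-not-in-long-answer)
    ;; offsets must point at the quoted text in the source document
    (not= (:text long_answer)
          (subs document-text (:start_byte long_answer) (:end_byte long_answer)))
    (conj :long-answer-offsets-misaligned)))
```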

Step 7: Scale and Iterate

  • Pilot Dataset:
    • Start with a small subset (e.g., 100 links) to test the pipeline.
    • Evaluate quality and refine processes (e.g., improve question generation prompts).
  • Full Dataset:
    • Scale to all links, parallelizing tasks like crawling, question generation, and annotation using cloud compute (e.g., AWS, GCP).
  • Iterate:
    • Incorporate feedback from pilot to improve quality.
    • Add diversity by generating multiple question types (e.g., factual, why, how).

Action:

  • Set up a workflow using tools like Airflow or Dask for task orchestration.
  • Monitor compute costs for large-scale processing.

Step 8: Publish and Share

  • Format: Share as JSON/CSV, with a README describing the dataset (like NQ’s).
  • Licensing: Clarify usage rights, especially for scraped content (e.g., CC BY where the source license permits redistribution).
  • Repository: Host on GitHub or a data platform like Hugging Face Datasets.

Action:

  • Draft a README with dataset description, creation process, and examples.
  • Check legal constraints for scraped content (consult a legal expert if needed).

Tools and Resources

  • Web Scraping: BeautifulSoup, Scrapy, requests.
  • NLP Models: Hugging Face transformers (T5 for question generation, RoBERTa for QA).
  • Text Processing: sentence-transformers for similarity, spaCy for tokenization.
  • Storage: SQLite or MongoDB for intermediate data, JSON/CSV for final output.
  • Compute: Local machine for pilot, cloud (AWS/GCP) for scale.

Timeline Estimate

  • Pilot (100 links, ~500 examples): 1–2 weeks (setup, generation, review).
  • Full Dataset (1000+ links): 4–8 weeks, depending on automation and review resources.

Next Steps

  • Clarify Input Details:
    • Share the format of your links and topics (e.g., sample CSV row).
    • Specify the number of links and content types (e.g., HTML, PDF).
  • Choose Approach:
    • Decide on manual, automated, or hybrid question generation.
    • Confirm compute resources (e.g., access to GPU for NLP models).
  • Start Pilot:
    • Select 10–20 links for a trial run.
    • Implement the pipeline for question generation and answer annotation.

Would you like to:

  1. Share details about your links/topics for a tailored script?
  2. Start with a code snippet for a specific step (e.g., question generation)?
  3. Discuss a specific aspect (e.g., automation vs. manual, legal concerns)?
