rbiswasfc · December 25, 2024 17:20
diff --git a/28_cot_input_prep.ipynb b/28_cot_input_prep.ipynb
 {
  "cells": [
    {
      "metadata": {},
      "id": "5283aef3",
      "cell_type": "markdown",
      "source": "# Imports"
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "0e1161c0",
      "cell_type": "code",
      "source": "import os\nimport pandas as pd\nimport ast\nimport numpy as np\nimport random\nfrom tqdm.auto import tqdm\n\npd.options.display.max_colwidth = None",
      "execution_count": 1,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "6ea16a33",
      "cell_type": "code",
      "source": "import kagglehub\nfrom collections import defaultdict\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {},
      "id": "bee10d2f",
      "cell_type": "markdown",
      "source": "# Data"
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "2f55fcde",
      "cell_type": "code",
      "source": "comp_dir = kagglehub.competition_download(\"eedi-mining-misconceptions-in-mathematics\")\n\ndf = pd.read_csv(os.path.join(comp_dir, \"train.csv\"))\ncontent_df = pd.read_csv(os.path.join(comp_dir, \"misconception_mapping.csv\"))\ndf.shape, content_df.shape",
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 3,
          "data": {
            "text/plain": "((1869, 15), (2587, 2))"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "a2f46f84",
      "cell_type": "code",
      "source": "host_qids = df[\"QuestionId\"].unique().tolist()\nmax(host_qids)",
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 4,
          "data": {
            "text/plain": "1868"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "dc94e09c",
      "cell_type": "code",
      "source": "fold_df = pd.read_parquet(\"../data/scratch/five_folds.parquet\")\ndf = pd.merge(df, fold_df, on=\"QuestionId\", how=\"left\")\ndf[\"kfold\"] = df[\"kfold\"].fillna(99)\n\nprint(df.kfold.value_counts())",
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": "kfold\n0    374\n3    374\n1    374\n2    374\n4    373\nName: count, dtype: int64\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "6745bccf",
      "cell_type": "code",
      "source": "syn_dir = kagglehub.dataset_download(\"conjuring92/eedi-silver-v3\")\n\nsyn_df = pd.read_csv(os.path.join(syn_dir, \"train.csv\"))\nsyn_content_df = pd.read_csv(os.path.join(syn_dir, \"misconception_mapping.csv\"))\nsyn_df.shape, syn_content_df.shape",
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "execution_count": 6,
          "data": {
            "text/plain": "((12473, 16), (4791, 2))"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "id": "91a53447",
      "cell_type": "markdown",
      "source": "# Load Related Misconceptions"
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "1ef9c172",
      "cell_type": "code",
      "source": "# we can use top retrieved misconception candidates for related misconceptions or use re-ranker predictions\npred_dir = kagglehub.dataset_download(\"conjuring92/eedi-tutor-mix-v7-n8-comp\")\n\ntrain_ret_df = pd.read_parquet(os.path.join(pred_dir, \"train.parquet\"))\nvalid_ret_df = pd.read_parquet(os.path.join(pred_dir, \"valid.parquet\"))\n\nret_df = pd.concat([train_ret_df, valid_ret_df]).reset_index(drop=True)",
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "0a6d5c7d",
      "cell_type": "code",
      "source": "id2name = dict(zip(content_df[\"MisconceptionId\"], content_df[\"MisconceptionName\"]))\nsyn_id2name = dict(zip(syn_content_df[\"MisconceptionId\"], syn_content_df[\"MisconceptionName\"]))",
      "execution_count": 9,
      "outputs": []
    },
    {
      "metadata": {},
      "id": "f56cabda",
      "cell_type": "markdown",
      "source": "# Format"
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "13b57860",
      "cell_type": "code",
      "source": "keep_cols = [\n    \"QuestionId\",\n    \"ConstructId\",\n    \"ConstructName\",\n    \"SubjectId\",\n    \"SubjectName\",\n    \"CorrectAnswer\",\n    \"QuestionText\",\n    \"AnswerAText\",\n    \"AnswerBText\",\n    \"AnswerCText\",\n    \"AnswerDText\",\n    \"MisconceptionAId\",\n    \"MisconceptionBId\",\n    \"MisconceptionCId\",\n    \"MisconceptionDId\",\n]\nmcq_df = df[keep_cols].copy()",
      "execution_count": 10,
      "outputs": []
    },
    {
      "metadata": {},
      "id": "c82c72dd",
      "cell_type": "markdown",
      "source": "# Paired Data"
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "36b4f03f",
      "cell_type": "code",
      "source": "def is_nan(x):\n    return x != x\n\n\ndef eedi_process_df(df, content_df, id2name):\n    df = df.copy()\n    df = df.rename(columns={\"QuestionId\": \"query_id\"})\n    grouped = df.groupby(\"query_id\")\n\n    question_dict = {}\n    for question_id, group in grouped:\n        question_data = group.to_dict(orient=\"records\")[0]\n        del question_data[\"query_id\"]\n        question_dict[question_id] = question_data\n\n    all_questions = list(question_dict.keys())\n\n    queries = []\n    for qid in all_questions:\n        info = question_dict[qid]\n\n        for answer_key in [\"A\", \"B\", \"C\", \"D\"]:\n            if info[\"CorrectAnswer\"] == answer_key:\n                continue\n\n            this_example = dict()\n            this_key = f\"{qid}_{answer_key}\"\n            this_example[\"query_id\"] = this_key\n\n            if is_nan(info[f\"Misconception{answer_key}Id\"]):\n                continue\n\n            mid = int(info[f\"Misconception{answer_key}Id\"])\n            mname = id2name[mid]\n\n            this_example[\"content_id\"] = str(mid)\n            this_example[\"MisconceptionName\"] = mname\n\n            for col in [\"SubjectName\", \"ConstructName\", \"QuestionText\"]:\n                this_example[col] = info[col]\n\n            this_example[\"CorrectAnswerText\"] = info[f\"Answer{info['CorrectAnswer']}Text\"]\n            this_example[\"InCorrectAnswerText\"] = info[f\"Answer{answer_key}Text\"]\n            this_example[\"AllOptionText\"] = \"\\n- \".join([info[f\"Answer{x}Text\"] for x in [\"A\", \"B\", \"C\", \"D\"]])\n            this_example[\"AllOptionText\"] = f\"\\n- {this_example['AllOptionText']}\"\n            queries.append(this_example)\n    # --\n    query_df = pd.DataFrame(queries)\n\n    query_df = query_df.reset_index(drop=True)\n\n    return query_df",
      "execution_count": 12,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "77898cae",
      "cell_type": "code",
      "source": "pair_df = eedi_process_df(mcq_df, content_df, id2name)\npair_df.shape",
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 13,
          "data": {
            "text/plain": "(4370, 9)"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "c75f9c61",
      "cell_type": "code",
      "source": "# ret_df",
      "execution_count": 17,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "11b9d0c9",
      "cell_type": "code",
      "source": "ret_df = ret_df[[\"query_id\", \"content_ids\"]].rename(columns={\"content_ids\": \"negative_ids\"})\npair_df = pair_df.merge(ret_df, on=\"query_id\", how=\"left\")\npair_df[\"negative_ids\"] = pair_df.apply(lambda row: [x for x in row[\"negative_ids\"] if int(x) != int(row[\"content_id\"])], axis=1)\npair_df[\"related_misconceptions\"] = pair_df[\"negative_ids\"].apply(lambda x: [syn_id2name[y] for y in x])",
      "execution_count": 19,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "4ed5908a",
      "cell_type": "code",
      "source": "pair_df[\"related_misconceptions\"] = pair_df[\"related_misconceptions\"].apply(lambda x: \"- \" + \"\\n- \".join(x))",
      "execution_count": 20,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "92976f75",
      "cell_type": "code",
      "source": "x = pair_df.sample().to_dict(orient=\"records\")[0]\nx[\"MisconceptionName\"] in x[\"related_misconceptions\"]",
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 21,
          "data": {
            "text/plain": "False"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "821d1bbe",
      "cell_type": "code",
      "source": "print(x[\"related_misconceptions\"])",
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "stream",
          "text": "- Confuses constant distance with stopping\n- Confuses constant distance with constant speed\n- Believes a less steep gradient corresponds to a faster speed\n- Thinks constant speed means constant displacement\n- Confuses displacement-time graphs with velocity-time graphs\n- Misunderstands continuous motion\n- Believes an upward slope on a distance-time graph means the movement has stopped\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "f20b6d54",
      "cell_type": "code",
      "source": "x[\"MisconceptionName\"]",
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 23,
          "data": {
            "text/plain": "'Confuses rest and movement on a distance time graph'"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "id": "344d942f",
      "cell_type": "markdown",
      "source": "# Dataset"
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "f69d39a5",
      "cell_type": "code",
      "source": "data_dir = \"../data/cot_prep\"\nos.makedirs(data_dir, exist_ok=True)\n\npair_df.to_csv(f\"{data_dir}/pair.csv\", index=False)",
      "execution_count": 24,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "id": "36b275f0",
      "cell_type": "code",
      "source": "kagglehub.dataset_upload(\"conjuring92/eedi-cot-gen-base\", data_dir)",
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "stream",
          "text": "Uploading Dataset https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base ...\nStarting upload for file ../data/cot_prep/pair.csv\nWarning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": "Uploading: 100%|███████████████████████████████████████████████████████████████████████████████████████| 4.25M/4.25M [00:03<00:00, 1.22MB/s]",
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": "Upload successful: ../data/cot_prep/pair.csv (4MB)\n",
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": "\n",
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\nYour dataset instance has been created.\nFiles are being processed...\nSee at: https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "b722174a",
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3 (ipykernel)",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.15",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "vscode": {
      "interpreter": {
        "hash": "9966d838c5789fe326f76162c2a6fc0341b2fd9319a92dbbd869a89bb7177318"
      }
    },
    "gist": {
      "id": "",
      "data": {
        "description": "28_cot_input_prep.ipynb",
        "public": true
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"metadata": {},
	"id": "5283aef3",
	"cell_type": "markdown",
	"source": "# Imports"
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "0e1161c0",
	"cell_type": "code",
	"source": "import os\nimport pandas as pd\nimport ast\nimport numpy as np\nimport random\nfrom tqdm.auto import tqdm\n\npd.options.display.max_colwidth = None",
	"execution_count": 1,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "6ea16a33",
	"cell_type": "code",
	"source": "import kagglehub\nfrom collections import defaultdict\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity",
	"execution_count": 2,
	"outputs": []
	},
	{
	"metadata": {},
	"id": "bee10d2f",
	"cell_type": "markdown",
	"source": "# Data"
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "2f55fcde",
	"cell_type": "code",
	"source": "comp_dir = kagglehub.competition_download(\"eedi-mining-misconceptions-in-mathematics\")\n\ndf = pd.read_csv(os.path.join(comp_dir, \"train.csv\"))\ncontent_df = pd.read_csv(os.path.join(comp_dir, \"misconception_mapping.csv\"))\ndf.shape, content_df.shape",
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 3,
	"data": {
	"text/plain": "((1869, 15), (2587, 2))"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "a2f46f84",
	"cell_type": "code",
	"source": "host_qids = df[\"QuestionId\"].unique().tolist()\nmax(host_qids)",
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 4,
	"data": {
	"text/plain": "1868"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "dc94e09c",
	"cell_type": "code",
	"source": "fold_df = pd.read_parquet(\"../data/scratch/five_folds.parquet\")\ndf = pd.merge(df, fold_df, on=\"QuestionId\", how=\"left\")\ndf[\"kfold\"] = df[\"kfold\"].fillna(99)\n\nprint(df.kfold.value_counts())",
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "stream",
	"text": "kfold\n0 374\n3 374\n1 374\n2 374\n4 373\nName: count, dtype: int64\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "6745bccf",
	"cell_type": "code",
	"source": "syn_dir = kagglehub.dataset_download(\"conjuring92/eedi-silver-v3\")\n\nsyn_df = pd.read_csv(os.path.join(syn_dir, \"train.csv\"))\nsyn_content_df = pd.read_csv(os.path.join(syn_dir, \"misconception_mapping.csv\"))\nsyn_df.shape, syn_content_df.shape",
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "stream",
	"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
	"name": "stdout"
	},
	{
	"output_type": "execute_result",
	"execution_count": 6,
	"data": {
	"text/plain": "((12473, 16), (4791, 2))"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"id": "91a53447",
	"cell_type": "markdown",
	"source": "# Load Related Misconceptions"
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "1ef9c172",
	"cell_type": "code",
	"source": "# we can use top retrieved misconception candidates for related misconceptions or use re-ranker predictions\npred_dir = kagglehub.dataset_download(\"conjuring92/eedi-tutor-mix-v7-n8-comp\")\n\ntrain_ret_df = pd.read_parquet(os.path.join(pred_dir, \"train.parquet\"))\nvalid_ret_df = pd.read_parquet(os.path.join(pred_dir, \"valid.parquet\"))\n\nret_df = pd.concat([train_ret_df, valid_ret_df]).reset_index(drop=True)",
	"execution_count": 8,
	"outputs": [
	{
	"output_type": "stream",
	"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "0a6d5c7d",
	"cell_type": "code",
	"source": "id2name = dict(zip(content_df[\"MisconceptionId\"], content_df[\"MisconceptionName\"]))\nsyn_id2name = dict(zip(syn_content_df[\"MisconceptionId\"], syn_content_df[\"MisconceptionName\"]))",
	"execution_count": 9,
	"outputs": []
	},
	{
	"metadata": {},
	"id": "f56cabda",
	"cell_type": "markdown",
	"source": "# Format"
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "13b57860",
	"cell_type": "code",
	"source": "keep_cols = [\n \"QuestionId\",\n \"ConstructId\",\n \"ConstructName\",\n \"SubjectId\",\n \"SubjectName\",\n \"CorrectAnswer\",\n \"QuestionText\",\n \"AnswerAText\",\n \"AnswerBText\",\n \"AnswerCText\",\n \"AnswerDText\",\n \"MisconceptionAId\",\n \"MisconceptionBId\",\n \"MisconceptionCId\",\n \"MisconceptionDId\",\n]\nmcq_df = df[keep_cols].copy()",
	"execution_count": 10,
	"outputs": []
	},
	{
	"metadata": {},
	"id": "c82c72dd",
	"cell_type": "markdown",
	"source": "# Paired Data"
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "36b4f03f",
	"cell_type": "code",
	"source": "def is_nan(x):\n return x != x\n\n\ndef eedi_process_df(df, content_df, id2name):\n df = df.copy()\n df = df.rename(columns={\"QuestionId\": \"query_id\"})\n grouped = df.groupby(\"query_id\")\n\n question_dict = {}\n for question_id, group in grouped:\n question_data = group.to_dict(orient=\"records\")[0]\n del question_data[\"query_id\"]\n question_dict[question_id] = question_data\n\n all_questions = list(question_dict.keys())\n\n queries = []\n for qid in all_questions:\n info = question_dict[qid]\n\n for answer_key in [\"A\", \"B\", \"C\", \"D\"]:\n if info[\"CorrectAnswer\"] == answer_key:\n continue\n\n this_example = dict()\n this_key = f\"{qid}_{answer_key}\"\n this_example[\"query_id\"] = this_key\n\n if is_nan(info[f\"Misconception{answer_key}Id\"]):\n continue\n\n mid = int(info[f\"Misconception{answer_key}Id\"])\n mname = id2name[mid]\n\n this_example[\"content_id\"] = str(mid)\n this_example[\"MisconceptionName\"] = mname\n\n for col in [\"SubjectName\", \"ConstructName\", \"QuestionText\"]:\n this_example[col] = info[col]\n\n this_example[\"CorrectAnswerText\"] = info[f\"Answer{info['CorrectAnswer']}Text\"]\n this_example[\"InCorrectAnswerText\"] = info[f\"Answer{answer_key}Text\"]\n this_example[\"AllOptionText\"] = \"\\n- \".join([info[f\"Answer{x}Text\"] for x in [\"A\", \"B\", \"C\", \"D\"]])\n this_example[\"AllOptionText\"] = f\"\\n- {this_example['AllOptionText']}\"\n queries.append(this_example)\n # --\n query_df = pd.DataFrame(queries)\n\n query_df = query_df.reset_index(drop=True)\n\n return query_df",
	"execution_count": 12,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "77898cae",
	"cell_type": "code",
	"source": "pair_df = eedi_process_df(mcq_df, content_df, id2name)\npair_df.shape",
	"execution_count": 13,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 13,
	"data": {
	"text/plain": "(4370, 9)"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "c75f9c61",
	"cell_type": "code",
	"source": "# ret_df",
	"execution_count": 17,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "11b9d0c9",
	"cell_type": "code",
	"source": "ret_df = ret_df[[\"query_id\", \"content_ids\"]].rename(columns={\"content_ids\": \"negative_ids\"})\npair_df = pair_df.merge(ret_df, on=\"query_id\", how=\"left\")\npair_df[\"negative_ids\"] = pair_df.apply(lambda row: [x for x in row[\"negative_ids\"] if int(x) != int(row[\"content_id\"])], axis=1)\npair_df[\"related_misconceptions\"] = pair_df[\"negative_ids\"].apply(lambda x: [syn_id2name[y] for y in x])",
	"execution_count": 19,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "4ed5908a",
	"cell_type": "code",
	"source": "pair_df[\"related_misconceptions\"] = pair_df[\"related_misconceptions\"].apply(lambda x: \"- \" + \"\\n- \".join(x))",
	"execution_count": 20,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "92976f75",
	"cell_type": "code",
	"source": "x = pair_df.sample().to_dict(orient=\"records\")[0]\nx[\"MisconceptionName\"] in x[\"related_misconceptions\"]",
	"execution_count": 21,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 21,
	"data": {
	"text/plain": "False"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "821d1bbe",
	"cell_type": "code",
	"source": "print(x[\"related_misconceptions\"])",
	"execution_count": 22,
	"outputs": [
	{
	"output_type": "stream",
	"text": "- Confuses constant distance with stopping\n- Confuses constant distance with constant speed\n- Believes a less steep gradient corresponds to a faster speed\n- Thinks constant speed means constant displacement\n- Confuses displacement-time graphs with velocity-time graphs\n- Misunderstands continuous motion\n- Believes an upward slope on a distance-time graph means the movement has stopped\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "f20b6d54",
	"cell_type": "code",
	"source": "x[\"MisconceptionName\"]",
	"execution_count": 23,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 23,
	"data": {
	"text/plain": "'Confuses rest and movement on a distance time graph'"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"id": "344d942f",
	"cell_type": "markdown",
	"source": "# Dataset"
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "f69d39a5",
	"cell_type": "code",
	"source": "data_dir = \"../data/cot_prep\"\nos.makedirs(data_dir, exist_ok=True)\n\npair_df.to_csv(f\"{data_dir}/pair.csv\", index=False)",
	"execution_count": 24,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"id": "36b275f0",
	"cell_type": "code",
	"source": "kagglehub.dataset_upload(\"conjuring92/eedi-cot-gen-base\", data_dir)",
	"execution_count": 25,
	"outputs": [
	{
	"output_type": "stream",
	"text": "Uploading Dataset https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base ...\nStarting upload for file ../data/cot_prep/pair.csv\nWarning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
	"name": "stdout"
	},
	{
	"output_type": "stream",
	"text": "Uploading: 100%\|███████████████████████████████████████████████████████████████████████████████████████\| 4.25M/4.25M [00:03<00:00, 1.22MB/s]",
	"name": "stderr"
	},
	{
	"output_type": "stream",
	"text": "Upload successful: ../data/cot_prep/pair.csv (4MB)\n",
	"name": "stdout"
	},
	{
	"output_type": "stream",
	"text": "\n",
	"name": "stderr"
	},
	{
	"output_type": "stream",
	"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\nYour dataset instance has been created.\nFiles are being processed...\nSee at: https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "b722174a",
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3 (ipykernel)",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.10.15",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"vscode": {
	"interpreter": {
	"hash": "9966d838c5789fe326f76162c2a6fc0341b2fd9319a92dbbd869a89bb7177318"
	}
	},
	"gist": {
	"id": "",
	"data": {
	"description": "28_cot_input_prep.ipynb",
	"public": true
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}