Skip to content

Instantly share code, notes, and snippets.

@rbiswasfc
Last active December 25, 2024 17:20
Show Gist options
  • Save rbiswasfc/aa5fc1450804f5f02787fb0d2c2e6ea6 to your computer and use it in GitHub Desktop.
Save rbiswasfc/aa5fc1450804f5f02787fb0d2c2e6ea6 to your computer and use it in GitHub Desktop.
Prep for CoT Generation
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"id": "5283aef3",
"cell_type": "markdown",
"source": "# Imports"
},
{
"metadata": {
"trusted": true
},
"id": "0e1161c0",
"cell_type": "code",
"source": "import os\nimport pandas as pd\nimport ast\nimport numpy as np\nimport random\nfrom tqdm.auto import tqdm\n\npd.options.display.max_colwidth = None",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "6ea16a33",
"cell_type": "code",
"source": "import kagglehub\nfrom collections import defaultdict\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity",
"execution_count": 2,
"outputs": []
},
{
"metadata": {},
"id": "bee10d2f",
"cell_type": "markdown",
"source": "# Data"
},
{
"metadata": {
"trusted": true
},
"id": "2f55fcde",
"cell_type": "code",
"source": "# Download the competition files with kagglehub, then load train.csv into df and\n# misconception_mapping.csv into content_df; the last line displays both shapes.\ncomp_dir = kagglehub.competition_download(\"eedi-mining-misconceptions-in-mathematics\")\n\ndf = pd.read_csv(os.path.join(comp_dir, \"train.csv\"))\ncontent_df = pd.read_csv(os.path.join(comp_dir, \"misconception_mapping.csv\"))\ndf.shape, content_df.shape",
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 3,
"data": {
"text/plain": "((1869, 15), (2587, 2))"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"id": "a2f46f84",
"cell_type": "code",
"source": "# Distinct QuestionId values of the competition train set; display the largest one.\nhost_qids = df[\"QuestionId\"].unique().tolist()\nmax(host_qids)",
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 4,
"data": {
"text/plain": "1868"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"id": "dc94e09c",
"cell_type": "code",
"source": "# Attach the kfold column from five_folds.parquet by QuestionId; questions without\n# a fold get the sentinel value 99.\nfold_df = pd.read_parquet(\"../data/scratch/five_folds.parquet\")\n# Drop any kfold column left by an earlier run so re-executing this cell does not\n# produce kfold_x / kfold_y and then fail on df[\"kfold\"].\ndf = pd.merge(df.drop(columns=[\"kfold\"], errors=\"ignore\"), fold_df, on=\"QuestionId\", how=\"left\")\ndf[\"kfold\"] = df[\"kfold\"].fillna(99)\n\nprint(df.kfold.value_counts())",
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": "kfold\n0 374\n3 374\n1 374\n2 374\n4 373\nName: count, dtype: int64\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "6745bccf",
"cell_type": "code",
"source": "# Download the conjuring92/eedi-silver-v3 dataset and load its train.csv into syn_df and\n# its misconception_mapping.csv into syn_content_df; the last line displays both shapes.\nsyn_dir = kagglehub.dataset_download(\"conjuring92/eedi-silver-v3\")\n\nsyn_df = pd.read_csv(os.path.join(syn_dir, \"train.csv\"))\nsyn_content_df = pd.read_csv(os.path.join(syn_dir, \"misconception_mapping.csv\"))\nsyn_df.shape, syn_content_df.shape",
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
"name": "stdout"
},
{
"output_type": "execute_result",
"execution_count": 6,
"data": {
"text/plain": "((12473, 16), (4791, 2))"
},
"metadata": {}
}
]
},
{
"metadata": {},
"id": "91a53447",
"cell_type": "markdown",
"source": "# Load Related Misconceptions"
},
{
"metadata": {
"trusted": true
},
"id": "1ef9c172",
"cell_type": "code",
"source": "# Related misconceptions can come from the top retrieved candidates or from re-ranker predictions.\n# Here the saved predictions are downloaded and the train and valid parquet files are stacked\n# into a single frame with a fresh 0..n-1 index.\npred_dir = kagglehub.dataset_download(\"conjuring92/eedi-tutor-mix-v7-n8-comp\")\n\ntrain_ret_df = pd.read_parquet(os.path.join(pred_dir, \"train.parquet\"))\nvalid_ret_df = pd.read_parquet(os.path.join(pred_dir, \"valid.parquet\"))\n\nret_df = pd.concat([train_ret_df, valid_ret_df]).reset_index(drop=True)",
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "0a6d5c7d",
"cell_type": "code",
"source": "# MisconceptionId -> MisconceptionName lookups: id2name is built from the competition mapping,\n# syn_id2name from the eedi-silver-v3 mapping.\nid2name = dict(zip(content_df[\"MisconceptionId\"], content_df[\"MisconceptionName\"]))\nsyn_id2name = dict(zip(syn_content_df[\"MisconceptionId\"], syn_content_df[\"MisconceptionName\"]))",
"execution_count": 9,
"outputs": []
},
{
"metadata": {},
"id": "f56cabda",
"cell_type": "markdown",
"source": "# Format"
},
{
"metadata": {
"trusted": true
},
"id": "13b57860",
"cell_type": "code",
"source": "# Columns carried into mcq_df: question/construct/subject fields, the correct answer key,\n# the four answer texts and the four per-option misconception ids.\nkeep_cols = [\n \"QuestionId\",\n \"ConstructId\",\n \"ConstructName\",\n \"SubjectId\",\n \"SubjectName\",\n \"CorrectAnswer\",\n \"QuestionText\",\n \"AnswerAText\",\n \"AnswerBText\",\n \"AnswerCText\",\n \"AnswerDText\",\n \"MisconceptionAId\",\n \"MisconceptionBId\",\n \"MisconceptionCId\",\n \"MisconceptionDId\",\n]\n# copy() so that mcq_df is independent of df.\nmcq_df = df[keep_cols].copy()",
"execution_count": 10,
"outputs": []
},
{
"metadata": {},
"id": "c82c72dd",
"cell_type": "markdown",
"source": "# Paired Data"
},
{
"metadata": {
"trusted": true
},
"id": "36b4f03f",
"cell_type": "code",
"source": "def is_nan(x):\n    \"\"\"Return True when x is a missing value.\n\n    Scalars are checked with pd.isna, so None, pd.NA and NaT count as missing\n    in addition to float NaN. Non-scalars keep the original x != x comparison.\n    \"\"\"\n    if pd.api.types.is_scalar(x):\n        return bool(pd.isna(x))\n    return x != x\n\n\ndef eedi_process_df(df, content_df, id2name):\n    \"\"\"Build one row per wrong answer option that has a labelled misconception.\n\n    Args:\n        df: question table with QuestionId, CorrectAnswer, SubjectName,\n            ConstructName, QuestionText, Answer{A..D}Text and\n            Misconception{A..D}Id columns.\n        content_df: not used; kept so existing calls keep working.\n        id2name: dict mapping an integer misconception id to its name.\n\n    Returns:\n        DataFrame with columns query_id (QuestionId and option letter joined by\n        an underscore), content_id (misconception id as a string),\n        MisconceptionName, SubjectName, ConstructName, QuestionText,\n        CorrectAnswerText, InCorrectAnswerText and AllOptionText. The columns\n        are present even when no row qualifies.\n\n    Raises:\n        KeyError: if a misconception id is not a key of id2name.\n    \"\"\"\n    options = [\"A\", \"B\", \"C\", \"D\"]\n    out_cols = [\n        \"query_id\",\n        \"content_id\",\n        \"MisconceptionName\",\n        \"SubjectName\",\n        \"ConstructName\",\n        \"QuestionText\",\n        \"CorrectAnswerText\",\n        \"InCorrectAnswerText\",\n        \"AllOptionText\",\n    ]\n\n    # Keep the first row of each question in ascending id order; rows without an\n    # id are dropped. This matches the former groupby + first-record behaviour.\n    questions = (\n        df.rename(columns={\"QuestionId\": \"query_id\"})\n        .dropna(subset=[\"query_id\"])\n        .drop_duplicates(subset=\"query_id\", keep=\"first\")\n        .sort_values(\"query_id\")\n    )\n\n    queries = []\n    for info in questions.to_dict(orient=\"records\"):\n        qid = info[\"query_id\"]\n        for answer_key in options:\n            # The correct option carries no misconception.\n            if info[\"CorrectAnswer\"] == answer_key:\n                continue\n            # Wrong options without a labelled misconception are skipped.\n            raw_mid = info[f\"Misconception{answer_key}Id\"]\n            if is_nan(raw_mid):\n                continue\n\n            # Ids are read as floats when the column contains NaN, hence int().\n            mid = int(raw_mid)\n            mname = id2name[mid]\n            all_options = \"\\n- \".join([info[f\"Answer{x}Text\"] for x in options])\n            queries.append(\n                {\n                    \"query_id\": f\"{qid}_{answer_key}\",\n                    \"content_id\": str(mid),\n                    \"MisconceptionName\": mname,\n                    \"SubjectName\": info[\"SubjectName\"],\n                    \"ConstructName\": info[\"ConstructName\"],\n                    \"QuestionText\": info[\"QuestionText\"],\n                    \"CorrectAnswerText\": info[f\"Answer{info['CorrectAnswer']}Text\"],\n                    \"InCorrectAnswerText\": info[f\"Answer{answer_key}Text\"],\n                    \"AllOptionText\": f\"\\n- {all_options}\",\n                }\n            )\n\n    # Explicit columns keep the schema stable for an empty result.\n    return pd.DataFrame(queries, columns=out_cols)",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "77898cae",
"cell_type": "code",
"source": "# Build the pair frame from mcq_df using the competition id2name lookup and display its shape.\npair_df = eedi_process_df(mcq_df, content_df, id2name)\npair_df.shape",
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 13,
"data": {
"text/plain": "(4370, 9)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"id": "c75f9c61",
"cell_type": "code",
"source": "# ret_df",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "11b9d0c9",
"cell_type": "code",
"source": "# Keep only the retrieved candidate ids per query. Renaming before selecting makes this\n# line a no-op on a re-run instead of failing on the missing content_ids column.\nret_df = ret_df.rename(columns={\"content_ids\": \"negative_ids\"})[[\"query_id\", \"negative_ids\"]]\n\n# Drop columns left by an earlier run so the merge does not create negative_ids_x / negative_ids_y.\npair_df = pair_df.drop(columns=[\"negative_ids\", \"related_misconceptions\"], errors=\"ignore\")\npair_df = pair_df.merge(ret_df, on=\"query_id\", how=\"left\")\n\n\ndef _drop_positive(row):\n    \"\"\"Return the retrieved ids of a row without its ground-truth misconception id.\"\"\"\n    candidates = row[\"negative_ids\"]\n    # A query with no match in ret_df gets NaN from the left merge; treat it as no candidates.\n    if candidates is None or isinstance(candidates, float):\n        return []\n    return [x for x in candidates if int(x) != int(row[\"content_id\"])]\n\n\npair_df[\"negative_ids\"] = pair_df.apply(_drop_positive, axis=1)\n# NOTE(review): names are looked up in syn_id2name (silver mapping) while the positives use\n# id2name (competition mapping) - confirm the ids in ret_df belong to the silver id space.\npair_df[\"related_misconceptions\"] = pair_df[\"negative_ids\"].apply(lambda x: [syn_id2name[y] for y in x])",
"execution_count": 19,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "4ed5908a",
"cell_type": "code",
"source": "# Render each list of names as a bullet block, one name per line with a leading dash.\n# Values that are already strings (this cell was run before) are left unchanged, so a\n# re-run no longer joins the individual characters of the formatted text.\npair_df[\"related_misconceptions\"] = pair_df[\"related_misconceptions\"].apply(\n    lambda x: x if isinstance(x, str) else \"- \" + \"\\n- \".join(x)\n)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "92976f75",
"cell_type": "code",
"source": "# Spot check on one randomly sampled row (no seed, so the row differs between runs):\n# substring test of the ground-truth misconception name against the related-misconception text.\nx = pair_df.sample().to_dict(orient=\"records\")[0]\nx[\"MisconceptionName\"] in x[\"related_misconceptions\"]",
"execution_count": 21,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 21,
"data": {
"text/plain": "False"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"id": "821d1bbe",
"cell_type": "code",
"source": "# print() so the newline-separated bullet list of the sampled row renders line by line.\nprint(x[\"related_misconceptions\"])",
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"text": "- Confuses constant distance with stopping\n- Confuses constant distance with constant speed\n- Believes a less steep gradient corresponds to a faster speed\n- Thinks constant speed means constant displacement\n- Confuses displacement-time graphs with velocity-time graphs\n- Misunderstands continuous motion\n- Believes an upward slope on a distance-time graph means the movement has stopped\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "f20b6d54",
"cell_type": "code",
"source": "# Ground-truth misconception name of the same sampled row, for comparison with the list above.\nx[\"MisconceptionName\"]",
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 23,
"data": {
"text/plain": "'Confuses rest and movement on a distance time graph'"
},
"metadata": {}
}
]
},
{
"metadata": {},
"id": "344d942f",
"cell_type": "markdown",
"source": "# Dataset"
},
{
"metadata": {
"trusted": true
},
"id": "f69d39a5",
"cell_type": "code",
"source": "# Write pair_df to ../data/cot_prep/pair.csv without the index, creating the directory if needed.\ndata_dir = \"../data/cot_prep\"\nos.makedirs(data_dir, exist_ok=True)\n\npair_df.to_csv(f\"{data_dir}/pair.csv\", index=False)",
"execution_count": 24,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "36b275f0",
"cell_type": "code",
"source": "# Upload the contents of data_dir to the Kaggle dataset conjuring92/eedi-cot-gen-base.\n# NOTE(review): presumably relies on Kaggle credentials configured outside this notebook - confirm.\nkagglehub.dataset_upload(\"conjuring92/eedi-cot-gen-base\", data_dir)",
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"text": "Uploading Dataset https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base ...\nStarting upload for file ../data/cot_prep/pair.csv\nWarning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "Uploading: 100%|███████████████████████████████████████████████████████████████████████████████████████| 4.25M/4.25M [00:03<00:00, 1.22MB/s]",
"name": "stderr"
},
{
"output_type": "stream",
"text": "Upload successful: ../data/cot_prep/pair.csv (4MB)\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "\n",
"name": "stderr"
},
{
"output_type": "stream",
"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\nYour dataset instance has been created.\nFiles are being processed...\nSee at: https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": false
},
"id": "b722174a",
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3 (ipykernel)",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.10.15",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"vscode": {
"interpreter": {
"hash": "9966d838c5789fe326f76162c2a6fc0341b2fd9319a92dbbd869a89bb7177318"
}
},
"gist": {
"id": "",
"data": {
"description": "28_cot_input_prep.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment