Last active
December 25, 2024 17:20
-
-
Save rbiswasfc/aa5fc1450804f5f02787fb0d2c2e6ea6 to your computer and use it in GitHub Desktop.
Prep for CoT Generation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"id": "5283aef3", | |
"cell_type": "markdown", | |
"source": "# Imports" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "0e1161c0", | |
"cell_type": "code", | |
"source": "import os\nimport pandas as pd\nimport ast\nimport numpy as np\nimport random\nfrom tqdm.auto import tqdm\n\npd.options.display.max_colwidth = None", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "6ea16a33", | |
"cell_type": "code", | |
"source": "import kagglehub\nfrom collections import defaultdict\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"id": "bee10d2f", | |
"cell_type": "markdown", | |
"source": "# Data" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "2f55fcde", | |
"cell_type": "code", | |
"source": "# Fetch the competition files through kagglehub, then read the two CSVs used below.\ncomp_dir = kagglehub.competition_download(\"eedi-mining-misconceptions-in-mathematics\")\n\ndf, content_df = (\n    pd.read_csv(os.path.join(comp_dir, file_name))\n    for file_name in (\"train.csv\", \"misconception_mapping.csv\")\n)\n\n(df.shape, content_df.shape)", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 3, | |
"data": { | |
"text/plain": "((1869, 15), (2587, 2))" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "a2f46f84", | |
"cell_type": "code", | |
"source": "host_qids = df[\"QuestionId\"].unique().tolist()\nmax(host_qids)", | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 4, | |
"data": { | |
"text/plain": "1868" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "dc94e09c", | |
"cell_type": "code", | |
"source": "fold_df = pd.read_parquet(\"../data/scratch/five_folds.parquet\")\n\n# Drop a kfold column left by an earlier run so re-executing this cell does not\n# produce kfold_x / kfold_y; many_to_one fails loudly if the fold file repeats a QuestionId.\ndf = pd.merge(\n    df.drop(columns=\"kfold\", errors=\"ignore\"),\n    fold_df,\n    on=\"QuestionId\",\n    how=\"left\",\n    validate=\"many_to_one\",\n)\n# 99 marks questions absent from the fold file; the cast keeps the column integer\n# because a left merge that introduces NaN would turn it into floats.\ndf[\"kfold\"] = df[\"kfold\"].fillna(99).astype(int)\n\nprint(df.kfold.value_counts())", | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "kfold\n0 374\n3 374\n1 374\n2 374\n4 373\nName: count, dtype: int64\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "6745bccf", | |
"cell_type": "code", | |
"source": "# Same two file names as the competition data, read from the eedi-silver-v3 dataset.\nsyn_dir = kagglehub.dataset_download(\"conjuring92/eedi-silver-v3\")\n\nsyn_df, syn_content_df = (\n    pd.read_csv(os.path.join(syn_dir, file_name))\n    for file_name in (\"train.csv\", \"misconception_mapping.csv\")\n)\n\n(syn_df.shape, syn_content_df.shape)", | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 6, | |
"data": { | |
"text/plain": "((12473, 16), (4791, 2))" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"id": "91a53447", | |
"cell_type": "markdown", | |
"source": "# Load Related Misconceptions" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "1ef9c172", | |
"cell_type": "code", | |
"source": "# we can use top retrieved misconception candidates for related misconceptions or use re-ranker predictions\npred_dir = kagglehub.dataset_download(\"conjuring92/eedi-tutor-mix-v7-n8-comp\")\n\ntrain_ret_df, valid_ret_df = (\n    pd.read_parquet(os.path.join(pred_dir, file_name))\n    for file_name in (\"train.parquet\", \"valid.parquet\")\n)\n\n# Stack both splits under a fresh 0..n-1 index.\nret_df = pd.concat([train_ret_df, valid_ret_df], ignore_index=True)", | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "0a6d5c7d", | |
"cell_type": "code", | |
"source": "def name_lookup(mapping_df):\n    \"\"\"Map each MisconceptionId in `mapping_df` to its MisconceptionName.\"\"\"\n    return dict(zip(mapping_df[\"MisconceptionId\"], mapping_df[\"MisconceptionName\"]))\n\n\nid2name = name_lookup(content_df)\nsyn_id2name = name_lookup(syn_content_df)", | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"id": "f56cabda", | |
"cell_type": "markdown", | |
"source": "# Format" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "13b57860", | |
"cell_type": "code", | |
"source": "# Question metadata, the four answer texts and their misconception ids.\nkeep_cols = [\n    \"QuestionId\", \"ConstructId\", \"ConstructName\", \"SubjectId\", \"SubjectName\",\n    \"CorrectAnswer\", \"QuestionText\",\n    \"AnswerAText\", \"AnswerBText\", \"AnswerCText\", \"AnswerDText\",\n    \"MisconceptionAId\", \"MisconceptionBId\", \"MisconceptionCId\", \"MisconceptionDId\",\n]\nmcq_df = df.loc[:, keep_cols].copy()", | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"id": "c82c72dd", | |
"cell_type": "markdown", | |
"source": "# Paired Data" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "36b4f03f", | |
"cell_type": "code", | |
"source": "def is_nan(x):\n    \"\"\"Return True when the scalar `x` is a missing value (NaN, None, pd.NA or NaT).\"\"\"\n    # `x != x` is only true for float NaN; pd.isna also recognises None and pd.NA.\n    return bool(pd.isna(x))\n\n\ndef eedi_process_df(df, content_df, id2name):\n    \"\"\"Expand one-row-per-question MCQ data into one row per labelled distractor.\n\n    Args:\n        df: frame with QuestionId, CorrectAnswer, SubjectName, ConstructName,\n            QuestionText, Answer{A-D}Text and Misconception{A-D}Id columns.\n        content_df: unused; kept so existing call sites keep working.\n        id2name: mapping from integer misconception id to misconception name.\n\n    Returns:\n        DataFrame with one row per incorrect option that carries a misconception\n        id. `query_id` is \"<QuestionId>_<option>\" and `content_id` is the\n        misconception id as a string. The columns are present even when no row\n        qualifies.\n\n    Raises:\n        KeyError: if a misconception id is missing from `id2name`.\n    \"\"\"\n    option_keys = [\"A\", \"B\", \"C\", \"D\"]\n    text_cols = [\"SubjectName\", \"ConstructName\", \"QuestionText\"]\n    out_cols = [\"query_id\", \"content_id\", \"MisconceptionName\", *text_cols]\n    out_cols += [\"CorrectAnswerText\", \"InCorrectAnswerText\", \"AllOptionText\"]\n\n    # First row per question, ordered by id: what iterating a groupby yields,\n    # without building one sub-frame per question.\n    questions = (\n        df.rename(columns={\"QuestionId\": \"query_id\"})\n        .dropna(subset=[\"query_id\"])\n        .drop_duplicates(subset=\"query_id\", keep=\"first\")\n        .sort_values(\"query_id\")\n    )\n\n    queries = []\n    for info in questions.to_dict(orient=\"records\"):\n        qid = info[\"query_id\"]\n        correct_key = info[\"CorrectAnswer\"]\n\n        for answer_key in option_keys:\n            raw_mid = info[f\"Misconception{answer_key}Id\"]\n            # Skip the correct option and distractors without a labelled misconception.\n            if answer_key == correct_key or is_nan(raw_mid):\n                continue\n\n            mid = int(raw_mid)\n            all_options = \"\\n- \".join(info[f\"Answer{k}Text\"] for k in option_keys)\n            queries.append(\n                {\n                    \"query_id\": f\"{qid}_{answer_key}\",\n                    \"content_id\": str(mid),\n                    \"MisconceptionName\": id2name[mid],\n                    **{col: info[col] for col in text_cols},\n                    \"CorrectAnswerText\": info[f\"Answer{correct_key}Text\"],\n                    \"InCorrectAnswerText\": info[f\"Answer{answer_key}Text\"],\n                    \"AllOptionText\": f\"\\n- {all_options}\",\n                }\n            )\n\n    # Passing the column list keeps the schema stable when `queries` is empty.\n    return pd.DataFrame(queries, columns=out_cols)", | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "77898cae", | |
"cell_type": "code", | |
"source": "pair_df = eedi_process_df(mcq_df, content_df, id2name)\npair_df.shape", | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 13, | |
"data": { | |
"text/plain": "(4370, 9)" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "c75f9c61", | |
"cell_type": "code", | |
"source": "# ret_df", | |
"execution_count": 17, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "11b9d0c9", | |
"cell_type": "code", | |
"source": "# Safe to re-run: the rename is a no-op once applied, and columns written by an\n# earlier run are dropped before merging so no _x/_y duplicates appear.\nret_df = ret_df.rename(columns={\"content_ids\": \"negative_ids\"})[[\"query_id\", \"negative_ids\"]]\npair_df = pair_df.drop(columns=[\"negative_ids\", \"related_misconceptions\"], errors=\"ignore\").merge(\n    ret_df, on=\"query_id\", how=\"left\"\n)\n\n\ndef drop_positive(row):\n    \"\"\"Return the retrieved ids of `row` without its labelled misconception id.\"\"\"\n    candidates = row[\"negative_ids\"]\n    # The left merge leaves NaN (a float) for queries with no retrieval row.\n    if candidates is None or isinstance(candidates, float):\n        return []\n    positive = int(row[\"content_id\"])\n    return [x for x in candidates if int(x) != positive]\n\n\npair_df[\"negative_ids\"] = pair_df.apply(drop_positive, axis=1)\n# Look names up with the same int cast used for the comparison above.\npair_df[\"related_misconceptions\"] = pair_df[\"negative_ids\"].apply(\n    lambda ids: [syn_id2name[int(y)] for y in ids]\n)", | |
"execution_count": 19, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "4ed5908a", | |
"cell_type": "code", | |
"source": "# Render each list as a \"- \" bullet block. Values that are already strings are\n# kept as-is: joining a string would splice the separator between its characters\n# if this cell were executed twice.\npair_df[\"related_misconceptions\"] = pair_df[\"related_misconceptions\"].apply(\n    lambda x: x if isinstance(x, str) else \"- \" + \"\\n- \".join(x)\n)", | |
"execution_count": 20, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "92976f75", | |
"cell_type": "code", | |
"source": "x = pair_df.sample().to_dict(orient=\"records\")[0]\nx[\"MisconceptionName\"] in x[\"related_misconceptions\"]", | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 21, | |
"data": { | |
"text/plain": "False" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "821d1bbe", | |
"cell_type": "code", | |
"source": "print(x[\"related_misconceptions\"])", | |
"execution_count": 22, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "- Confuses constant distance with stopping\n- Confuses constant distance with constant speed\n- Believes a less steep gradient corresponds to a faster speed\n- Thinks constant speed means constant displacement\n- Confuses displacement-time graphs with velocity-time graphs\n- Misunderstands continuous motion\n- Believes an upward slope on a distance-time graph means the movement has stopped\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "f20b6d54", | |
"cell_type": "code", | |
"source": "x[\"MisconceptionName\"]", | |
"execution_count": 23, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 23, | |
"data": { | |
"text/plain": "'Confuses rest and movement on a distance time graph'" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"id": "344d942f", | |
"cell_type": "markdown", | |
"source": "# Dataset" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "f69d39a5", | |
"cell_type": "code", | |
"source": "# Write the paired frame into the directory that the upload call below publishes.\ndata_dir = \"../data/cot_prep\"\nos.makedirs(data_dir, exist_ok=True)\n\npair_csv = f\"{data_dir}/pair.csv\"\npair_df.to_csv(pair_csv, index=False)", | |
"execution_count": 24, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "36b275f0", | |
"cell_type": "code", | |
"source": "kagglehub.dataset_upload(\"conjuring92/eedi-cot-gen-base\", data_dir)", | |
"execution_count": 25, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Uploading Dataset https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base ...\nStarting upload for file ../data/cot_prep/pair.csv\nWarning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "Uploading: 100%|███████████████████████████████████████████████████████████████████████████████████████| 4.25M/4.25M [00:03<00:00, 1.22MB/s]", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "Upload successful: ../data/cot_prep/pair.csv (4MB)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "Warning: Looks like you're using an outdated `kagglehub` version, please consider updating (latest version: 0.3.6)\nYour dataset instance has been created.\nFiles are being processed...\nSee at: https://www.kaggle.com/datasets/conjuring92/eedi-cot-gen-base\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"id": "b722174a", | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.10.15", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"vscode": { | |
"interpreter": { | |
"hash": "9966d838c5789fe326f76162c2a6fc0341b2fd9319a92dbbd869a89bb7177318" | |
} | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "28_cot_input_prep.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment