isaaccorley · February 6, 2025 18:41
diff --git a/InfraredSolarModules_RandomForest.ipynb b/InfraredSolarModules_RandomForest.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget https://raw.githubusercontent.com/RaptorMaps/InfraredSolarModules/master/2020-02-14_InfraredSolarModules.zip\n",
    "!unzip 2020-02-14_InfraredSolarModules.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install numpy pillow tqdm scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20000/20000 [00:03<00:00, 5425.94it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(20000, 960) (20000,)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "from PIL import Image\n",
    "from tqdm import tqdm\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "root = \"InfraredSolarModules\"\n",
    "with open(os.path.join(root, \"module_metadata.json\"), \"r\") as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "classes = sorted(list(set([v[\"anomaly_class\"] for v in data.values()])))\n",
    "cls2idx = {cls: i for i, cls in enumerate(classes)}\n",
    "images = [os.path.join(root, v[\"image_filepath\"]) for v in data.values()]\n",
    "x = np.stack([np.array(Image.open(image)) for image in tqdm(images)])\n",
    "x = x.reshape(x.shape[0], -1)\n",
    "y = np.array([cls2idx[v[\"anomaly_class\"]] for v in data.values()])\n",
    "y_binary = np.array([0 if v[\"anomaly_class\"] == \"No-Anomaly\" else 1 for v in data.values()])\n",
    "print(x.shape, y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(x, y, test_size=0.1, seed=0):\n",
    "    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed, stratify=y)\n",
    "    clf = RandomForestClassifier(random_state=seed, n_jobs=-1)\n",
    "    clf.fit(X_train, y_train)\n",
    "    y_pred_train = clf.predict(X_train)\n",
    "    y_pred_test = clf.predict(X_test)\n",
    "    train_acc = (y_pred_train == y_train).mean()\n",
    "    test_acc = (y_pred_test == y_test).mean()\n",
    "    return train_acc, test_acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train accuracy: 0.9997222222222222\n",
      "Test accuracy: 0.6725\n",
      "Train accuracy: 0.9998333333333334\n",
      "Test accuracy: 0.657\n",
      "Train accuracy: 0.9997222222222222\n",
      "Test accuracy: 0.6765\n",
      "Train accuracy: 0.9996666666666667\n",
      "Test accuracy: 0.6595\n",
      "Train accuracy: 0.9996111111111111\n",
      "Test accuracy: 0.6775\n",
      "Train accuracy: 0.9997222222222222\n",
      "Test accuracy: 0.6565\n",
      "Train accuracy: 0.9997222222222222\n",
      "Test accuracy: 0.6575\n",
      "Train accuracy: 0.9997222222222222\n",
      "Test accuracy: 0.6675\n",
      "Train accuracy: 0.9996666666666667\n",
      "Test accuracy: 0.668\n",
      "Train accuracy: 0.9997222222222222\n",
      "Test accuracy: 0.6695\n",
      "Train accuracy (averaged across seeds): 0.999711111111111 5.4433105395173477e-05\n",
      "Test accuracy: (averaged across seeds) 0.6662 0.007672027111526655\n"
     ]
    }
   ],
   "source": [
    "train_acc, test_acc = [], []\n",
    "\n",
    "for seed in range(10):\n",
    "    train_acc_, test_acc_ = train(x, y, test_size=0.1, seed=seed)\n",
    "    train_acc.append(train_acc_)\n",
    "    test_acc.append(test_acc_)\n",
    "    print(\"Train accuracy:\", train_acc_)\n",
    "    print(\"Test accuracy:\", test_acc_)\n",
    "\n",
    "print(\"Train accuracy (averaged across seeds):\", np.mean(train_acc), np.std(train_acc)) \n",
    "print(\"Test accuracy: (averaged across seeds)\", np.mean(test_acc), np.std(test_acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.8445\n",
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.8405\n",
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.8495\n",
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.8505\n",
      "Train accuracy: 0.9999444444444444\n",
      "Test accuracy: 0.8315\n",
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.841\n",
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.828\n",
      "Train accuracy: 0.9999444444444444\n",
      "Test accuracy: 0.8385\n",
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.8415\n",
      "Train accuracy: 1.0\n",
      "Test accuracy: 0.84\n",
      "Train accuracy (averaged across seeds): 0.999988888888889 2.2222222222234576e-05\n",
      "Test accuracy: (averaged across seeds) 0.84055 0.006631176366226449\n"
     ]
    }
   ],
   "source": [
    "train_acc, test_acc = [], []\n",
    "\n",
    "for seed in range(10):\n",
    "    train_acc_, test_acc_ = train(x, y_binary, test_size=0.1, seed=seed)\n",
    "    train_acc.append(train_acc_)\n",
    "    test_acc.append(test_acc_)\n",
    "    print(\"Train accuracy:\", train_acc_)\n",
    "    print(\"Test accuracy:\", test_acc_)\n",
    "\n",
    "print(\"Train accuracy (averaged across seeds):\", np.mean(train_acc), np.std(train_acc)) \n",
    "print(\"Test accuracy: (averaged across seeds)\", np.mean(test_acc), np.std(test_acc))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torchenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"!wget https://raw.githubusercontent.com/RaptorMaps/InfraredSolarModules/master/2020-02-14_InfraredSolarModules.zip\n",
	"!unzip 2020-02-14_InfraredSolarModules.zip"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"!pip install numpy pillow tqdm scikit-learn"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%\|██████████\| 20000/20000 [00:03<00:00, 5425.94it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"(20000, 960) (20000,)\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"\n"
	]
	}
	],
	"source": [
	"import os\n",
	"import json\n",
	"import numpy as np\n",
	"from PIL import Image\n",
	"from tqdm import tqdm\n",
	"from sklearn.ensemble import RandomForestClassifier\n",
	"from sklearn.model_selection import train_test_split\n",
	"\n",
	"\n",
	"root = \"InfraredSolarModules\"\n",
	"with open(os.path.join(root, \"module_metadata.json\"), \"r\") as f:\n",
	" data = json.load(f)\n",
	"\n",
	"classes = sorted(list(set([v[\"anomaly_class\"] for v in data.values()])))\n",
	"cls2idx = {cls: i for i, cls in enumerate(classes)}\n",
	"images = [os.path.join(root, v[\"image_filepath\"]) for v in data.values()]\n",
	"x = np.stack([np.array(Image.open(image)) for image in tqdm(images)])\n",
	"x = x.reshape(x.shape[0], -1)\n",
	"y = np.array([cls2idx[v[\"anomaly_class\"]] for v in data.values()])\n",
	"y_binary = np.array([0 if v[\"anomaly_class\"] == \"No-Anomaly\" else 1 for v in data.values()])\n",
	"print(x.shape, y.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"def train(x, y, test_size=0.1, seed=0):\n",
	" X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed, stratify=y)\n",
	" clf = RandomForestClassifier(random_state=seed, n_jobs=-1)\n",
	" clf.fit(X_train, y_train)\n",
	" y_pred_train = clf.predict(X_train)\n",
	" y_pred_test = clf.predict(X_test)\n",
	" train_acc = (y_pred_train == y_train).mean()\n",
	" test_acc = (y_pred_test == y_test).mean()\n",
	" return train_acc, test_acc"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train accuracy: 0.9997222222222222\n",
	"Test accuracy: 0.6725\n",
	"Train accuracy: 0.9998333333333334\n",
	"Test accuracy: 0.657\n",
	"Train accuracy: 0.9997222222222222\n",
	"Test accuracy: 0.6765\n",
	"Train accuracy: 0.9996666666666667\n",
	"Test accuracy: 0.6595\n",
	"Train accuracy: 0.9996111111111111\n",
	"Test accuracy: 0.6775\n",
	"Train accuracy: 0.9997222222222222\n",
	"Test accuracy: 0.6565\n",
	"Train accuracy: 0.9997222222222222\n",
	"Test accuracy: 0.6575\n",
	"Train accuracy: 0.9997222222222222\n",
	"Test accuracy: 0.6675\n",
	"Train accuracy: 0.9996666666666667\n",
	"Test accuracy: 0.668\n",
	"Train accuracy: 0.9997222222222222\n",
	"Test accuracy: 0.6695\n",
	"Train accuracy (averaged across seeds): 0.999711111111111 5.4433105395173477e-05\n",
	"Test accuracy: (averaged across seeds) 0.6662 0.007672027111526655\n"
	]
	}
	],
	"source": [
	"train_acc, test_acc = [], []\n",
	"\n",
	"for seed in range(10):\n",
	" train_acc_, test_acc_ = train(x, y, test_size=0.1, seed=seed)\n",
	" train_acc.append(train_acc_)\n",
	" test_acc.append(test_acc_)\n",
	" print(\"Train accuracy:\", train_acc_)\n",
	" print(\"Test accuracy:\", test_acc_)\n",
	"\n",
	"print(\"Train accuracy (averaged across seeds):\", np.mean(train_acc), np.std(train_acc)) \n",
	"print(\"Test accuracy: (averaged across seeds)\", np.mean(test_acc), np.std(test_acc))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.8445\n",
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.8405\n",
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.8495\n",
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.8505\n",
	"Train accuracy: 0.9999444444444444\n",
	"Test accuracy: 0.8315\n",
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.841\n",
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.828\n",
	"Train accuracy: 0.9999444444444444\n",
	"Test accuracy: 0.8385\n",
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.8415\n",
	"Train accuracy: 1.0\n",
	"Test accuracy: 0.84\n",
	"Train accuracy (averaged across seeds): 0.999988888888889 2.2222222222234576e-05\n",
	"Test accuracy: (averaged across seeds) 0.84055 0.006631176366226449\n"
	]
	}
	],
	"source": [
	"train_acc, test_acc = [], []\n",
	"\n",
	"for seed in range(10):\n",
	" train_acc_, test_acc_ = train(x, y_binary, test_size=0.1, seed=seed)\n",
	" train_acc.append(train_acc_)\n",
	" test_acc.append(test_acc_)\n",
	" print(\"Train accuracy:\", train_acc_)\n",
	" print(\"Test accuracy:\", test_acc_)\n",
	"\n",
	"print(\"Train accuracy (averaged across seeds):\", np.mean(train_acc), np.std(train_acc)) \n",
	"print(\"Test accuracy: (averaged across seeds)\", np.mean(test_acc), np.std(test_acc))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "torchenv",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.14"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}