Skip to content

Instantly share code, notes, and snippets.

@Raahul-Singh
Created June 28, 2020 20:49
Show Gist options
  • Save Raahul-Singh/2e99856951b8b648508896a9cca067bb to your computer and use it in GitHub Desktop.
Save Raahul-Singh/2e99856951b8b648508896a9cca067bb to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Correlation between Flare Production and AR Complexity"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The purpose of this notebook is to find the correlation, if any exists, between Flare Production and Active Region (AR) Complexity."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.stats import pointbiserialr, shapiro\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading the All Clear dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the two ';'-delimited All Clear tables and index each by its id column.\n",
"properties = pd.read_csv(\"data/all_clear/lookup_properties.csv\", delimiter=';').set_index('#id')\n",
"rankings = (\n",
"    pd.read_csv(\"data/all_clear/rankings.csv\", delimiter=';')\n",
"    .set_index('image_id')\n",
"    .sort_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>zooniverse_id</th>\n",
" <th>angle</th>\n",
" <th>area</th>\n",
" <th>areafrac</th>\n",
" <th>areathesh</th>\n",
" <th>bipolesep</th>\n",
" <th>c1flr24hr</th>\n",
" <th>id_filename</th>\n",
" <th>flux</th>\n",
" <th>...</th>\n",
" <th>hcpos_x</th>\n",
" <th>hcpos_y</th>\n",
" <th>m1flr12hr</th>\n",
" <th>m5flr12hr</th>\n",
" <th>n_nar</th>\n",
" <th>noaa</th>\n",
" <th>pxpos_x</th>\n",
" <th>pxpos_y</th>\n",
" <th>sszn</th>\n",
" <th>zurich</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>530be1183ae74079c300000d.jpg</td>\n",
" <td>ASZ000090y</td>\n",
" <td>37.8021</td>\n",
" <td>34400.0</td>\n",
" <td>0.12</td>\n",
" <td>2890.0</td>\n",
" <td>3.72</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2.180000e+22</td>\n",
" <td>...</td>\n",
" <td>452.26991</td>\n",
" <td>443.92976</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8809</td>\n",
" <td>229.19344</td>\n",
" <td>166.87700</td>\n",
" <td>1</td>\n",
" <td>bxo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>530be1183ae74079c300000f.jpg</td>\n",
" <td>ASZ000090o</td>\n",
" <td>37.3590</td>\n",
" <td>78700.0</td>\n",
" <td>-0.00</td>\n",
" <td>6170.0</td>\n",
" <td>7.28</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>5.760000e+22</td>\n",
" <td>...</td>\n",
" <td>149.64301</td>\n",
" <td>621.53865</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8810</td>\n",
" <td>200.41511</td>\n",
" <td>154.54088</td>\n",
" <td>2</td>\n",
" <td>fao</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>530be1183ae74079c3000011.jpg</td>\n",
" <td>ASZ0000946</td>\n",
" <td>58.6197</td>\n",
" <td>37900.0</td>\n",
" <td>0.08</td>\n",
" <td>937.0</td>\n",
" <td>3.88</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>2.150000e+22</td>\n",
" <td>...</td>\n",
" <td>704.04967</td>\n",
" <td>-436.33152</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8812</td>\n",
" <td>205.30165</td>\n",
" <td>154.98689</td>\n",
" <td>3</td>\n",
" <td>axx</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>530be1183ae74079c3000013.jpg</td>\n",
" <td>ASZ000090v</td>\n",
" <td>32.3099</td>\n",
" <td>31200.0</td>\n",
" <td>0.12</td>\n",
" <td>1720.0</td>\n",
" <td>4.90</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>1.660000e+22</td>\n",
" <td>...</td>\n",
" <td>-449.47446</td>\n",
" <td>-234.01929</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8813</td>\n",
" <td>207.95782</td>\n",
" <td>169.12196</td>\n",
" <td>4</td>\n",
" <td>dro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>530be1183ae74079c3000015.jpg</td>\n",
" <td>ASZ000090x</td>\n",
" <td>49.9221</td>\n",
" <td>88400.0</td>\n",
" <td>0.05</td>\n",
" <td>6480.0</td>\n",
" <td>12.48</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>6.130000e+22</td>\n",
" <td>...</td>\n",
" <td>-735.40990</td>\n",
" <td>208.46232</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>8814</td>\n",
" <td>183.10649</td>\n",
" <td>165.00398</td>\n",
" <td>5</td>\n",
" <td>hax</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>530be1183ae74079c3000017.jpg</td>\n",
" <td>ASZ000090q</td>\n",
" <td>41.7276</td>\n",
" <td>66500.0</td>\n",
" <td>-0.04</td>\n",
" <td>5450.0</td>\n",
" <td>7.45</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>4.510000e+22</td>\n",
" <td>...</td>\n",
" <td>307.16437</td>\n",
" <td>621.26440</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8810</td>\n",
" <td>200.89942</td>\n",
" <td>158.90942</td>\n",
" <td>6</td>\n",
" <td>fao</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>530be1183ae74079c3000019.jpg</td>\n",
" <td>ASZ000090w</td>\n",
" <td>21.4421</td>\n",
" <td>31300.0</td>\n",
" <td>0.05</td>\n",
" <td>1930.0</td>\n",
" <td>5.32</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>1.580000e+22</td>\n",
" <td>...</td>\n",
" <td>-232.85591</td>\n",
" <td>-226.07368</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8813</td>\n",
" <td>202.77628</td>\n",
" <td>166.14022</td>\n",
" <td>7</td>\n",
" <td>dso</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>530be1183ae74079c300001b.jpg</td>\n",
" <td>ASZ000090p</td>\n",
" <td>30.3312</td>\n",
" <td>49700.0</td>\n",
" <td>0.28</td>\n",
" <td>2670.0</td>\n",
" <td>1.86</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>2.430000e+22</td>\n",
" <td>...</td>\n",
" <td>-464.25581</td>\n",
" <td>230.57762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8814</td>\n",
" <td>191.95662</td>\n",
" <td>171.67525</td>\n",
" <td>8</td>\n",
" <td>hhx</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>530be1183ae74079c300001d.jpg</td>\n",
" <td>ASZ00008tg</td>\n",
" <td>42.8451</td>\n",
" <td>33600.0</td>\n",
" <td>-0.20</td>\n",
" <td>3850.0</td>\n",
" <td>6.03</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>2.840000e+22</td>\n",
" <td>...</td>\n",
" <td>-649.56237</td>\n",
" <td>212.13884</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8815</td>\n",
" <td>197.90666</td>\n",
" <td>174.86663</td>\n",
" <td>9</td>\n",
" <td>cro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>530be1183ae74079c300001f.jpg</td>\n",
" <td>ASZ00008sz</td>\n",
" <td>48.1351</td>\n",
" <td>51300.0</td>\n",
" <td>0.06</td>\n",
" <td>4750.0</td>\n",
" <td>7.52</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>4.060000e+22</td>\n",
" <td>...</td>\n",
" <td>452.20395</td>\n",
" <td>616.94779</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8810</td>\n",
" <td>203.09814</td>\n",
" <td>158.22423</td>\n",
" <td>10</td>\n",
" <td>eao</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" filename zooniverse_id angle area areafrac \\\n",
"#id \n",
"1 530be1183ae74079c300000d.jpg ASZ000090y 37.8021 34400.0 0.12 \n",
"2 530be1183ae74079c300000f.jpg ASZ000090o 37.3590 78700.0 -0.00 \n",
"3 530be1183ae74079c3000011.jpg ASZ0000946 58.6197 37900.0 0.08 \n",
"4 530be1183ae74079c3000013.jpg ASZ000090v 32.3099 31200.0 0.12 \n",
"5 530be1183ae74079c3000015.jpg ASZ000090x 49.9221 88400.0 0.05 \n",
"6 530be1183ae74079c3000017.jpg ASZ000090q 41.7276 66500.0 -0.04 \n",
"7 530be1183ae74079c3000019.jpg ASZ000090w 21.4421 31300.0 0.05 \n",
"8 530be1183ae74079c300001b.jpg ASZ000090p 30.3312 49700.0 0.28 \n",
"9 530be1183ae74079c300001d.jpg ASZ00008tg 42.8451 33600.0 -0.20 \n",
"10 530be1183ae74079c300001f.jpg ASZ00008sz 48.1351 51300.0 0.06 \n",
"\n",
" areathesh bipolesep c1flr24hr id_filename flux ... \\\n",
"#id ... \n",
"1 2890.0 3.72 0 1 2.180000e+22 ... \n",
"2 6170.0 7.28 0 2 5.760000e+22 ... \n",
"3 937.0 3.88 0 3 2.150000e+22 ... \n",
"4 1720.0 4.90 0 4 1.660000e+22 ... \n",
"5 6480.0 12.48 0 5 6.130000e+22 ... \n",
"6 5450.0 7.45 0 6 4.510000e+22 ... \n",
"7 1930.0 5.32 1 7 1.580000e+22 ... \n",
"8 2670.0 1.86 0 8 2.430000e+22 ... \n",
"9 3850.0 6.03 1 9 2.840000e+22 ... \n",
"10 4750.0 7.52 1 10 4.060000e+22 ... \n",
"\n",
" hcpos_x hcpos_y m1flr12hr m5flr12hr n_nar noaa pxpos_x \\\n",
"#id \n",
"1 452.26991 443.92976 0 0 1 8809 229.19344 \n",
"2 149.64301 621.53865 0 0 1 8810 200.41511 \n",
"3 704.04967 -436.33152 0 0 1 8812 205.30165 \n",
"4 -449.47446 -234.01929 0 0 1 8813 207.95782 \n",
"5 -735.40990 208.46232 0 0 2 8814 183.10649 \n",
"6 307.16437 621.26440 0 0 1 8810 200.89942 \n",
"7 -232.85591 -226.07368 0 0 1 8813 202.77628 \n",
"8 -464.25581 230.57762 0 0 1 8814 191.95662 \n",
"9 -649.56237 212.13884 0 0 1 8815 197.90666 \n",
"10 452.20395 616.94779 0 0 1 8810 203.09814 \n",
"\n",
" pxpos_y sszn zurich \n",
"#id \n",
"1 166.87700 1 bxo \n",
"2 154.54088 2 fao \n",
"3 154.98689 3 axx \n",
"4 169.12196 4 dro \n",
"5 165.00398 5 hax \n",
"6 158.90942 6 fao \n",
"7 166.14022 7 dso \n",
"8 171.67525 8 hhx \n",
"9 174.86663 9 cro \n",
"10 158.22423 10 eao \n",
"\n",
"[10 rows x 22 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"properties.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#id</th>\n",
" <th>count</th>\n",
" <th>k_value</th>\n",
" <th>score</th>\n",
" <th>std_dev</th>\n",
" </tr>\n",
" <tr>\n",
" <th>image_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2779</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1126.778324</td>\n",
" <td>1.707604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>10010</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1312.434736</td>\n",
" <td>2.397493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>962.280235</td>\n",
" <td>1.945574</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5718</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1199.623395</td>\n",
" <td>1.894883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>8599</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1270.270911</td>\n",
" <td>1.207558</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>9385</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1293.377418</td>\n",
" <td>1.454489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7288</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1236.575624</td>\n",
" <td>2.318417</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>6152</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1209.593634</td>\n",
" <td>1.790289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>9607</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1299.871029</td>\n",
" <td>3.015982</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>9828</td>\n",
" <td>50</td>\n",
" <td>8</td>\n",
" <td>1306.593595</td>\n",
" <td>1.143226</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #id count k_value score std_dev\n",
"image_id \n",
"1 2779 50 8 1126.778324 1.707604\n",
"2 10010 50 8 1312.434736 2.397493\n",
"3 10 50 8 962.280235 1.945574\n",
"4 5718 50 8 1199.623395 1.894883\n",
"5 8599 50 8 1270.270911 1.207558\n",
"6 9385 50 8 1293.377418 1.454489\n",
"7 7288 50 8 1236.575624 2.318417\n",
"8 6152 50 8 1209.593634 1.790289\n",
"9 9607 50 8 1299.871029 3.015982\n",
"10 9828 50 8 1306.593595 1.143226"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rankings.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Preparation\n",
"Making a new dataframe that holds information for any flare being produced and the associated AR complexity."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the flare-indicator columns and the NOAA number, then attach the\n",
"# ranking score as `complexity`. The new column is aligned on the index, i.e.\n",
"# properties' `#id` against rankings' `image_id`.\n",
"# NOTE(review): assumes `image_id` identifies the same images as `#id` - confirm.\n",
"observed_columns = ['c1flr24hr', 'm1flr12hr', 'm5flr12hr', 'noaa']\n",
"flares_complexities = properties[observed_columns].assign(complexity=rankings['score'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>c1flr24hr</th>\n",
" <th>m1flr12hr</th>\n",
" <th>m5flr12hr</th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8809</td>\n",
" <td>1126.778324</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8810</td>\n",
" <td>1312.434736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8812</td>\n",
" <td>962.280235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8813</td>\n",
" <td>1199.623395</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8814</td>\n",
" <td>1270.270911</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8810</td>\n",
" <td>1293.377418</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8813</td>\n",
" <td>1236.575624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8814</td>\n",
" <td>1209.593634</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8815</td>\n",
" <td>1299.871029</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8810</td>\n",
" <td>1306.593595</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" c1flr24hr m1flr12hr m5flr12hr noaa complexity\n",
"#id \n",
"1 0 0 0 8809 1126.778324\n",
"2 0 0 0 8810 1312.434736\n",
"3 0 0 0 8812 962.280235\n",
"4 0 0 0 8813 1199.623395\n",
"5 0 0 0 8814 1270.270911\n",
"6 0 0 0 8810 1293.377418\n",
"7 1 0 0 8813 1236.575624\n",
"8 0 0 0 8814 1209.593634\n",
"9 1 0 0 8815 1299.871029\n",
"10 1 0 0 8810 1306.593595"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flares_complexities.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def combine_flaring_columns(ar):\n",
"    \"\"\"Merge the three flare-indicator columns of `ar` into one `flares` column.\n",
"\n",
"    `flares` is the element-wise bitwise OR of `m5flr12hr`, `m1flr12hr` and\n",
"    `c1flr24hr`. The frame is modified in place: the three source columns are\n",
"    dropped and nothing is returned.\n",
"    \"\"\"\n",
"    flare_columns = ['m5flr12hr', 'm1flr12hr', 'c1flr24hr']\n",
"    any_flare = ar[flare_columns[0]]\n",
"    for column in flare_columns[1:]:\n",
"        any_flare = any_flare | ar[column]\n",
"    ar['flares'] = any_flare\n",
"    ar.drop(columns=flare_columns, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"combine_flaring_columns(flares_complexities)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" <th>flares</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8809</td>\n",
" <td>1126.778324</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8810</td>\n",
" <td>1312.434736</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8812</td>\n",
" <td>962.280235</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8813</td>\n",
" <td>1199.623395</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>8814</td>\n",
" <td>1270.270911</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8810</td>\n",
" <td>1293.377418</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8813</td>\n",
" <td>1236.575624</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8814</td>\n",
" <td>1209.593634</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>8815</td>\n",
" <td>1299.871029</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>8810</td>\n",
" <td>1306.593595</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noaa complexity flares\n",
"#id \n",
"1 8809 1126.778324 0\n",
"2 8810 1312.434736 0\n",
"3 8812 962.280235 0\n",
"4 8813 1199.623395 0\n",
"5 8814 1270.270911 0\n",
"6 8810 1293.377418 0\n",
"7 8813 1236.575624 1\n",
"8 8814 1209.593634 0\n",
"9 8815 1299.871029 1\n",
"10 8810 1306.593595 1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flares_complexities.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Separating the positive and negative classes\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Split into the flaring (1) and non-flaring (0) classes. `.copy()` makes each\n",
"# subset an independent frame, so adding columns to it later does not raise\n",
"# SettingWithCopyWarning.\n",
"does_flare = flares_complexities[flares_complexities.flares == 1].copy()\n",
"does_not_flare = flares_complexities[flares_complexities.flares == 0].copy()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" <th>flares</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8813</td>\n",
" <td>1236.575624</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>8815</td>\n",
" <td>1299.871029</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>8810</td>\n",
" <td>1306.593595</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>8810</td>\n",
" <td>1300.302886</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>8816</td>\n",
" <td>1180.945863</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>8814</td>\n",
" <td>1369.772259</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>8829</td>\n",
" <td>1099.494215</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>8824</td>\n",
" <td>1349.701272</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>8829</td>\n",
" <td>1353.643280</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>8824</td>\n",
" <td>1414.873927</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noaa complexity flares\n",
"#id \n",
"7 8813 1236.575624 1\n",
"9 8815 1299.871029 1\n",
"10 8810 1306.593595 1\n",
"14 8810 1300.302886 1\n",
"23 8816 1180.945863 1\n",
"32 8814 1369.772259 1\n",
"40 8829 1099.494215 1\n",
"41 8824 1349.701272 1\n",
"48 8829 1353.643280 1\n",
"51 8824 1414.873927 1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_flare.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" <th>flares</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8809</td>\n",
" <td>1126.778324</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8810</td>\n",
" <td>1312.434736</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8812</td>\n",
" <td>962.280235</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8813</td>\n",
" <td>1199.623395</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>8814</td>\n",
" <td>1270.270911</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8810</td>\n",
" <td>1293.377418</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8814</td>\n",
" <td>1209.593634</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>8813</td>\n",
" <td>1243.681834</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>8814</td>\n",
" <td>1191.450319</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>8815</td>\n",
" <td>1218.807609</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noaa complexity flares\n",
"#id \n",
"1 8809 1126.778324 0\n",
"2 8810 1312.434736 0\n",
"3 8812 962.280235 0\n",
"4 8813 1199.623395 0\n",
"5 8814 1270.270911 0\n",
"6 8810 1293.377418 0\n",
"8 8814 1209.593634 0\n",
"11 8813 1243.681834 0\n",
"12 8814 1191.450319 0\n",
"13 8815 1218.807609 0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_not_flare.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Point-Biserial Correlation\n",
"A point-biserial correlation is used to measure the strength and direction of the association that exists between one continuous variable and one dichotomous variable. It is a special case of the Pearson’s product-moment correlation, which is applied when you have two continuous variables, whereas in this case one of the variables is measured on a dichotomous scale.\n",
"\n",
"## Assumptions for using Point-Biserial Correlation\n",
"\n",
"* **Assumption 1**: One of the two variables should be measured on a continuous scale. In this analysis, `complexity` is continuous.\n",
"\n",
"* **Assumption 2**: The other variable should be dichotomous. In this analysis, whether an AR flares (the `flares` column) is dichotomous, with **_0_** denoting no flaring and **_1_** denoting flaring.\n",
"\n",
"* **Assumption 3**: The continuous variable should have equal variances for each category of the dichotomous variable.\n",
"\n",
"* **Assumption 4**: There should be no outliers for the continuous variable for each category of the dichotomous variable.\n",
"\n",
"* **Assumption 5**: The continuous variable should be approximately normally distributed for each category of the dichotomous variable.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### For Assumption 3"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8871.866873500114"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_flare.complexity.var()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10177.684638071736"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_not_flare.complexity.var()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### This violates assumption 3.\n",
"To fix it, we normalize the complexities in each class."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/apollo/anaconda3/envs/Andromeda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"/home/apollo/anaconda3/envs/Andromeda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \n"
]
}
],
"source": [
"# Standardize `complexity` within each class (zero mean, unit sample std).\n",
"# `.assign` returns a new frame instead of setting a column in place, which\n",
"# pandas may flag with SettingWithCopyWarning when the frame is a filtered slice.\n",
"# NOTE(review): z-scoring each class separately also sets both class means to 0,\n",
"# so `normalized_complexity` no longer carries any between-class difference -\n",
"# confirm this is intended before correlating it with `flares`.\n",
"does_flare = does_flare.assign(\n",
"    normalized_complexity=lambda df: (df.complexity - df.complexity.mean()) / df.complexity.std()\n",
")\n",
"does_not_flare = does_not_flare.assign(\n",
"    normalized_complexity=lambda df: (df.complexity - df.complexity.mean()) / df.complexity.std()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" <th>flares</th>\n",
" <th>normalized_complexity</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8813</td>\n",
" <td>1236.575624</td>\n",
" <td>1</td>\n",
" <td>-0.927213</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>8815</td>\n",
" <td>1299.871029</td>\n",
" <td>1</td>\n",
" <td>-0.255221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>8810</td>\n",
" <td>1306.593595</td>\n",
" <td>1</td>\n",
" <td>-0.183849</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>8810</td>\n",
" <td>1300.302886</td>\n",
" <td>1</td>\n",
" <td>-0.250636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>8816</td>\n",
" <td>1180.945863</td>\n",
" <td>1</td>\n",
" <td>-1.517822</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>8814</td>\n",
" <td>1369.772259</td>\n",
" <td>1</td>\n",
" <td>0.486905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>8829</td>\n",
" <td>1099.494215</td>\n",
" <td>1</td>\n",
" <td>-2.382575</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>8824</td>\n",
" <td>1349.701272</td>\n",
" <td>1</td>\n",
" <td>0.273816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>8829</td>\n",
" <td>1353.643280</td>\n",
" <td>1</td>\n",
" <td>0.315667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>8824</td>\n",
" <td>1414.873927</td>\n",
" <td>1</td>\n",
" <td>0.965739</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noaa complexity flares normalized_complexity\n",
"#id \n",
"7 8813 1236.575624 1 -0.927213\n",
"9 8815 1299.871029 1 -0.255221\n",
"10 8810 1306.593595 1 -0.183849\n",
"14 8810 1300.302886 1 -0.250636\n",
"23 8816 1180.945863 1 -1.517822\n",
"32 8814 1369.772259 1 0.486905\n",
"40 8829 1099.494215 1 -2.382575\n",
"41 8824 1349.701272 1 0.273816\n",
"48 8829 1353.643280 1 0.315667\n",
"51 8824 1414.873927 1 0.965739"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_flare.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" <th>flares</th>\n",
" <th>normalized_complexity</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8809</td>\n",
" <td>1126.778324</td>\n",
" <td>0</td>\n",
" <td>-0.680140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8810</td>\n",
" <td>1312.434736</td>\n",
" <td>0</td>\n",
" <td>1.160147</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8812</td>\n",
" <td>962.280235</td>\n",
" <td>0</td>\n",
" <td>-2.310698</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8813</td>\n",
" <td>1199.623395</td>\n",
" <td>0</td>\n",
" <td>0.041924</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>8814</td>\n",
" <td>1270.270911</td>\n",
" <td>0</td>\n",
" <td>0.742205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8810</td>\n",
" <td>1293.377418</td>\n",
" <td>0</td>\n",
" <td>0.971244</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8814</td>\n",
" <td>1209.593634</td>\n",
" <td>0</td>\n",
" <td>0.140752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>8813</td>\n",
" <td>1243.681834</td>\n",
" <td>0</td>\n",
" <td>0.478646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>8814</td>\n",
" <td>1191.450319</td>\n",
" <td>0</td>\n",
" <td>-0.039090</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>8815</td>\n",
" <td>1218.807609</td>\n",
" <td>0</td>\n",
" <td>0.232084</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noaa complexity flares normalized_complexity\n",
"#id \n",
"1 8809 1126.778324 0 -0.680140\n",
"2 8810 1312.434736 0 1.160147\n",
"3 8812 962.280235 0 -2.310698\n",
"4 8813 1199.623395 0 0.041924\n",
"5 8814 1270.270911 0 0.742205\n",
"6 8810 1293.377418 0 0.971244\n",
"8 8814 1209.593634 0 0.140752\n",
"11 8813 1243.681834 0 0.478646\n",
"12 8814 1191.450319 0 -0.039090\n",
"13 8815 1218.807609 0 0.232084"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_not_flare.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_not_flare.normalized_complexity.var()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9999999999999967"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"does_flare.normalized_complexity.var()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### This assumption 3 holds.\n",
"Continous variable in both positive and negative has equal variance."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"****"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Combining the positive and negative classes"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"all_flares = pd.concat([does_flare, does_not_flare])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" <th>flares</th>\n",
" <th>normalized_complexity</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7326</th>\n",
" <td>9965</td>\n",
" <td>1233.908404</td>\n",
" <td>0</td>\n",
" <td>0.381768</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6722</th>\n",
" <td>9868</td>\n",
" <td>1242.323301</td>\n",
" <td>0</td>\n",
" <td>0.465179</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12464</th>\n",
" <td>10773</td>\n",
" <td>1281.222179</td>\n",
" <td>0</td>\n",
" <td>0.850758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6002</th>\n",
" <td>9744</td>\n",
" <td>1121.378700</td>\n",
" <td>0</td>\n",
" <td>-0.733663</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6400</th>\n",
" <td>9799</td>\n",
" <td>1279.962526</td>\n",
" <td>0</td>\n",
" <td>0.838272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11038</th>\n",
" <td>10554</td>\n",
" <td>1363.359946</td>\n",
" <td>0</td>\n",
" <td>1.664934</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1576</th>\n",
" <td>9054</td>\n",
" <td>1279.053561</td>\n",
" <td>0</td>\n",
" <td>0.829262</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10426</th>\n",
" <td>10454</td>\n",
" <td>1137.855032</td>\n",
" <td>0</td>\n",
" <td>-0.570344</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11102</th>\n",
" <td>10564</td>\n",
" <td>1333.140412</td>\n",
" <td>1</td>\n",
" <td>0.097993</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12626</th>\n",
" <td>10794</td>\n",
" <td>1171.999114</td>\n",
" <td>1</td>\n",
" <td>-1.612808</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noaa complexity flares normalized_complexity\n",
"#id \n",
"7326 9965 1233.908404 0 0.381768\n",
"6722 9868 1242.323301 0 0.465179\n",
"12464 10773 1281.222179 0 0.850758\n",
"6002 9744 1121.378700 0 -0.733663\n",
"6400 9799 1279.962526 0 0.838272\n",
"11038 10554 1363.359946 0 1.664934\n",
"1576 9054 1279.053561 0 0.829262\n",
"10426 10454 1137.855032 0 -0.570344\n",
"11102 10564 1333.140412 1 0.097993\n",
"12626 10794 1171.999114 1 -1.612808"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_flares.sample(frac=1).head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### For Assumption 4"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1200x1200 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(12,12), dpi=100)\n",
"sns.boxplot(y=all_flares.normalized_complexity, x=all_flares.flares)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1200x1200 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(12,12), dpi=100)\n",
"sns.violinplot(y=all_flares.normalized_complexity, x=all_flares.flares)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### There are some outliers for the positive class (Need help to deal with them)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### For Assumption 5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Plotting the distribution of positive and negative classes"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1200x1200 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(12,12), dpi=100)\n",
"sns.distplot(does_flare.normalized_complexity, color='green')\n",
"sns.distplot(does_not_flare.normalized_complexity, color='red')\n",
"plt.legend(labels=['Flare occurs', 'Flare does not occur'])\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### To test if distrbution can be assumed gaussian"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shapiro-Wilks test for normality\n",
"The Shapiro-Wilks test for normality is used to detect all departures from normality.\n",
"\n",
"The test rejects the hypothesis of normality when the p-value is less than or equal to 0.05. \n",
"\n",
"Passing the normality test only indicates that no significant departure from normality was found."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The p value for this test is 1.3814005237797574e-26\n"
]
}
],
"source": [
"statistic, p_value = shapiro(does_flare.complexity)\n",
"print(f\"The p value for this test is {p_value}\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The p value for this test is 8.128411264416912e-19\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/apollo/anaconda3/envs/Andromeda/lib/python3.6/site-packages/scipy/stats/morestats.py:1676: UserWarning: p-value may not be accurate for N > 5000.\n",
" warnings.warn(\"p-value may not be accurate for N > 5000.\")\n"
]
}
],
"source": [
"statistic, p_value = shapiro(does_not_flare.complexity)\n",
"print(f\"The p value for this test is {p_value}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Both distributions fail the Shapiro test. Need help to fix this."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Point-Biserial Correlation\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noaa</th>\n",
" <th>complexity</th>\n",
" <th>flares</th>\n",
" <th>normalized_complexity</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7728</th>\n",
" <td>10028</td>\n",
" <td>1119.386192</td>\n",
" <td>0</td>\n",
" <td>-0.753413</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7182</th>\n",
" <td>9933</td>\n",
" <td>1120.718174</td>\n",
" <td>0</td>\n",
" <td>-0.740210</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12101</th>\n",
" <td>10723</td>\n",
" <td>1139.305594</td>\n",
" <td>0</td>\n",
" <td>-0.555966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6023</th>\n",
" <td>9745</td>\n",
" <td>1151.497316</td>\n",
" <td>0</td>\n",
" <td>-0.435117</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10321</th>\n",
" <td>10432</td>\n",
" <td>1078.626608</td>\n",
" <td>0</td>\n",
" <td>-1.157435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>829</th>\n",
" <td>8943</td>\n",
" <td>1234.441695</td>\n",
" <td>0</td>\n",
" <td>0.387054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6193</th>\n",
" <td>9765</td>\n",
" <td>1258.035947</td>\n",
" <td>0</td>\n",
" <td>0.620928</td>\n",
" </tr>\n",
" <tr>\n",
" <th>302</th>\n",
" <td>8851</td>\n",
" <td>1112.909907</td>\n",
" <td>0</td>\n",
" <td>-0.817608</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10300</th>\n",
" <td>10429</td>\n",
" <td>1174.858872</td>\n",
" <td>0</td>\n",
" <td>-0.203550</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8266</th>\n",
" <td>10110</td>\n",
" <td>1234.766115</td>\n",
" <td>0</td>\n",
" <td>0.390270</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noaa complexity flares normalized_complexity\n",
"#id \n",
"7728 10028 1119.386192 0 -0.753413\n",
"7182 9933 1120.718174 0 -0.740210\n",
"12101 10723 1139.305594 0 -0.555966\n",
"6023 9745 1151.497316 0 -0.435117\n",
"10321 10432 1078.626608 0 -1.157435\n",
"829 8943 1234.441695 0 0.387054\n",
"6193 9765 1258.035947 0 0.620928\n",
"302 8851 1112.909907 0 -0.817608\n",
"10300 10429 1174.858872 0 -0.203550\n",
"8266 10110 1234.766115 0 0.390270"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_flares.sample(frac=1).head(10)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PointbiserialrResult(correlation=0.4596014988642637, pvalue=0.0)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pointbiserialr(all_flares.flares, all_flares.complexity)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### The Point-Biserial Correlation shows a moderate positive correlation between AR complexity and Flare Production."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment