Soccolo · September 3, 2021 21:24
diff --git a/Rain.py b/Rain.py
 # This Python 3 environment comes with many helpful analytics libraries installed
 # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
 # For example, here's several helpful packages to load

 # This code can be run at https://www.kaggle.com/cristiancalin/using-logistic-regression-to-predict-rainfall

 import numpy as np  #linear algebra
 import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
 import torch
 from tqdm import tqdm
 from pylab import rcParams
 import matplotlib.pyplot as plt
 from matplotlib import rc
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import confusion_matrix, classification_report
 from torch import nn, optim
 import torch.nn.functional as F

 # Input data files are available in the read-only "../input/" directory
 # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

 import os

 # Walk the Kaggle input directory tree and print the full path of every
 # file found, one path per line.
 input_file_paths = (
     os.path.join(folder, file_name)
     for folder, _subfolders, file_names in os.walk('/kaggle/input')
     for file_name in file_names
 )
 for input_file_path in input_file_paths:
     print(input_file_path)

 # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
 # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 # Load the weatherAUS dataset from the Kaggle input directory.
 df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

 # We must decide what data we need in order to deduce whether it will rain tomorrow or not.
 # We only consider rainfall, humidity, pressure, rain today and rain tomorrow.
 # Cloud cover is left out because that data is incomplete, and the remaining columns are already enough to build a model.
 # After dropping the unnecessary columns, we turn the Yes/No flags into numerical values.

 cols = ['Rainfall', 'Humidity3pm', 'Pressure9am', 'RainToday', 'RainTomorrow']
 df = df[cols]

 # Series.map (rather than Series.replace) yields a numeric column on every
 # pandas version: replace() relied on silently downcasting the object-dtype
 # result, which pandas 2.2 deprecated, and an object-dtype target can be
 # rejected by scikit-learn classifiers. Missing values stay NaN, so the
 # dropna() below still removes them.
 # NOTE(review): any value other than 'Yes'/'No' also becomes NaN and is
 # dropped -- confirm these two columns hold nothing else.
 yes_no_to_int = {'Yes': 1, 'No': 0}
 for flag_col in ('RainToday', 'RainTomorrow'):
     df[flag_col] = df[flag_col].map(yes_no_to_int)

 # Discard every row that is missing any of the selected values.
 df = df.dropna()

 # We shall use the first four columns to predict the fifth one: whether it will rain tomorrow.
 # We shall split the data into a training set and a test set using scikit-learn.

 # Features are the four weather columns; the target is the next-day rain
 # flag, kept as a one-column DataFrame.
 feature_cols = ['Rainfall', 'Humidity3pm', 'Pressure9am', 'RainToday']
 target_cols = ['RainTomorrow']
 X = df[feature_cols]
 y = df[target_cols]

 # Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.2,
     random_state=21,
 )

 # Since there are only two possible outcomes (it will rain or it will not), we shall use logistic regression to predict the weather.

 # LogisticRegression is not provided by the import block at the top of the
 # file, so it is imported here; without it the next statement raises NameError.
 from sklearn.linear_model import LogisticRegression

 # Fit a logistic-regression classifier on the training split.
 # fit() expects a 1-D target, while y_train is a one-column DataFrame, so it
 # is flattened to avoid scikit-learn's DataConversionWarning.
 logreg = LogisticRegression()
 logreg.fit(X_train, np.ravel(y_train))

 # Predict the held-out rows, then print the per-class metrics and the raw
 # confusion matrix.
 y_pred = logreg.predict(X_test)
 print(classification_report(y_test, y_pred))
 print(confusion_matrix(y_test, y_pred))

 # We observe that our model produces a very large number of false positives (even more than the number of true negatives!).
 # This is mainly because there are very few days on which it rains.
 # NOTE(review): scikit-learn's confusion_matrix is laid out as [[TN, FP], [FN, TP]] -- confirm the cells were read in that order.
	# This Python 3 environment comes with many helpful analytics libraries installed
	# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
	# For example, here's several helpful packages to load

	# This code can be run at https://www.kaggle.com/cristiancalin/using-logistic-regression-to-predict-rainfall

	import numpy as np #linear algebra
	import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
	import torch
	from tqdm import tqdm
	from pylab import rcParams
	import matplotlib.pyplot as plt
	from matplotlib import rc
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import confusion_matrix, classification_report
	from torch import nn, optim
	import torch.nn.functional as F

	# Input data files are available in the read-only "../input/" directory
	# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

	import os

	# Walk the Kaggle input directory tree and print the full path of every
	# file found, one path per line. The inner loop and the print call must be
	# nested inside the walk loop; written flat, the loops have no body.
	for dirname, _, filenames in os.walk('/kaggle/input'):
		for filename in filenames:
			print(os.path.join(dirname, filename))

	# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
	# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

	# Load the weatherAUS dataset from the Kaggle input directory.
	df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

	# We must decide what data we need in order to deduce whether it will rain tomorrow or not.
	# We only consider rainfall, humidity, pressure, rain today and rain tomorrow.
	# Cloud cover is left out because that data is incomplete, and the remaining columns are already enough to build a model.
	# After dropping the unnecessary columns, we turn the Yes/No flags into numerical values.

	cols = ['Rainfall', 'Humidity3pm', 'Pressure9am', 'RainToday', 'RainTomorrow']
	df = df[cols]

	# Series.map (rather than Series.replace) yields a numeric column on every
	# pandas version: replace() relied on silently downcasting the object-dtype
	# result, which pandas 2.2 deprecated, and an object-dtype target can be
	# rejected by scikit-learn classifiers. Missing values stay NaN, so the
	# dropna() below still removes them.
	# NOTE(review): any value other than 'Yes'/'No' also becomes NaN and is
	# dropped -- confirm these two columns hold nothing else.
	yes_no_to_int = {'Yes': 1, 'No': 0}
	for flag_col in ('RainToday', 'RainTomorrow'):
		df[flag_col] = df[flag_col].map(yes_no_to_int)

	# Discard every row that is missing any of the selected values.
	df = df.dropna()

	# We shall use the first four columns to predict the fifth one: whether it will rain tomorrow.
	# We shall split the data into a training set and a test set using scikit-learn.

	# Features are the four weather columns; the target is the next-day rain
	# flag, kept as a one-column DataFrame.
	feature_cols = ['Rainfall', 'Humidity3pm', 'Pressure9am', 'RainToday']
	target_cols = ['RainTomorrow']
	X = df[feature_cols]
	y = df[target_cols]

	# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
	X_train, X_test, y_train, y_test = train_test_split(
		X,
		y,
		test_size=0.2,
		random_state=21,
	)

	# Since there are only two possible outcomes (it will rain or it will not), we shall use logistic regression to predict the weather.

	# LogisticRegression is not provided by the import block at the top of the
	# file, so it is imported here; without it the next statement raises NameError.
	from sklearn.linear_model import LogisticRegression

	# Fit a logistic-regression classifier on the training split.
	# fit() expects a 1-D target, while y_train is a one-column DataFrame, so it
	# is flattened to avoid scikit-learn's DataConversionWarning.
	logreg = LogisticRegression()
	logreg.fit(X_train, np.ravel(y_train))

	# Predict the held-out rows, then print the per-class metrics and the raw
	# confusion matrix.
	y_pred = logreg.predict(X_test)
	print(classification_report(y_test, y_pred))
	print(confusion_matrix(y_test, y_pred))

	# We observe that our model produces a very large number of false positives (even more than the number of true negatives!).
	# This is mainly because there are very few days on which it rains.
	# NOTE(review): scikit-learn's confusion_matrix is laid out as [[TN, FP], [FN, TP]] -- confirm the cells were read in that order.