Created
September 2, 2016 22:00
-
-
Save rahit/5dcbee8f63c0be06eec2b51110d21272 to your computer and use it in GitHub Desktop.
Time Series Prediction Using ARMA and Linear Regression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = "K.M. Tahsin Hassan Rahit" | |
__email__ = "[email protected]" | |
import pandas as pd | |
import statsmodels.api as sm | |
import matplotlib.pylab as plt | |
from matplotlib import * | |
from spectrum import * | |
from pylab import * | |
import spectrum.arma | |
from pylab import plot, log10, hold, legend | |
from statsmodels.tsa.stattools import acf, pacf | |
from statsmodels.tsa.arima_model import ARIMA | |
from sklearn.cross_validation import train_test_split | |
##### | |
# data = pd.read_csv('air_data.csv', | |
# parse_dates=[['Date', 'Time']], index_col='Date_Time') | |
# dta = data.ix[:,['T']] | |
# res = sm.tsa.ARMA(dta, (6, 23), freq='H').fit() | |
# #print res.predict(7000, 10000) | |
# fig, ax = plt.subplots() | |
# ax = dta.ix[6500:].plot(ax=ax) | |
# res.plot_predict(8000, 10000, dynamic=True, ax=ax, | |
# plot_insample=False) | |
# plt.show() | |
##### | |
def check_pq_value(data): | |
data_diff = data - data.shift() | |
lag_acf = acf([float(i) for i in data['L'].tolist()], nlags=10) | |
lag_pacf = pacf([float(i) for i in data['L'].tolist()], nlags=10, method='ols') | |
plt.subplot(121) | |
plt.plot(lag_acf) | |
plt.axhline(y=0,linestyle='--',color='gray') | |
plt.axhline(y=2/np.sqrt(len(data_diff)),linestyle='--',color='gray') | |
plt.axhline(y=4/np.sqrt(len(data_diff)),linestyle='--',color='gray') | |
plt.title('Autocorrelation Function') | |
plt.subplot(122) | |
plt.plot(lag_pacf) | |
plt.axhline(y=0,linestyle='--',color='gray') | |
plt.axhline(y=2/np.sqrt(len(data_diff)),linestyle='--',color='gray') | |
plt.axhline(y=4/np.sqrt(len(data_diff)),linestyle='--',color='gray') | |
plt.title('Partial Autocorrelation Function') | |
plt.tight_layout() | |
plt.show() | |
def do_arma(data, target, freq, train_start, pred_start, pred_end): | |
plt.style.use('ggplot') | |
y = [float(i) for i in data[target].tolist()] | |
nobs = len(y) | |
# dates = sm.tsa.datetools.dates_from_range('1680m1', length=nobs) | |
dates = pd.period_range(train_start, periods=nobs, freq=freq) | |
y = pd.Series(y, index=dates.to_timestamp(freq=freq)) | |
arma_mod = sm.tsa.ARMA(y, order=(2, 2)) | |
arma_res = arma_mod.fit() | |
pred = arma_res.predict(pred_start, pred_end) | |
# pred_idx = pd.period_range('1755-12-31', end='1888-12-31', freq='A') | |
fig, ax = plt.subplots() | |
ax.plot(y[100:300]) | |
ax.plot(pred) | |
plt.show() | |
# data = pd.read_csv('air_data.csv') | |
# do_arma(data, 'T', 'H', '1755-12-31', '1756-01-05', '1756-01-10') | |
data = pd.read_csv('nile-water-level.csv') | |
# check_pq_value(data) | |
do_arma(data, 'L', 'A', '1755-12-31', '2025-12-31', '2150-12-31') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = "K.M. Tahsin Hassan Rahit" | |
__email__ = "[email protected]" | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pylab as plt | |
from matplotlib import * | |
from matplotlib.pylab import rcParams | |
from sklearn.ensemble.forest import RandomForestRegressor | |
from sklearn.cross_validation import train_test_split | |
from sklearn.linear_model import LinearRegression | |
from sklearn.metrics import mean_squared_error | |
plt.style.use('ggplot') | |
data = pd.read_csv('air_data.csv', parse_dates=[['Date', 'Time']], index_col='Date_Time') | |
# print data | |
data = data[data["T"] > 0] | |
data = data.dropna() | |
train, test = train_test_split(data, random_state=1, test_size=.1) | |
print data.corr()["T"] | |
columns = data.columns.tolist() | |
target = 'T' | |
columns = [c for c in columns if c not in ['T', 'Date', 'Time', 'RH', 'AH']] | |
model = LinearRegression() | |
# model = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, random_state=1) | |
#print columns | |
# Fit the model to the training data. | |
model.fit(train[columns], train[target]) | |
predictions = model.predict(test[columns]) | |
print mean_squared_error(predictions, test[target]) | |
test_indecies = test['T'].index.tolist() | |
fig, ax = plt.subplots() | |
ax.scatter(test_indecies, predictions, c='r', marker="s", label='Predicted') | |
ax.scatter(test_indecies, test['T'], c='g', marker="o", label='Original') | |
plt.legend(loc='upper left') | |
plt.grid(True) | |
# ax.plot([test[target].index.min(), test[target].index.max()], [predictions.min(), predictions.max()], 'k--', lw=4) | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment