Skip to content

Instantly share code, notes, and snippets.

View Akramz's full-sized avatar
🎯
Focusing

Akram Zaytar Akramz

🎯
Focusing
View GitHub Profile
@Akramz
Akramz / rain-vs-nonrain.py
Last active October 25, 2015 00:55
Statistical analysis of the null hypothesis (hourly entries with rain vs. without rain)
import numpy as np
from ggplot import *
import scipy
import scipy.stats
import pandas
# Load the data set and split the hourly-entries column into the rows
# recorded with rain (rain == 1) and without rain (rain == 0).
df = pandas.read_csv('new.csv')
is_rainy = df['rain'] == 1
is_dry = df['rain'] == 0
entries_rain = df['ENTRIESn_hourly'][is_rainy]
entries_no_rain = df['ENTRIESn_hourly'][is_dry]

# Mean hourly entries of each group.
with_rain_mean = np.mean(entries_rain)
without_rain_mean = np.mean(entries_no_rain)

# Mann-Whitney U test comparing the two groups.
# NOTE(review): the default `alternative` of mannwhitneyu differs between
# SciPy releases -- confirm whether `p` is one- or two-sided here.
U, p = scipy.stats.mannwhitneyu(entries_rain, entries_no_rain)
@Akramz
Akramz / distro_visualisation.py
Created October 23, 2015 15:33
To check the distributions of different variables
import numpy as np
from ggplot import *
import scipy
import scipy.stats
import pandas
from data_model import turnstileData
# Wrap the CSV in the project's turnstileData helper, then pull the
# hourly-entries and hourly-exits columns out as separate selections.
one = turnstileData(filePath='improved_data_set/old.csv')
justEntriesH, justExistsH = (
    one.select([column_name])
    for column_name in ('ENTRIESn_hourly', 'EXITSn_hourly')
)
import numpy as np
from ggplot import *
import scipy
import scipy.stats
import pandas
from data_model import turnstileData
def plot_cost_history(alpha, cost_history):
"""This function is for viewing the plot of your cost history.
You can run it by uncommenting this
import pandas
import numpy as np
class turnstileData(object):
    """Thin wrapper around a CSV file loaded into a pandas DataFrame.

    The parsed table is kept on ``self.df`` so callers can still reach the
    underlying DataFrame directly.
    """

    def __init__(self, filePath):
        """Read the CSV at ``filePath`` into ``self.df``.

        ``filePath`` is handed straight to ``pandas.read_csv``, so anything
        that function accepts (a path string or a file-like object) works.
        """
        self.df = pandas.read_csv(filePath)

    def select(self, elements):
        """Return ``self.df[elements]``.

        ``elements`` is used as-is to index the DataFrame: a single column
        label yields a Series, a list of labels yields a DataFrame.
        """
        return self.df[elements]
@Akramz
Akramz / linear_model_OLS.py
Created October 24, 2015 12:11
Linear regression of hourly entries on tempi (tempi -> hourly entries)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Load the weather/turnstile table, using the first CSV column as the index.
df = pd.read_csv('../improved_data_set/turnstile_weather_v2.csv', index_col=0)

# Mean ENTRIESn_hourly for every distinct 'tempi' value, as a one-column
# frame indexed by 'tempi'.
mean_entries_by_tempi = df.groupby('tempi')['ENTRIESn_hourly'].mean()
dk = mean_entries_by_tempi.to_frame()

# Expose the group key as a regular column alongside the index.
dk['tempi'] = dk.index

y = dk['ENTRIESn_hourly']  # response
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sys import exit
# Load the weather/turnstile table, using the first CSV column as the index.
df = pd.read_csv(
    '../improved_data_set/turnstile_weather_v2.csv',
    index_col=0,
)

# Mean ENTRIESn_hourly per 'hour' value; selecting with a list keeps the
# result a DataFrame indexed by 'hour'.
dk = df.groupby('hour')[['ENTRIESn_hourly']].mean()

# Expose the group key as a regular column alongside the index.
dk['hour'] = dk.index
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sys import exit
# Column names to experiment with -- edit freely.
# (Presumably the regressors; the code consuming this list is not shown here.)
listX = [
    'day_week',
    'fog',
    'rain',
    'weekday',
]

# Load the weather/turnstile table, using the first CSV column as the index.
df = pd.read_csv(
    '../improved_data_set/turnstile_weather_v2.csv',
    index_col=0,
)
import numpy as np
from ggplot import *
import pandas
# Histogram of the ENTRIESn_hourly column, drawn with ggplot.
df = pandas.read_csv('../../improved_data_set/old.csv')
hourly_entries = df[['ENTRIESn_hourly']]

# Build the plot layer by layer instead of on one very long line.
p = (
    ggplot(aes(x='ENTRIESn_hourly'), data=hourly_entries)
    + geom_histogram(color='white', fill='red')
    + xlab("hourly entries")
    + ylab("Frequencies")
)

# Call form of print: behaves the same on Python 2 for a single argument
# and, unlike the `print p` statement, is also valid on Python 3.
print(p)
import numpy as np
import pandas
import matplotlib.pyplot as plt
# Overlay histograms of hourly entries on one figure: first the rows
# where rain == 0, then the rows where rain == 1.
df = pandas.read_csv('../../improved_data_set/old.csv')
plt.figure()
for rain_flag in (0, 1):
    df.loc[df['rain'] == rain_flag, 'ENTRIESn_hourly'].hist()
plt.xlabel('hourly entries')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sys
# Load the weather/turnstile table, using the first CSV column as the index.
df = pd.read_csv(
    '../improved_data_set/turnstile_weather_v2.csv',
    index_col=0,
)

# Column to experiment with -- change to whatever you want to try.
element = 'hour'