import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from statsmodels.api import OLS
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline
Each row of the data describes one day through calendar and weather attributes, and the last column, 'count', contains the response variable: the total number of bike rentals on that day.
As a first step, we identify important characteristics of the data using suitable visualizations where necessary. Some of the questions we may ask include (but are not limited to): How do rentals differ between weekdays and weekends? Do holidays, seasons, or weather conditions matter? How do rentals vary with temperature, humidity, and windspeed?
df_train = pd.read_csv('data/Bikeshare_train.csv', index_col = 0)
df_test = pd.read_csv('data/Bikeshare_test.csv', index_col = 0)
df_train.head()
df_train.describe()
f, axes = plt.subplots(6, 2, figsize = (10, 20))
plt.subplots_adjust(hspace = .3)
i, j = 0, 0
for col in df_train.columns:
    axes[i, j].hist(df_train[col])
    axes[i, j].set_title(col)
    axes[i, j].spines['top'].set_visible(False)
    axes[i, j].spines['right'].set_visible(False)
    j += 1
    if j == 2:
        j = 0
        i += 1
# hide the unused final subplot
axes[i, j].spines['top'].set_visible(False)
axes[i, j].spines['right'].set_visible(False)
axes[i, j].spines['left'].set_visible(False)
axes[i, j].spines['bottom'].set_visible(False)
axes[i, j].tick_params(axis = 'both', which = 'both', bottom = False, left = False,
                       labelbottom = False, labelleft = False)
count_means_by_day = df_train.groupby('day_of_week')['count'].mean().tolist()
# days 0 and 6 are the weekend; days 1-5 are weekdays
weekend_count_mean = (count_means_by_day[0] + count_means_by_day[-1]) / 2
weekday_count_mean = np.mean(count_means_by_day[1:6])
weekend_count_mean, weekday_count_mean
count_means_by_holiday = df_train.groupby('holiday')['count'].mean()
count_means_by_season = df_train.groupby('season')['count'].mean()
count_means_by_weather = df_train.groupby('weather')['count'].mean()
count_means_by_temp = df_train.groupby('temp')['count'].mean()
windspeed_means = df_train.groupby('windspeed')['count'].mean()
plt.scatter(windspeed_means.index, windspeed_means.values)
plt.xlabel('windspeed')
plt.ylabel('mean count')
plt.show()
df = df_train.copy()
# flag weekend days (day_of_week 0 or 6)
df['is_weekend'] = df.day_of_week.isin([0, 6])
colors = ['#f2c18c', '#c65503']
for g, group in df.groupby('is_weekend'):
    # alpha so the overlapping bars stay visible
    plt.hist(group['count'], color = colors.pop(), alpha = 0.7)
plt.legend(['Weekday', 'Weekend'])
plt.show()
We cannot use numerically coded categorical attributes directly as predictors, because the numbers do not represent any meaningful 'distance' between categories. Instead, we treat each category as a separate binary case by creating dummy variables.
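As a minimal illustration (toy data, not from the bikeshare set), pd.get_dummies turns one coded column into a set of 0/1 indicator columns:
# toy example: a categorical code of 1, 2, or 3 becomes three indicator columns
toy = pd.DataFrame({'weather': [1, 2, 3, 1]})
print(pd.get_dummies(toy, columns = ['weather']))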
df_train = pd.get_dummies(df_train, columns = ['weather', 'season', 'day_of_week', 'month'])
df_test = pd.get_dummies(df_test, columns = ['weather', 'season', 'day_of_week', 'month'])
# drop one level of each categorical variable to serve as the baseline
# and avoid perfect collinearity with the constant term
df_train = df_train.drop(['weather_3.0', 'month_12.0', 'season_4.0', 'day_of_week_6.0'], axis = 1)
df_test = df_test.drop(['weather_3.0', 'month_12.0', 'season_4.0', 'day_of_week_6.0'], axis = 1)
cont_cols = ['windspeed', 'atemp', 'temp', 'humidity']
df_train_cont = df_train[cont_cols]
# standardize both sets with the *training* mean and std, so no test
# information leaks into the preprocessing
df_train[cont_cols] = (df_train_cont - df_train_cont.mean()) / df_train_cont.std()
df_test[cont_cols] = (df_test[cont_cols] - df_train_cont.mean()) / df_train_cont.std()
df_train.to_csv('bike_train_df.csv')
df_test.to_csv('bike_test_df.csv')
df_train.head()
# pandas Series have no .reshape; go through .values first
Xtrain, ytrain = df_train.drop('count', axis = 1), df_train['count'].values.reshape(-1, 1)
Xtest, ytest = df_test.drop('count', axis = 1), df_test['count'].values.reshape(-1, 1)
mult_reg = sm.OLS(ytrain, sm.add_constant(Xtrain)).fit()
ytest_pred = mult_reg.predict(sm.add_constant(Xtest))
ytrain_pred = mult_reg.predict(sm.add_constant(Xtrain))
print("Train R2 score:", r2_score(ytrain, mult_reg.predict(sm.add_constant(Xtrain))))
print("Test R2 score:", r2_score(ytest, ytest_pred))
print(mult_reg.summary())
From our analysis there is not enough evidence to reject the null hypothesis that day of the week does not predict the number of bike rentals; we therefore have no evidence of a relationship between day of the week and rentals.
The same is true for month.
There is also no evidence that holiday status affects the number of bike rentals.
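To see where these conclusions come from, we can pull the relevant p-values out of the fitted model (a quick check using the dummy-column names created above):
# inspect p-values for the day_of_week, month, and holiday terms
for prefix in ['day_of_week', 'month', 'holiday']:
    terms = [c for c in mult_reg.pvalues.index if c.startswith(prefix)]
    print(mult_reg.pvalues[terms])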
The temp variable is assigned a larger coefficient than atemp. Since temp and atemp take similar values, this suggests that, within the regression model, temp contributes more to the predicted outcome. Looking at the p-values, there is sufficient evidence of a relationship between temp and rental counts, but not of one between atemp and rental counts.
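As a quick check on how entangled the two temperature measures are, we can look at their correlation directly; a value close to 1 means the model has trouble attributing the effect to one or the other:
# correlation between the two (standardized) temperature measures
print(df_train[['temp', 'atemp']].corr())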
# residuals vs. fitted values, then the residual distribution
residuals = ytrain.ravel() - ytrain_pred.values
plt.scatter(ytrain_pred.values, residuals)
plt.axhline(y = 0, color = 'red')
plt.show()
plt.hist(residuals)
plt.show()
pd.plotting.scatter_matrix(df_train_cont, figsize = (10,10))
plt.show()
corr = df_train.drop('count', axis = 1).corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr)
We need a way of selecting only the significant features to use in our model. Two useful methods for doing this are forward and backward selection. These are not implemented in scikit-learn, so we can write a quick function for each algorithm.
def forward_selection(df_train, ytrain):
    S = []                      # predictors selected so far
    results = {}                # best BIC at each size -> predictor set
    for i in range(len(df_train.columns) - 1):
        model_results = {}
        # try adding each remaining predictor to the current set
        for column in df_train.drop(S + ['count'], axis = 1).columns:
            test_set = S + [str(column)]
            model = sm.OLS(ytrain, sm.add_constant(df_train[test_set])).fit()
            model_results[model.bic] = test_set
        # keep the addition that gives the lowest BIC at this step
        minimum = min(model_results.keys())
        S = model_results[minimum]
        results[minimum] = S
    # return the predictor set with the lowest BIC over all sizes
    key = min(results.keys())
    return results[key], key
def backward_selection(df_train, ytrain):
    dropped = ['count']         # columns removed so far (the response is never a predictor)
    results = {}                # best BIC at each size -> predictor set
    for i in range(len(df_train.columns) - 2):
        model_results = {}
        # try removing each remaining predictor from the current set
        for column in df_train.drop(dropped, axis = 1).columns:
            test_set = [x for x in df_train.columns if x not in dropped + [str(column)]]
            model = sm.OLS(ytrain, sm.add_constant(df_train[test_set])).fit()
            model_results[model.bic] = test_set
        # keep the removal that gives the lowest BIC at this step
        minimum = min(model_results.keys())
        minimum_set = model_results[minimum]
        dropped = [x for x in df_train.columns if x not in minimum_set]
        results[minimum] = minimum_set
    # return the predictor set with the lowest BIC over all sizes
    key = min(results.keys())
    return results[key], key
forward_set, backward_set = forward_selection(df_train, ytrain)[0], backward_selection(df_train, ytrain)[0]
forward_params, backward_params = df_train[forward_set], df_train[backward_set]
forward_df, backward_df = df_train[forward_set + ['count']], df_train[backward_set + ['count']]
forward_test_params, backward_test_params = df_test[forward_set], df_test[backward_set]
# note: the test frames must come from df_test, not df_train
forward_test_df, backward_test_df = df_test[forward_set + ['count']], df_test[backward_set + ['count']]
forward_model = sm.OLS(ytrain, sm.add_constant(forward_params)).fit()
forward_pred = forward_model.predict(sm.add_constant(forward_test_params))
print('Forward Selection')
print('Train R2:', r2_score(ytrain, forward_model.predict(sm.add_constant(forward_params))))
print('Test R2:', r2_score(ytest, forward_pred))
print('\nParameters:', forward_set)
backward_model = sm.OLS(ytrain, sm.add_constant(backward_params)).fit()
backward_pred = backward_model.predict(sm.add_constant(backward_test_params))
print('\nBackward Selection')
print('Train R2:',r2_score(ytrain, backward_model.predict(sm.add_constant(backward_params))))
print('Test R2:', r2_score(ytest, backward_pred))
print('\nParameters:', backward_set)
Next we will use k-fold cross-validation to validate our models, averaging each fitted model's R2 score across folds.
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in newer versions

def fit_model_r2(X, ytrain, Xtest, ytest):
    model = sm.OLS(ytrain, sm.add_constant(X)).fit()
    results = model.predict(sm.add_constant(Xtest))
    return r2_score(ytest, results)

def cross_fold_10(model_df):
    r2 = []
    for train, test in KFold(n_splits = 10).split(model_df):
        Xytrain = model_df.iloc[train]
        Xytest = model_df.iloc[test]
        r2.append(fit_model_r2(Xytrain.drop('count', axis = 1), Xytrain['count'],
                               Xytest.drop('count', axis = 1), Xytest['count']))
    return np.mean(r2)
print('R2 Scores\n\nForward Cross Fold:', cross_fold_10(forward_df),
'\nLinear Cross Fold:', cross_fold_10(df_train),
'\nBackward Cross Fold:', cross_fold_10(backward_df))
The issue with doing this is that we used the training data both to choose our predictors and to validate the resulting models, so the data has been used twice and the cross-validation scores are optimistically biased. We can still use the held-out test data to evaluate how the models generalize to new data.
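One way to avoid this double use of the data is to rerun the selection inside each training fold, so the held-out fold never influences which predictors are chosen. A sketch, reusing the functions defined above:
# sketch: forward selection repeated inside every fold
def cross_fold_10_with_selection(model_df):
    r2 = []
    for train, test in KFold(n_splits = 10).split(model_df):
        fold_train, fold_test = model_df.iloc[train], model_df.iloc[test]
        fold_set, _ = forward_selection(fold_train, fold_train['count'])
        r2.append(fit_model_r2(fold_train[fold_set], fold_train['count'],
                               fold_test[fold_set], fold_test['count']))
    return np.mean(r2)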
In the next post we will try to improve our models' performance using regularization, which can help with the high variance we are seeing due to multicollinearity.
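As a rough preview (a minimal sketch using the RidgeCV imported above, with a placeholder alpha grid; the details are for the next post), cross-validated ridge regression in scikit-learn looks like this:
# minimal sketch: ridge regression with the penalty strength chosen by cross-validation
ridge = RidgeCV(alphas = [0.01, 0.1, 1.0, 10.0]).fit(Xtrain, ytrain.ravel())
print('Chosen alpha:', ridge.alpha_)
print('Test R2:', ridge.score(Xtest, ytest.ravel()))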