import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the hourly bike-sharing records; the first CSV row holds the column names.
# pandas opens and closes the file itself when given a path.
data = pd.read_csv('bikes_hourly.csv', delimiter=',', header=0)
colnames = data.columns
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(np.array(colnames))
We'll first draw a scatterplot showing the relative frequency and value of the number of bikes shared for a given hour, temperature or wind speed. The colour refers to the four different seasons (for example, in the winter we'll have fewer bikes shared on average, as well as lower temperatures and higher wind speeds).
Plotting the data with a lower opacity allows us to see how frequent a given number of bikes shared actually is. For example, we can see that at 8 o'clock we'll generally have around 700, around 400 or around 200 bikes shared, probably depending on the season.
# Dark-grid theme on a white background, with a yellow-green-blue palette.
sns.set_style("darkgrid", {"grid.color": ".9", "axes.facecolor": "White"})
sns.set_palette("YlGnBu")

# One scatter panel per selected column (picked by position in `colnames`),
# each plotted against the bike count and coloured by season.
g = sns.PairGrid(
    data,
    x_vars=colnames[[3, -4, -2]],
    y_vars=["number_bikes_shared"],
    hue='season',
    size=5,
    aspect=1,
)
# A low alpha lets overlapping points read as density.
g.map(plt.scatter, alpha=.15)
g.add_legend();
#sns.plt.savefig("img1.png", dpi=600)
A KDE (kernel density estimate) plot allows us to see where the number of bikes shared is concentrated for a given temperature, humidity or wind speed.
# Joint KDE of the bike count against each weather variable in turn.
for name in ['temperature', 'humidity', 'windspeed']:
    sns.jointplot(
        x="number_bikes_shared",
        y=name,
        data=data,
        kind='kde',
        stat_func=None,  # no statistic annotation on the plot
    )
plt.show()
The pairgrid allows us to uncover relationships between any two variables (a row variable and a column variable), as well as the relative frequency of each variable, plotted as a histogram on the diagonal.
# Variables compared pairwise; points and histogram bars are coloured by season.
pair_vars = [
    'weather_class',
    'temperature',
    'humidity',
    'windspeed',
    'number_bikes_shared',
]
g = sns.PairGrid(data, hue='season', vars=pair_vars)
# Histograms on the diagonal, faint scatterplots everywhere else.
g = g.map_diag(plt.hist).map_offdiag(plt.scatter, alpha=.1)
The distplot allows us to show the relative frequency as well as the inferred distribution for each variable (although of course it's rather pointless for categorical variables).
# One small histogram-plus-density figure per column.
for name in colnames:
    # Keep a handle on the new figure so the title lands on it, rather than
    # on the PairGrid figure that `g` still refers to.
    fig = plt.figure(figsize=(4, 2))
    fig.suptitle(name)
    sns.distplot(data[name], color='teal', norm_hist=True);
By dividing the data into a training set and a test set, we can first fit our model on the training set, and then measure its performance on entirely new data - the test set.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# `model_selection` location and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Every column except the last is a predictor; the last column is the target.
predictors = np.asarray(data.iloc[:, 0:-1])
target = np.asarray(data.iloc[:, -1])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.30, random_state=42)
from sklearn import linear_model
from sklearn import metrics

# Ordinary least-squares baseline: fit on the training set, predict the test set.
glm = linear_model.LinearRegression()
glm.fit(X_train, y_train)
prediction = glm.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). The order is irrelevant for MSE
# but not for R^2, which is normalised by the variance of the first argument.
# A single formatted string prints the label on both Python 2 and Python 3.
print("%s mean squared error" % metrics.mean_squared_error(y_test, prediction))
print("%s r2 score" % metrics.r2_score(y_test, prediction))
# NOTE(review): `tensorflow.contrib` is not available in TensorFlow 2.x, so
# this cell needs a TensorFlow 1.x installation - confirm the pinned version.
import tensorflow.contrib.learn as skflow
from sklearn import datasets, metrics

# Fully connected regressor with three hidden layers of 200, 100 and 40 units.
DNN = skflow.DNNRegressor(hidden_units=[200, 100, 40])
DNN.fit(X_train, y_train, steps=20000, batch_size=80)
prediction_tf = DNN.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric in its
# arguments, so the true values must come first.
# A single formatted string prints the label on both Python 2 and Python 3.
print("%s mean squared error" % metrics.mean_squared_error(y_test, prediction_tf))
print("%s r2" % metrics.r2_score(y_test, prediction_tf))
# Compare predicted and actual values for the first `split` test rows of the
# linear model: one dot per value, joined by a vertical segment per row.
split = 600
x_as = np.arange(len(prediction))[:split]
# Column 0: rounded prediction, column 1: rounded actual value.
ar2 = np.round(np.column_stack((prediction[:split], y_test[:split])))

fig = plt.figure(figsize=(20, 10))
ax = plt.subplot()
# Draw every residual segment in one call instead of one plot() per row.
plt.vlines(x_as, ar2[:, 0], ar2[:, 1], colors='#2fa1bc', linewidth=1)
# Keep the scatter handles so the legend refers to this figure's artists.
plot_pred = plt.scatter(x_as, prediction[:split], color="#94cfb8")
plot_test = plt.scatter(x_as, y_test[:split], color="#a8eb7a")
plt.title("for linear regression")
plt.legend([plot_pred, plot_test], ["predicted", "actual"])
#plt.savefig("lr_res.png", dpi=200)
plt.show()
# Compare predicted and actual values for the first `split` test rows of the
# DNN model: one dot per value, joined by a vertical segment per row.
split = 600
x_as = np.arange(len(prediction_tf))[:split]
# Column 0: rounded prediction, column 1: rounded actual value.
# column_stack works on Python 2 and 3, unlike rounding a zip() result.
ar2 = np.round(np.column_stack((prediction_tf[:split], y_test[:split])))

fig = plt.figure(figsize=(20, 10))
ax = plt.subplot()
# Draw every residual segment in one call instead of one plot() per row.
plt.vlines(x_as, ar2[:, 0], ar2[:, 1], colors='#2fa1bc', linewidth=1)
plot_pred = plt.scatter(x_as, prediction_tf[:split], color="#94cfb8")
plot_test = plt.scatter(x_as, y_test[:split], color="#a8eb7a")
plt.title("for DNN")
plt.legend([plot_pred, plot_test], ["predicted", "actual"])
#plt.savefig("dnn_res.png", dpi=200)
plt.show()
There are just as many values on this plot as on the first one — the relative emptiness is simply due to the DNN model's predictive power (as smaller differences between predicted and actual values lead to much shorter blue segments).