%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_boston
boston = load_boston()
boston.keys()
boston.feature_names
print(boston.DESCR)
boston.data.shape
Let's create a pandas DataFrame for this dataset.
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data.head()
Let's also add the target column to the DataFrame.
data['PRICE'] = boston.target
data.head()
data.describe()
The first thing to do with a new dataset is to get a sense of it by visualizing it. Let's see whether there is any correlation between the various columns.
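One quick check, before plotting individual pairs, is how strongly each column correlates with the price. The snippet below is a minimal sketch that assumes the data DataFrame created above.
# Correlation of every feature with the target, strongest first.
data.corr()['PRICE'].sort_values(ascending=False)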
data.plot(kind="scatter", x='RM', y='PRICE')
plt.hist(data.CRIM)
plt.title("CRIM")
plt.xlabel("Crime rate per capita")
plt.ylabel("Frequencey")
plt.show()
It looks like Boston was a very peaceful city back then: most towns have a low per-capita crime rate.
data.PRICE.hist()
from sklearn.linear_model import LinearRegression
X = data.drop('PRICE', axis = 1)
model = LinearRegression()
model.fit(X, data.PRICE)
print('Estimated intercept coefficient:', model.intercept_)
print("coefficients:")
print(model.coef_)
for col, coef in zip(X.columns, model.coef_):
print("{:+6.4f} {}".format(coef, col))
The coefficients tell us a lot about which features matter.
The house price seems to be strongly and positively affected by RM, the average number of rooms per dwelling, and RAD, the index of accessibility to radial highways, and negatively affected by NOX, the nitric oxides concentration, and DIS, the weighted distance to employment centres.
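To make that comparison easier, the sketch below sorts the same coefficients by absolute magnitude; it assumes the X and model objects from above. Keep in mind that raw coefficients are only loosely comparable here, since the features are not on a common scale.
# Coefficients ordered by absolute magnitude, largest first.
coefs = pd.Series(model.coef_, index=X.columns)
coefs.reindex(coefs.abs().sort_values(ascending=False).index)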
plt.scatter(np.log(data.DIS), data.PRICE)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, data.PRICE, test_size=0.33, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)
model = LinearRegression()
model.fit(X_train, Y_train)
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print("MSE train:", np.mean((Y_train - model.predict(X_train)) ** 2))
print("MSE test:", np.mean((Y_test - model.predict(X_test)) ** 2))
plt.scatter(pred_train, pred_train - Y_train, c='b', s=40, alpha=0.5, label="train")
plt.scatter(pred_test, pred_test - Y_test, c='g', s=40, label="test")
plt.hlines(y=0, xmin=0, xmax=50)
plt.ylabel('Residuals')
plt.legend()
model
np.sum((pred_train - Y_train) ** 2)  # residual sum of squares (residues_ was removed from LinearRegression)
model.get_params()
import statsmodels
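statsmodels can fit the same regression and also report standard errors and p-values for each coefficient. The lines below are a sketch that assumes the X_train and Y_train split from above; note that statsmodels expects the intercept column to be added explicitly.
import statsmodels.api as sm
# Ordinary least squares with an explicit intercept term.
ols = sm.OLS(Y_train, sm.add_constant(X_train)).fit()
print(ols.summary())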