Cognizant, Bangalore
March, 2015
Jigsaw Academy
Instructor: Anand Chitipothu
These live notes are available online at http://bit.ly/cognizant-py.
We are going to look at various classification algorithms today.
Wine Quality dataset (download): wine recognition data from the UCI machine learning repository.
We are going to use the following function for drawing the classification boundaries, to get a visual sense of how each classification algorithm is working on the given data.
import os
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn import tree
# https://gist.github.com/anandology/772d44d291a9daa198d4
def plot_decision_boundaries(X, y, model_class, **model_params):
    """Function to plot the decision boundaries of a classification model.

    This uses just the first two columns of the data for fitting
    the model, as we need to find the predicted value for every point
    in the scatter plot.

    One possible improvement could be to use all columns for fitting
    and using the first 2 columns and the median of all other columns
    for predicting.

    Adapted from:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_decision_regions.html
    http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html
    """
    reduced_data = X[:, :2]
    model = model_class(**model_params)
    model.fit(reduced_data, y)

    # Build a mesh covering the range of the data. The step size (0.1)
    # controls the resolution of the plot.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    # Obtain the predicted label for each point in the mesh and plot
    # the resulting regions, with the data points on top.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=y, alpha=0.8)
    return plt
def export_tree(model, filename="tree.dot", simple=False):
    """Exports the decision tree as a graphviz file."""
    params = {}
    if not simple:
        params = dict(filled=True, rounded=True, special_characters=True)
    with open(filename, "w") as f:
        tree.export_graphviz(model, out_file=f,
                             class_names=iris.target_names,  # uses the iris dataset loaded below
                             **params)
def show_tree(model, filename="tree.dot", simple=False, text=False):
    """Displays the tree as an image.

    This requires the graphviz package to be installed on
    the computer. If it is not installed, the graphviz file
    is displayed as text instead.
    """
    dot_exists = os.system("dot -V") == 0
    if not dot_exists or text:
        simple = True
    export_tree(model, filename, simple=simple)
    pngfile = filename.replace(".dot", ".png")
    if dot_exists and not text:
        os.system("dot -Tpng {} -o {}".format(filename, pngfile))
        return Image(pngfile)
    else:
        txt = open(filename).read().replace("\\n", "\n    ").replace(";", ";\n")
        print(txt)
In linear regression, we model the response directly as a linear function of the features:

$y = \beta_0 + \beta_1 X$

In logistic regression, we instead model the log-odds of the probability $p$ as a linear function:

$logit(p) = \beta_0 + \beta_1 X$

where:

$logit(p) = \log\left(\frac{p}{1-p}\right)$

If you work hard (exponentiate both sides to get $\frac{p}{1-p} = e^{\beta_0 + \beta_1 X}$ and solve for $p$), you can figure out that:

$p = \frac{1}{1 + e^{-(\beta_0 + \beta_1 X)}}$
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Let's take the iris data.
from sklearn.datasets import load_iris
iris = load_iris()
# convert the data into two classes
X = iris.data
y = (iris.target > 0).astype(np.int8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap="summer")
pd.Series(y).value_counts()
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
model.predict(X[:5])
y[:5]
model.predict_proba(X[:5])
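As a sanity check, a small sketch showing that the class-1 probabilities from predict_proba are exactly the sigmoid of the fitted linear combination $\beta_0 + \beta \cdot x$ from the formulas above:

z = X[:5].dot(model.coef_.ravel()) + model.intercept_[0]
print(1 / (1 + np.exp(-z)))              # sigmoid of the linear score
print(model.predict_proba(X[:5])[:, 1])  # should match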
Problem: Find the class for the sample [5, 4, 3, 1]. What is the probability that it belongs to class 0?
X[:5]
X[:1]
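One possible solution sketch: predict expects a 2-D array, which is why X[:1] above keeps the outer brackets.

sample = [[5, 4, 3, 1]]
print(model.predict(sample))               # predicted class
print(model.predict_proba(sample)[:, 0])   # probability of class 0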
Let's try to plot the probability of being in class 0 for every point.
p0 = model.predict_proba(X)[:, 0]
plt.plot(sorted(p0))
iris.target_names
plot_decision_boundaries(X, y, LogisticRegression)
Q: Can we try with three classes?
plot_decision_boundaries(X, iris.target, LogisticRegression)
Problem: Take the wine data and try to fit a logistic regression model.
The first column is the class label.
wine = pd.read_csv("datasets/wine.data", header=None)
wine.shape
wine.columns
wine.head()
wine.columns = ["C"+str(n) for n in wine.columns]
wine.head()
X = wine.drop("C0", axis=1)
y = wine.C0
X.shape
y.shape
plot_decision_boundaries(X.values, y.values, LogisticRegression)
wine.C0.value_counts()
model = LogisticRegression()
model.fit(X,y)
yp = model.predict(X)
from sklearn.metrics import accuracy_score
accuracy_score(y, yp)
y.shape
(y==yp).sum()
173/178.0
Split the data into train and test.
from sklearn.cross_validation import train_test_split  # in newer scikit-learn: sklearn.model_selection
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3)
Xtrain.shape
Xtest.shape
pd.Series(ytrain).value_counts()
pd.Series(ytest).value_counts()
model = LogisticRegression()
model.fit(Xtrain, ytrain)
ytrainp = model.predict(Xtrain)
print("accurary on training data is ", accuracy_score(ytrain, ytrainp))
ytestp = model.predict(Xtest)
print("accurary on training data is ", accuracy_score(ytest, ytestp))
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, ytestp)
pd.Series(ytest).value_counts()
pd.Series(ytestp).value_counts()
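To make the confusion matrix easier to read, a small sketch (the labeling is my own) that wraps it in a labeled DataFrame; confusion_matrix orders rows and columns by sorted label:

labels = np.unique(ytest)
cm = confusion_matrix(ytest, ytestp)
pd.DataFrame(cm,
             index=["true {}".format(l) for l in labels],
             columns=["predicted {}".format(l) for l in labels])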
from sklearn.tree import DecisionTreeClassifier
X = iris.data[:, [2, 3]] # petal-length and petal-width
y = iris.target
plt.scatter(X[:, 0], X[:, 1], c=y, cmap="hot")
plot_decision_boundaries(X, y, DecisionTreeClassifier, max_depth=1)
plot_decision_boundaries(X, y, DecisionTreeClassifier, max_depth=2, random_state=0)
plot_decision_boundaries(X, y, DecisionTreeClassifier, max_depth=3)
model = DecisionTreeClassifier(max_depth=2, random_state=0)
model.fit(X, y)
show_tree(model)
yp = model.predict(X)
accuracy_score(y, yp)
confusion_matrix(y, yp)
show_tree(model, text=True)
Let's see what happens if we allow the decision tree to go very deep: the boundaries start fitting individual points, a sign of overfitting.
plot_decision_boundaries(X, y, DecisionTreeClassifier, max_depth=5)
Problem: Build a decision tree model using all the features of iris. Split the data into training and test sets and see what the model accuracy is on each. Find out at what max_depth the model gives the best accuracy.
X = iris.data
y = iris.target
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=y)
def try_model(max_depth):
    model = DecisionTreeClassifier(max_depth=max_depth)
    model.fit(Xtrain, ytrain)
    ytrainp = model.predict(Xtrain)
    ytestp = model.predict(Xtest)
    return [max_depth,
            accuracy_score(ytrain, ytrainp),
            accuracy_score(ytest, ytestp)]
data = pd.DataFrame([try_model(i) for i in range(1, 10)])
data.set_index(0, inplace=True)
data.plot()
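The default numeric column names make the plot legend cryptic; a small sketch naming them (the names are my own, not part of the original output):

data.columns = ["train_accuracy", "test_accuracy"]
data.index.name = "max_depth"
data.plot()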
model = DecisionTreeClassifier(max_depth=2)
model.fit(Xtrain, ytrain)
show_tree(model)
from sklearn.tree import DecisionTreeRegressor
d = pd.read_csv("datasets/chirps.tsv", delimiter="\t")
d.head()
model = DecisionTreeRegressor()
model.fit(d[["temp"]], d["chirps"])
show_tree(model)
yp = model.predict(d[["temp"]])
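To see the piecewise-constant fit a regression tree produces, a quick sketch (assuming the chirps dataset has the temp and chirps columns shown above):

xs = pd.DataFrame({"temp": np.linspace(d.temp.min(), d.temp.max(), 200)})
plt.scatter(d.temp, d.chirps)
plt.plot(xs.temp, model.predict(xs), color="red")  # one constant segment per leaf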
!cat datasets/wine.names
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(iris.data)
data2 = scaler.transform(iris.data)
d = pd.DataFrame(data2)
d.describe()
x1 = [3, 4, 5, 2]
scaler.transform([x1])
scaler.mean_
scaler.std_
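A sanity-check sketch: StandardScaler's transform is just (x - mean) / std, computed column-wise. (In newer scikit-learn versions the attribute is scaler.scale_ rather than scaler.std_.)

x1 = np.array([3, 4, 5, 2])
print((x1 - scaler.mean_) / scaler.std_)  # manual standardization
print(scaler.transform([x1]))             # should match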
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scale', StandardScaler()),
                 ('dt', DecisionTreeClassifier())])
pipe.fit(X, y)
yp = pipe.predict(X)
accuracy_score(y, yp)
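The pipeline above is fit and evaluated on the same data. A sketch of the more honest version with a held-out split; the scaler then learns its mean and std from the training data only, which is the main point of putting it in a pipeline:

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3)
pipe = Pipeline([('scale', StandardScaler()),
                 ('dt', DecisionTreeClassifier())])
pipe.fit(Xtr, ytr)
accuracy_score(yte, pipe.predict(Xte))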
from sklearn.datasets import make_circles
X, y = make_circles(n_samples=1000, factor=0.2, noise=0.1)
plt.scatter(X[:, 0], X[:, 1], c=y)
d = pd.DataFrame(X, columns=["x1", "x2"])
d["y"] = y
d.head()
d.plot(kind="scatter", x="x1", y="x2", c="y", cmap="summer")
plt.scatter(d.x1, d.x1*d.x1+d.x2*d.x2, c=d.y)
plt.scatter(d.x1, (d.x1*d.x1+d.x2*d.x2)**0.5, c=d.y)
d["x3"] = (d.x1*d.x1+d.x2*d.x2)**0.5
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(d[["x1", "x2", "x3"]], d.y)
yp = model.predict(d[["x1", "x2", "x3"]])
accuracy_score(y, yp)
plot_decision_boundaries(d[["x1", "x3"]].values, d.y.values, LogisticRegression)
X[:5]
plot_decision_boundaries(X, y, LogisticRegression)
from sklearn.svm import SVC
plot_decision_boundaries(X, y, SVC, kernel="linear")
plot_decision_boundaries(X, y, SVC, kernel="rbf")
Problem: There is another interesting generated dataset called moons. Try it with all the classification algorithms and see how they perform.
from sklearn.datasets import make_moons
X, y = make_moons(1000)
plt.scatter(X[:, 0], X[:, 1], c = y)
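One possible solution sketch: make_moons(1000) with no noise is trivially separable, so add some noise and compare the classifiers we have seen so far:

X, y = make_moons(1000, noise=0.2)
for model_class, params in [(LogisticRegression, {}),
                            (DecisionTreeClassifier, {"max_depth": 4}),
                            (SVC, {"kernel": "rbf"})]:
    plot_decision_boundaries(X, y, model_class, **params)
    plt.show()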
X, y = make_circles(n_samples=1000, factor=0.2, noise=0.1)
d = pd.DataFrame(X, columns=["x1", "x2"])
d['y'] = y
d['x3'] = (d.x1*d.x1+d.x2*d.x2)**0.5
model = SVC(kernel="linear")
model.fit(d[['x1', 'x3']], y)
yp = model.predict(d[['x1', 'x3']])
d['yp'] = yp
plt.scatter(d.x1, d.x3, c=d.yp)
plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], c='y')
model.support_vectors_
Q: Can we get three concentric rings?
def scatter_circles(n, factor, noise=0.05):
    X, y = make_circles(n, factor=factor, noise=noise)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    return X, y
X1, y1 = scatter_circles(1000, 0.1)
X2, y2 = scatter_circles(1000, 0.6)
X1[:5], y1[:5]
X2[:5], y2[:5]
plt.scatter(X1[:, 0], X1[:, 1], c=y1)
plt.scatter(X2[:, 0], X2[:, 1], c=y2)
d = pd.DataFrame(X1, columns=["x1", "x2"])
d["y"] = y1
d[y1==0].plot(x="x1", y="x2", kind="scatter")
d[y1==1].plot(x="x1", y="x2", kind="scatter")
d3 = d[y1==1]
d3.y.head()
d2 = pd.DataFrame(X2, columns=["x1", "x2"])
d2["y"] = y2
d2.head()
d3.head()
d4 = d2.append(d3)  # in newer pandas, use pd.concat([d2, d3]) instead
d4.plot(kind="scatter", x="x1", y="x2", c="y")
def circles(n, factor):
    X, y = make_circles(n, factor=factor, noise=0.05)
    d = pd.DataFrame(X, columns=["x1", "x2"])
    d["y"] = y
    return d
d1 = circles(1000, 0.2)
d2 = circles(1000, 0.6)
d3 = d2[d2.y==1].copy()  # copy so the label change below doesn't touch d2
d3.y = 2
d4 = d1.append(d3)
d4.plot(kind="scatter", x="x1", y="x2", c="y", cmap="cool")
plot_decision_boundaries(d4[["x1", "x2"]].values, d4.y.values, SVC)
d4["x12"] = (d4.x1*d4.x1+d4.x2*d4.x2)
plot_decision_boundaries(d4[["x1", "x12"]].values, d4.y.values, LogisticRegression)
model = SVC()
model.fit(d4[["x1", "x2"]], d4.y)
plt.scatter(d4.x1, d4.x2, c=d4.y)
plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], c='y')
plot_decision_boundaries(d4[["x1", "x2"]].values, d4.y.values,
                         DecisionTreeClassifier, max_depth=8)
xx, yy = np.meshgrid(np.arange(5), np.arange(10, 15))
xx
xx.ravel()
xx.reshape(-1)
# The same set of points as the mesh, but ordered with y varying fastest
# instead of x:
np.array([[x, y] for x in np.arange(5) for y in np.arange(10, 15)])
np.c_[xx.reshape(-1), yy.reshape(-1)]
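A quick check that the mesh construction and the nested loop produce the same set of points, only in a different order (np.c_ varies x fastest, the loop varies y fastest):

pts_mesh = np.c_[xx.ravel(), yy.ravel()]
pts_loop = np.array([[x, y] for x in np.arange(5) for y in np.arange(10, 15)])
print(sorted(map(tuple, pts_mesh)) == sorted(map(tuple, pts_loop)))  # True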