In this tutorial we assume basic familiarity with decision trees, but we will recap some important ideas. Decision trees can be used for both classification and regression. During training, at each node we decide which feature to split on and which value of that feature to use as the splitting threshold. Each leaf then makes a prediction: for regression it is the mean label of all training instances in the leaf; for classification it is the majority vote.
For regression, we choose the feature and splitting value whose combination gives the least mean squared error over the two resulting child nodes.
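To make the splitting criterion concrete, here is a minimal sketch of an exhaustive best-split search over a single feature. The helper best_split and the choice of midpoints as candidate thresholds are our own illustration; real implementations such as sklearn's are heavily optimized.
import numpy as np

def best_split(x, y):
    # Sort by feature value so candidate thresholds are midpoints
    # between consecutive points
    order = np.argsort(x)
    x, y = x[order], y[order]
    best_t, best_mse = None, np.inf
    for t in (x[:-1] + x[1:]) / 2:
        left, right = y[x <= t], y[x > t]
        if len(left) == 0 or len(right) == 0:
            continue
        # Weighted MSE of predicting each child's mean label
        mse = (len(left)*np.var(left) + len(right)*np.var(right)) / len(y)
        if mse < best_mse:
            best_t, best_mse = t, mse
    return best_t, best_mse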
Example: We create a random dataset of $x$ values and generate labels by adding noise to $\sin(x)$: small Gaussian noise on every point, plus larger uniform noise on every fifth point. We then fit trees with max_depth from 1 to 5. We see underfitting for max_depth < 3 and overfitting for max_depth > 3; max_depth=3 gives the best fit on the test set.
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

def make_stepwise(y):
    # Insert NaNs at the jumps so matplotlib breaks the line there,
    # drawing the prediction as disconnected horizontal steps
    for i in range(len(y)-1):
        if y[i] != y[i+1]:
            y[i] = np.nan
    return y
def plot_decision(X, y, X_range, y_range, label='Training Data', color="darkorange"):
    # Copy the predictions so make_stepwise's NaNs don't modify the original
    tmp = make_stepwise(np.copy(y_range))
    plt.figure()
    plt.scatter(X, y, s=25, edgecolor="black", c=color, label=label)
    plt.plot(X_range, tmp, color="cornflowerblue",
             label="Predicted Values", linewidth=2)
    plt.plot(X_range, y_range, color="green",
             label="Decision Boundary", linewidth=0.2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title("Decision Tree Regression")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
def loss(a, b):
    # Mean squared error
    return np.mean((a-b)**2)
def fit_predict_draw(X, y, X_test, y_test, max_depth=2):
    regr = DecisionTreeRegressor(max_depth=max_depth)
    regr.fit(X, y)
    # Predict on a dense grid to draw the learned step function
    X_range = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
    y_range = regr.predict(X_range)
    # Plot the fit against the training and testing data
    plot_decision(X, y, X_range, y_range)
    y_pred = regr.predict(X)
    y_test_pred = regr.predict(X_test)
    plot_decision(X_test, y_test, X_range, y_range, label='Testing Data', color='green')
    print('Training Loss:%.4f\nTesting Loss:%.4f' % (loss(y, y_pred), loss(y_test, y_test_pred)))
    return regr
# Create a random training dataset
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))  # larger uniform noise on every 5th point
y += 0.05 * rng.randn(len(y))       # small Gaussian noise on every point
# Create a random testing dataset from the same distribution as the training dataset
rng = np.random.RandomState(2)
X_test = np.sort(5 * rng.rand(80, 1), axis=0)
y_test = np.sin(X_test).ravel()
y_test[::5] += 3 * (0.5 - rng.rand(16))
y_test += 0.05 * rng.randn(len(y_test))
# Fit regression model
regr=fit_predict_draw(X,y,X_test,y_test, max_depth=1)
Training Loss:0.1949 Testing Loss:0.2028
regr=fit_predict_draw(X,y,X_test,y_test, max_depth=2)
Training Loss:0.1332 Testing Loss:0.1883
best_regr=fit_predict_draw(X,y,X_test,y_test, max_depth=3)
Training Loss:0.0965 Testing Loss:0.1608
regr=fit_predict_draw(X,y,X_test,y_test, max_depth=4)
Training Loss:0.0499 Testing Loss:0.1780
regr=fit_predict_draw(X,y,X_test,y_test, max_depth=5)
Training Loss:0.0246 Testing Loss:0.2573
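Rather than eyeballing the losses above, we could also select max_depth by cross-validation. Here is a minimal sketch using sklearn's GridSearchCV; the 5-fold split and the depth grid are our own choices, not part of the original experiment.
from sklearn.model_selection import GridSearchCV

# Score each candidate depth by 5-fold cross-validated negative MSE
search = GridSearchCV(DecisionTreeRegressor(),
                      param_grid={'max_depth': list(range(1, 6))},
                      scoring='neg_mean_squared_error', cv=5)
search.fit(X, y)
print(search.best_params_)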
Here we visualize the learned decision tree for the best setting, max_depth=3:
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(best_regr, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("data")
graph
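If graphviz is unavailable, sklearn can print the same tree as plain text with export_text; this is an alternative we suggest, not used in the original notebook.
from sklearn.tree import export_text

# Indented if/else rules, one line per node
print(export_text(best_regr, feature_names=['x']))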
So far we have looked at single trees. Next, we train gradient boosted tree ensembles with XGBoost on a subset of the Higgs dataset (28 features, binary label in the first column) and compare against sklearn's GradientBoostingClassifier.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
higgs_df=pd.read_csv('/home/ahsanm1/umair_wlab/jupyter_notebooks/xgboost_example/small_higgs.csv',header=None)
data=np.array(higgs_df)[:,1:]                    # features: columns 1-28
labels=np.array(higgs_df)[:,0].astype(np.int8)   # binary label: column 0
from sklearn.model_selection import train_test_split
train_x, test_x, train_label, test_label = train_test_split(data,labels, test_size=0.2)
train_x.shape,test_x.shape
((800000, 28), (200000, 28))
# Pack the numpy arrays into XGBoost's optimized DMatrix format
xgmat_train=xgb.DMatrix(train_x,label=train_label)
xgmat_test=xgb.DMatrix(test_x,label=test_label)
import time
num_round = 50
# Baseline: depth-6 trees, learning rate 0.3, 16 threads
t=time.time()
param = {'max_depth': 6, 'eta': 0.3, 'objective': 'binary:logistic','nthread':16}
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
# binary:logistic outputs probabilities, so we threshold at 0.5 to get labels
preds=bst.predict(xgmat_train)
pred_labels=(preds>=0.5).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
preds=bst.predict(xgmat_test)
pred_labels=(preds>=0.5).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
26.4411 seconds Training Accuracy: 0.7408 Testing Accuracy: 0.7334
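Since this evaluation block is repeated verbatim for every configuration that follows, a small helper could replace the copy-pasting. The name report and its argument layout are our own sketch, not used in the rest of the notebook.
def report(bst, elapsed):
    print('%.4f seconds' % elapsed)
    for name, xgmat, y in [('Training', xgmat_train, train_label),
                           ('Testing', xgmat_test, test_label)]:
        # Same thresholding as above
        pred_labels = (bst.predict(xgmat) >= 0.5).astype(np.int8)
        print('%s Accuracy: %.4f' % (name, sum(y == pred_labels) / len(y)))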
Next, we randomly subsample 70% of the features at each tree level with colsample_bylevel:
t=time.time()
param = {'max_depth': 6, 'eta': 0.3, 'objective': 'binary:logistic','nthread':16,'colsample_bylevel':0.7}
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
preds=bst.predict(xgmat_train)
pred_labels=(preds>=0.5).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
preds=bst.predict(xgmat_test)
pred_labels=(preds>=0.5).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
22.6471 seconds Training Accuracy: 0.7393 Testing Accuracy: 0.7316
Setting gamma makes splitting more conservative: a node is split only if doing so reduces the loss by at least 0.1.
t=time.time()
param = {'max_depth': 6, 'eta': 0.3, 'objective': 'binary:logistic','nthread':16, 'gamma':0.1}
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
preds=bst.predict(xgmat_train)
pred_labels=(preds>=0.5).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
preds=bst.predict(xgmat_test)
pred_labels=(preds>=0.5).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
25.0765 seconds Training Accuracy: 0.7403 Testing Accuracy: 0.7328
Lowering the learning rate eta from 0.3 to 0.1 makes each boosting step more conservative; with the number of rounds fixed at 50, the model underfits slightly.
t=time.time()
param = {'max_depth': 6, 'eta': 0.1, 'objective': 'binary:logistic','nthread':16}
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
preds=bst.predict(xgmat_train)
pred_labels=(preds>=0.5).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
preds=bst.predict(xgmat_test)
pred_labels=(preds>=0.5).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
25.6166 seconds Training Accuracy: 0.7283 Testing Accuracy: 0.7240
Raising eta to 1 does the opposite: training accuracy improves but testing accuracy drops, a sign of overfitting.
t=time.time()
param = {'max_depth': 6, 'eta': 1, 'objective': 'binary:logistic','nthread':16}
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
preds=bst.predict(xgmat_train)
pred_labels=(preds>=0.5).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
preds=bst.predict(xgmat_test)
pred_labels=(preds>=0.5).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
25.9076 seconds Training Accuracy: 0.7467 Testing Accuracy: 0.7272
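The two eta experiments suggest tuning the number of boosting rounds together with the learning rate. XGBoost supports this directly through early stopping; below is a sketch where the 500-round cap and early_stopping_rounds=10 are our own choices, and in practice the watchlist should use a held-out validation set rather than the test set.
param = {'max_depth': 6, 'eta': 0.1, 'objective': 'binary:logistic', 'nthread': 16}
# Boost up to 500 rounds, stopping once the metric on the last evals
# entry fails to improve for 10 consecutive rounds
bst = xgb.train(param, xgmat_train, num_boost_round=500,
                evals=[(xgmat_train, 'train'), (xgmat_test, 'eval')],
                early_stopping_rounds=10)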
Shallower trees (max_depth=2) train much faster but are less expressive, so both accuracies fall.
t=time.time()
param = {'max_depth': 2, 'eta': 0.3, 'objective': 'binary:logistic','nthread':16}
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
preds=bst.predict(xgmat_train)
pred_labels=(preds>=0.5).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
preds=bst.predict(xgmat_test)
pred_labels=(preds>=0.5).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
6.8508 seconds Training Accuracy: 0.7025 Testing Accuracy: 0.7015
We can also replace the tree booster with a linear one (booster='gblinear'). A sum of linear models is still a linear model, so this effectively fits a single regularized linear model: fast, but much less accurate on this data.
t=time.time()
param = { 'eta': 0.3, 'objective': 'binary:logistic','nthread':16,'booster':'gblinear' }
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
preds=bst.predict(xgmat_train)
pred_labels=(preds>=0.5).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
preds=bst.predict(xgmat_test)
pred_labels=(preds>=0.5).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
2.7138 seconds Training Accuracy: 0.6168 Testing Accuracy: 0.6167
Finally, we check the effect of the thread count by halving nthread to 8; on this machine it happens to run slightly faster than with 16 threads.
t=time.time()
param = {'max_depth': 6, 'eta': 0.3, 'objective': 'binary:logistic','nthread':8}
bst = xgb.train(param, xgmat_train, num_round)
t1=time.time()
print('%.4f seconds' %(t1-t))
21.4985 seconds
For comparison, we train sklearn's GradientBoostingClassifier with matching settings (50 estimators, depth-6 trees). It reaches similar accuracy but, as the timing below shows, takes roughly 27 times longer than XGBoost.
t=time.time()
gbm = GradientBoostingClassifier(n_estimators=50, max_depth=6, verbose=2)
gbm.fit(train_x, train_label)
t1=time.time()
print('%.4f seconds' %(t1-t))
Iter Train Loss Remaining Time
1 1.3482 12.08m
2 1.3194 11.88m
3 1.2955 11.68m
4 1.2749 11.45m
5 1.2575 11.23m
6 1.2417 11.01m
7 1.2285 10.77m
8 1.2159 10.53m
9 1.2059 10.23m
10 1.1955 9.92m
11 1.1868 9.62m
12 1.1796 9.34m
13 1.1721 9.06m
14 1.1652 8.81m
15 1.1598 8.55m
16 1.1535 8.28m
17 1.1478 8.02m
18 1.1422 7.75m
19 1.1372 7.49m
20 1.1328 7.24m
21 1.1287 6.99m
22 1.1245 6.75m
23 1.1211 6.50m
24 1.1176 6.25m
25 1.1146 6.01m
26 1.1117 5.76m
27 1.1092 5.52m
28 1.1066 5.27m
29 1.1038 5.03m
30 1.1014 4.78m
31 1.0990 4.54m
32 1.0971 4.30m
33 1.0951 4.06m
34 1.0932 3.82m
35 1.0911 3.58m
36 1.0895 3.34m
37 1.0879 3.10m
38 1.0864 2.86m
39 1.0848 2.62m
40 1.0834 2.38m
41 1.0820 2.14m
42 1.0809 1.90m
43 1.0798 1.67m
44 1.0779 1.43m
45 1.0767 1.19m
46 1.0757 57.06s
47 1.0747 42.78s
48 1.0735 28.52s
49 1.0724 14.25s
50 1.0714 0.00s
712.9405 seconds
# GradientBoostingClassifier.predict already returns class labels
pred_labels=gbm.predict(train_x).astype(np.int8)
print('Training Accuracy: %.4f' %(sum(train_label==pred_labels)/len(train_label)))
pred_labels=gbm.predict(test_x).astype(np.int8)
print('Testing Accuracy: %.4f' %(sum(test_label==pred_labels)/len(test_label)))
Training Accuracy: 0.7293 Testing Accuracy: 0.7247
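Summarizing all the runs above (numbers copied from the outputs):
Configuration                          Time (s)  Train Acc  Test Acc
XGBoost, depth 6, eta 0.3 (baseline)     26.44     0.7408    0.7334
  + colsample_bylevel=0.7                22.65     0.7393    0.7316
  + gamma=0.1                            25.08     0.7403    0.7328
  eta=0.1                                25.62     0.7283    0.7240
  eta=1                                  25.91     0.7467    0.7272
  max_depth=2                             6.85     0.7025    0.7015
  gblinear booster                        2.71     0.6168    0.6167
sklearn GradientBoostingClassifier      712.94     0.7293    0.7247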