Classification using Decision Trees in R

Posted on Updated on

# Loading the required libraries

# Install each package only when it is not already available, so that
# re-running the script does not download and reinstall it every time.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)
if (!requireNamespace("tree", quietly = TRUE)) {
  install.packages("tree")
}
library(tree)

# attach() places the columns of Carseats on the search path so that they can
# be referred to by bare name (e.g. `Sales`).
# NOTE(review): attach() is generally discouraged because the attached copy
# does not follow later changes to `Carseats`; prefer `Carseats$Sales`.
attach(Carseats)

# Take a first look at the data: first rows, dimensions and the range of Sales
head(Carseats, n = 10)
dim(Carseats)
range(Carseats$Sales)

# Creating a categorical variable for Sales: "Yes" when Sales >= 8, else "No".
# It is built as a factor on purpose: since R 4.0, data.frame() no longer
# converts character columns to factors, and tree() needs a factor response
# to fit a classification tree.
High <- factor(
  ifelse(Carseats$Sales >= 8, "Yes", "No"),
  levels = c("No", "Yes")
)

# Appending this column "High" to the Carseats dataset
Carseats <- data.frame(Carseats, High)
dim(Carseats)

# Remove the Sales column from the dataset (selected by name rather than by
# position), since High is derived directly from it
Carseats <- Carseats[, names(Carseats) != "Sales", drop = FALSE]
dim(Carseats)

# Split the dataset into training and testing halves
set.seed(2)

# Generating the training and testing datasets: draw half of the row indices
# at random for training; the negated indices select all remaining rows.
train <- sample(seq_len(nrow(Carseats)), nrow(Carseats) %/% 2)
test <- -train
training_data <- Carseats[train, ]
testing_data <- Carseats[test, ]

# Actual labels of the testing rows, kept to compare against our predictions.
# Taken from the testing rows themselves so labels and rows cannot drift apart.
testing_High <- testing_data$High

# Grow the full (unpruned) tree on the training data, using every other
# column as a predictor of High
tree_model <- tree(High ~ ., data = training_data)

# Draw the tree, then label its nodes; pretty = 0 is passed through to text()
# as in the original call
plot(tree_model)
text(tree_model, pretty = 0)

# [Figure: Decision_Trees_1]

# Evaluate how the full tree performs on the held-out testing data.

# Predict on the testing data; type = "class" requests a predicted class
# for each row.
tree_pred <- predict(tree_model, newdata = testing_data, type = "class")

# Misclassification error: the share of testing rows whose predicted class
# differs from the actual one
mean(tree_pred != testing_High)
# 0.295 - a 29.5% error rate is high; we will try to reduce it

# Next we prune the tree to try to reduce the misclassification error.
# Cross-validation tells us at which tree size to stop pruning.
set.seed(3)

# Cross-validate the tree, pruning by misclassification
cv_tree <- cv.tree(tree_model, FUN = prune.misclass)
names(cv_tree)

# Plot tree size against `dev`.
# NOTE(review): with FUN = prune.misclass, `dev` is read here as the
# cross-validated error measure rather than a true deviance - confirm against
# the tree package documentation.
plot(cv_tree$size, cv_tree$dev, type = "b")

# [Figure: Decision_Trees_2]

# The cross-validation plot shows the minimum error at tree size 9, so we
# prune the full tree down to that size.
# NOTE(review): 9 is hard-coded from the plot; re-check it if the seed, data
# or R version changes.
pruned_model <- prune.misclass(tree_model, best = 9)

# Draw the pruned tree and label its nodes
plot(pruned_model)
text(pruned_model, pretty = 0)

# [Figure: Decision_Trees_3]

# Check how the pruned model performs on the testing data
tree_pred <- predict(pruned_model, newdata = testing_data, type = "class")

# Misclassification error of the pruned tree against the actual labels
mean(tree_pred != testing_High)
# [1] 0.29 - we have reduced the misclassification rate by pruning our tree
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s