R

Classification using Decision Trees in R

Posted on [date]; updated on [date].

# Load the required libraries, installing them only when missing
# (unconditional install.packages() in a script re-downloads on every run)
if (!requireNamespace("ISLR", quietly = TRUE)) install.packages("ISLR")
library(ISLR)
if (!requireNamespace("tree", quietly = TRUE)) install.packages("tree")
library(tree)

# Inspect the Carseats dataset (avoid attach(); reference columns explicitly)
head(Carseats, n = 10)
dim(Carseats)
range(Carseats$Sales)

# Create a categorical response: "Yes" when Sales >= 8, "No" otherwise.
# A factor (not character) is what tree() expects for classification.
High <- factor(ifelse(Carseats$Sales >= 8, "Yes", "No"))
# Append this column "High" to the Carseats dataset
Carseats <- data.frame(Carseats, High)
dim(Carseats)
# Remove the Sales column by name (safer than the positional index -1)
Carseats <- Carseats[, names(Carseats) != "Sales"]
dim(Carseats)

# Split the dataset into training and testing halves
set.seed(2)

# Sample half of the row indices for training; negate them for testing
train <- sample(seq_len(nrow(Carseats)), nrow(Carseats) / 2)
test <- -train
training_data <- Carseats[train, ]
testing_data <- Carseats[test, ]

# Keep the true labels of the held-out rows to score predictions against
testing_High <- High[test]

# Fit the tree model (full, unpruned) using the training data
tree_model <- tree(High ~ ., training_data)

# Plot the tree skeleton and label the splits
# (pretty = 0 keeps full factor level names instead of abbreviations)
plot(tree_model)
text(tree_model, pretty = 0)

[Figure: Decision_Trees_1 — the full, unpruned classification tree]

# Evaluate how the model performs on the held-out testing data

# Predict class labels ("Yes"/"No") for the test rows;
# type = "class" returns labels rather than class probabilities
tree_pred <- predict(tree_model, testing_data, type = "class")
# Misclassification error: proportion of test labels the tree got wrong
mean(tree_pred != testing_High)
# 0.295 - a 29.5% error rate is high; pruning should reduce it

# Now we can prune the tree to reduce the misclassification error.
# Cross-validation tells us at what tree size to stop pruning.
set.seed(3)
# Generate a cross-validated tree, scored by misclassification rate
cv_tree <- cv.tree(tree_model, FUN = prune.misclass)
names(cv_tree)

# Plot the size of the tree versus the deviance (here, the CV error rate)
plot(cv_tree$size, cv_tree$dev, type = "b")

[Figure: Decision_Trees_2 — cross-validated error rate versus tree size]

# The plot above shows where the CV error rate bottoms out (size 9 in the
# book's run). Derive the best size programmatically instead of hard-coding
# it, so the code still works if a different seed shifts the minimum.
best_size <- cv_tree$size[which.min(cv_tree$dev)]
pruned_model <- prune.misclass(tree_model, best = best_size)
plot(pruned_model)
text(pruned_model, pretty = 0)

[Figure: Decision_Trees_3 — the pruned classification tree]

# Check how the pruned model performs on the same test set
tree_pred <- predict(pruned_model, testing_data, type = "class")
# Misclassification error of the pruned tree
mean(tree_pred != testing_High)
# [1] 0.29 - pruning our tree reduced the misclassification rate
Advertisements

Visualizing data using Box Plots in R

Posted on [date]; updated on [date].

R provides a quick way of performing exploratory analysis of your data using box plots.

# Simulate 100 readings for four stations ("Stat1".."Stat4") at three time
# points (suffixes 1-3). A fixed seed makes the example reproducible.
set.seed(1)
data <- data.frame(
  Stat11 = rnorm(100, mean = 3,   sd = 2),
  Stat21 = rnorm(100, mean = 4,   sd = 1),
  Stat31 = rnorm(100, mean = 6,   sd = 0.5),
  Stat41 = rnorm(100, mean = 10,  sd = 0.5),
  Stat12 = rnorm(100, mean = 4,   sd = 2),
  Stat22 = rnorm(100, mean = 4.5, sd = 2),
  Stat32 = rnorm(100, mean = 7,   sd = 0.5),
  Stat42 = rnorm(100, mean = 8,   sd = 3),
  Stat13 = rnorm(100, mean = 6,   sd = 0.5),
  Stat23 = rnorm(100, mean = 5,   sd = 3),
  Stat33 = rnorm(100, mean = 8,   sd = 0.2),
  Stat43 = rnorm(100, mean = 4,   sd = 4)
)
# data is already a data.frame; df kept only for backward compatibility
df <- data

[Figure: Box_Plots_1 — default boxplot of the simulated data]

boxplot(data, las = 2, names = c("Station 1","Station 2","Station 3","Station 4","Station 1","Station 2","Station 3","Station 4","Station 1","Station 2","Station 3","Station 4"))

[Figure: Box_Plots_2 — boxplot with rotated station labels]

boxplot(data, ylab ="APR (%)", xlab ="Time", las = 2, names = c("Station 1","Station 2","Station 3","Station 4","Station 1","Station 2","Station 3","Station 4","Station 1","Station 2","Station 3","Station 4"))

[Figure: Box_Plots_3 — labeled boxplot with axis titles]