# R

### Classification using Decision Trees in R

# Load the required libraries. Packages are installed only when missing, so
# re-running the script does not reinstall them every time.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
if (!requireNamespace("tree", quietly = TRUE)) {
  install.packages("tree")
}
library(ISLR)
library(tree)

# Quick look at the Carseats data. Columns are referenced explicitly
# (Carseats$Sales) rather than through attach(), which would leave a stale
# copy of the columns on the search path once Carseats is modified below.
head(Carseats, n = 10)
dim(Carseats)
range(Carseats$Sales)

# Create a categorical response from Sales: "Yes" when Sales >= 8, else "No".
# It must be a factor: tree() fits a classification tree only for a factor
# response, and since R 4.0 data.frame() no longer converts character
# columns to factors automatically.
High <- factor(ifelse(Carseats$Sales >= 8, "Yes", "No"))

# Append the "High" column to the Carseats dataset
Carseats <- data.frame(Carseats, High)
dim(Carseats)

# Remove the Sales column (selected by name, not by position), otherwise the
# tree would simply split on the variable that High was derived from.
Carseats <- Carseats[, names(Carseats) != "Sales", drop = FALSE]
dim(Carseats)

# Split the dataset into training and testing halves (seeded so the split is
# reproducible).
set.seed(2)
train <- sample(seq_len(nrow(Carseats)), nrow(Carseats) %/% 2)
test <- -train
training_data <- Carseats[train, ]
testing_data <- Carseats[test, ]

# Actual labels of the testing rows, kept to compare against predictions
testing_High <- testing_data$High

# Fit the tree model (full model) using the training data
tree_model <- tree(High ~ ., data = training_data)
plot(tree_model)
text(tree_model, pretty = 0)

# Evaluate the full tree on the testing data. type = "class" asks predict()
# for class predictions.
tree_pred <- predict(tree_model, newdata = testing_data, type = "class")

# Misclassification error: the share of testing rows whose predicted label
# differs from the actual one. The original write-up reports 0.295 (29.5%)
# here, which it considers high enough to be worth reducing.
mean(tree_pred != testing_High)

# Pruning the tree is the next step to reduce the misclassification error.
# Cross-validation is used to decide at what size to stop pruning.
set.seed(3)

# Cross-validate the tree, pruning by misclassification
cv_tree <- cv.tree(tree_model, FUN = prune.misclass)
names(cv_tree)

# Plot tree size against deviance (the error measure)
plot(x = cv_tree$size, y = cv_tree$dev, type = "b")

# The cross-validation plot is read as having its minimum error rate at tree
# size 9, so let's create a model pruned to that size and draw it.
pruned_model <- prune.misclass(tree_model, best = 9)
plot(pruned_model)
text(pruned_model, pretty = 0)

# Check how the pruned model performs on the testing data
tree_pred <- predict(pruned_model, newdata = testing_data, type = "class")

# Share of testing rows whose predicted label differs from the actual one.
# The original write-up reports 0.29 here, i.e. pruning the tree reduced the
# misclassification rate.
mean(tree_pred != testing_High)

Advertisements

### Visualizing data using Box Plots in R

R provides a quick way to perform exploratory analysis of your data using box plots.

# Simulated readings: 100 normal draws for each of 12 columns named
# Stat<a><b>, where <a> cycles 1..4 fastest and <b> runs 1..3. The draws are
# made column by column in the order listed, so the random-number stream is
# consumed exactly as if each rnorm() call were written out by hand.
data <- local({
  col_means <- c(3, 4, 6, 10, 4, 4.5, 7, 8, 6, 5, 8, 4)
  col_sds <- c(2, 1, 0.5, 0.5, 2, 2, 0.5, 3, 0.5, 3, 0.2, 4)
  col_names <- paste0("Stat", rep(1:4, times = 3), rep(1:3, each = 4))

  draws <- Map(
    function(m, s) rnorm(100, mean = m, sd = s),
    col_means,
    col_sds
  )
  names(draws) <- col_names

  do.call(data.frame, draws)
})

# Second data frame built from the first
df <- data.frame(data)

# One box per column of `data`. The twelve boxes are labelled
# "Station 1".."Station 4", repeated three times; las = 2 sets the
# axis-label orientation.
boxplot(
  data,
  las = 2,
  names = rep(paste("Station", 1:4), times = 3)
)

# Same box plot with axis titles added: "APR (%)" on the y axis and "Time" on
# the x axis. The twelve boxes are labelled "Station 1".."Station 4",
# repeated three times.
boxplot(
  data,
  ylab = "APR (%)",
  xlab = "Time",
  las = 2,
  names = rep(paste("Station", 1:4), times = 3)
)