# Month: August 2014

### Guided Selling using R

# Guided Selling: item-based collaborative filtering on the Last.FM
# Germany frequency matrix (rows = users, columns = artists).
# NOTE(review): this chunk was garbled by extraction -- the assignments
# below (read.csv, the getCosine definition, the placeholder matrices)
# were lost and have been reconstructed; confirm the file names against
# the original post.

# Inspect and set the current working directory
getwd()
setwd("D:/R")  # NOTE(review): hard-coded path; adjust to your machine
getwd()

# Read data from the Last.FM frequency matrix
data.germany <- read.csv(file = "lastfm-matrix-germany.csv")
head(data.germany[, c(1, 3:8)])

# Drop the user column and make a new data frame of items only
data.germany.ibs <- data.germany[, !(names(data.germany) %in% c("user"))]

# Helper function: cosine similarity between two vectors
getCosine <- function(x, y) {
  this.cosine <- sum(x * y) / (sqrt(sum(x * x)) * sqrt(sum(y * y)))
  return(this.cosine)
}

# Create a placeholder listing item vs. item similarities
data.germany.ibs.similarity <- matrix(
  NA,
  nrow = ncol(data.germany.ibs),
  ncol = ncol(data.germany.ibs),
  dimnames = list(colnames(data.germany.ibs), colnames(data.germany.ibs))
)

# Fill in those empty spaces with cosine similarities
for (i in seq_len(ncol(data.germany.ibs))) {
  for (j in seq_len(ncol(data.germany.ibs))) {
    data.germany.ibs.similarity[i, j] <-
      getCosine(data.germany.ibs[i], data.germany.ibs[j])
  }
}
data.germany.ibs.similarity <- as.data.frame(data.germany.ibs.similarity)

# Output similarity results to a file
write.csv(data.germany.ibs.similarity, file = "final-germany-similarity.csv")

# Get the top 10 neighbours for each item
# (11 columns: column 1 is the item itself, i.e. similarity 1 to itself)
data.germany.neighbours <- matrix(
  NA,
  nrow = ncol(data.germany.ibs.similarity),
  ncol = 11,
  dimnames = list(colnames(data.germany.ibs.similarity))
)
for (i in seq_len(ncol(data.germany.ibs))) {
  data.germany.neighbours[i, ] <- t(head(
    n = 11,
    rownames(data.germany.ibs.similarity[
      order(data.germany.ibs.similarity[, i], decreasing = TRUE), ][i])
  ))
}

# Output neighbour results to a file (drop the self-match in column 1)
write.csv(file = "final-germany-item-neighbours.csv",
          x = data.germany.neighbours[, -1])

### Cluster Analysis using R

# Hierarchical cluster analysis of European countries.
# NOTE(review): the data-loading assignment was lost in extraction;
# reconstructed as a CSV read -- confirm the source file name.
europe <- read.csv("europe.csv", header = TRUE)

# View the first 10 rows of data
head(europe, n = 10)

# Perform the cluster analysis on every column except the country name
# (column 1), using Euclidean distances and complete linkage (defaults)
euroclust <- hclust(dist(europe[-1]))

# Plot the dendrogram, labelling the leaves with the country names
plot(euroclust, labels = europe$Country)

# Add rectangles to identify the five clusters
rect.hclust(euroclust, 5)

### Classification using Decision Trees in R

# Classification using decision trees on the Carseats data (ISLR).
# Loading the required libraries
install.packages("ISLR")
library(ISLR)
install.packages("tree")
library(tree)

# attach() is generally discouraged (it can mask other objects), but it
# is kept here so that Sales resolves exactly as in the original post.
attach(Carseats)
head(Carseats, n = 10)
dim(Carseats)
range(Sales)

# Create a categorical target: High = "Yes" when Sales >= 8
High <- ifelse(Sales >= 8, "Yes", "No")

# Append the "High" column to the Carseats data set
Carseats <- data.frame(Carseats, High)
dim(Carseats)

# Remove the Sales column from the data set (it would leak the target)
Carseats <- Carseats[, -1]
dim(Carseats)

# Split the data set into training and testing halves, reproducibly
set.seed(2)
train <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
test <- -train
training_data <- Carseats[train, ]
testing_data <- Carseats[test, ]

# Held-out labels, used later to compare predictions with actual data
testing_High <- High[test]

# Fit the tree model (full model) using the training data
tree_model <- tree(High ~ ., training_data)
plot(tree_model)
text(tree_model, pretty = 0)

# Evaluate how the full model performs on the testing data: predict
# class labels (type = "class") with the tree model.
tree_pred <- predict(tree_model, testing_data, type = "class")

# Misclassification error: ~0.295 (29.5%) -- a high number, which we
# will try to reduce by pruning the tree.
mean(tree_pred != testing_High)

# Cross-validate to decide at what size we should stop pruning
set.seed(3)
cv_tree <- cv.tree(tree_model, FUN = prune.misclass)
names(cv_tree)

# Plot the size of the tree versus the deviance (i.e. the error rate)
plot(cv_tree$size, cv_tree$dev, type = "b")

# The minimum error rate occurs at tree size 9 (from the CV plot),
# so prune the tree back to 9 terminal nodes.
pruned_model <- prune.misclass(tree_model, best = 9)
plot(pruned_model)
text(pruned_model, pretty = 0)

# Check how the pruned model performs on the held-out data
tree_pred <- predict(pruned_model, testing_data, type = "class")

# Misclassification rate: ~0.29 -- pruning our tree reduced the error
mean(tree_pred != testing_High)

### Market Basket Analysis and Association Rules using R

Market basket analysis provides great insights into the purchasing behaviors of customers. Based on customer purchase data and association rules, we arrive at groups of related products that people typically buy together.

# Load the required libraries for association-rule mining
install.packages("arules")
library(arules)
library(datasets)

# Load the transactions data set.
# NOTE(review): the assignment was lost in extraction; the arules
# Groceries transactions are the usual choice in this tutorial family --
# confirm against the original post.
data("Groceries")
myData <- Groceries

# Mine association rules with support 0.001 and confidence 0.7.
# NOTE(review): the apriori() call was lost in extraction; reconstructed
# from the thresholds named in the original comment.
rules <- apriori(myData,
                 parameter = list(supp = 0.001, conf = 0.7))

# Order the rules by confidence, highest first, and show the top ten
# with numeric output trimmed to two significant digits.
rules <- sort(rules, decreasing = TRUE, by = "confidence")
options(digits = 2)
inspect(rules[1:10])

# Visualizing the results
install.packages("igraph")
install.packages("arulesViz")
library(arulesViz)
library(tcltk)

# NOTE(review): the definition of rulesImp and its plotting call were
# lost in extraction; reconstructed as "plot the ten highest-confidence
# rules as a graph" -- confirm against the original post.
rulesImp <- head(sort(rules, by = "confidence"), 10)
plot(rulesImp, method = "graph", interactive = TRUE)

References:

1. Arules Package: http://cran.at.r-project.org/web/packages/arules/arules.pdf

2. ArulesViz Package: http://cran.r-project.org/web/packages/arulesViz/vignettes/arulesViz.pdf

### Using Spotfire for Predictive Analytics (Regression Modeling)

We are building a model using Linear Regression to forecast sales

`Sales` ~ `Order Quantity` + `Discount` + `Shipping Cost` + `Profit` + `Unit Price` + `Product Base Margin`

This is the model with “Sales” as the Response variable and all the subsequent columns after the “~” considered as Predictor variables.

Let us click on “OK” to examine the results for the model

In the Model Summary pane, we can check the summary metrics:

Residual standard error: 1421 on 8329 degrees of freedom (63 observations deleted due to missingness). Multiple R-squared: 0.8421, Adjusted R-squared: 0.842. F-statistic: 7406 on 6 and 8329 DF, p-value: 0.

Below is the significance of the model parameters:

Residual Standard Error: A lower value indicates the model is better fit for our data.

Adjusted R-Squared: This is a commonly used measure of fit of a regression equation. It penalizes the addition of too many variables while rewarding a good fit of the regression equation. A higher Adjusted R-Squared value represents the model to be a better fit.

p-value: Predictors with this value closer to zero are better contributing to the model

Some other factors that influence our model are collinearity and multicollinearity; the Variance Inflation Factor (VIF), AIC, and BIC values can help assess our model.

Collinearity is a case of an independent variable being a linear function of another. And in multicollinearity, a variable is a linear function of two or more variables. These issues can increase the likelihood of making false conclusions from our estimates.

High VIF means that multicollinearity significantly impacts the equation whereas lower AIC and BIC are better.

The Table of Coefficients will have various p-values for various predictors (also called Regressors). Lower p-values will give the significance of each predictor in the model

If there are patterns in the “Residuals vs. Fitted” plot, then the current model could be improved.

A simple horizontal bar signifying the relative importance of each predictor used in the model. Discount is the least important predictor.

If the normal QQ plot closely approximates to the line y=x, then the model fits the data well.

In the above plot, the larger values represent points (data points) which are more influential and have to be further investigated.

Depending on these various factors, the model has to go through a series of investigative steps till a satisfactory level of fit is reached.

In addition to the knowledge of statistics, domain specific understanding is also quite crucial in assessing the inputs and the results. For example when analyzing sales, we examine specific types of sales broken into tiers depending on various criteria such as quarter of the year, geographic factors, economic indicators, seasonal influences etc.

We can exclude the outliers which will skew our results. Further, appropriate weights could be distributed on each input parameter to identify whether the specific type of sale is profitable to our business.

### Visualizing data using Box Plots in R

R provides a quick way of performing exploratory analysis of your data using boxplots.

# Simulate 100 readings for four stations across three time periods;
# column StatSP holds station S in period P (e.g. Stat32 = station 3,
# period 2). Means/SDs differ per column so the boxplots are distinct.
data <- data.frame(
  Stat11 = rnorm(100, mean = 3,   sd = 2),
  Stat21 = rnorm(100, mean = 4,   sd = 1),
  Stat31 = rnorm(100, mean = 6,   sd = 0.5),
  Stat41 = rnorm(100, mean = 10,  sd = 0.5),
  Stat12 = rnorm(100, mean = 4,   sd = 2),
  Stat22 = rnorm(100, mean = 4.5, sd = 2),
  Stat32 = rnorm(100, mean = 7,   sd = 0.5),
  Stat42 = rnorm(100, mean = 8,   sd = 3),
  Stat13 = rnorm(100, mean = 6,   sd = 0.5),
  Stat23 = rnorm(100, mean = 5,   sd = 3),
  Stat33 = rnorm(100, mean = 8,   sd = 0.2),
  Stat43 = rnorm(100, mean = 4,   sd = 4)
)
df <- data.frame(data)

# Side-by-side boxplots of all twelve columns; las = 2 rotates the axis
# labels, and the four station names repeat once per time period.
station_labels <- rep(paste("Station", 1:4), times = 3)
boxplot(data, las = 2, names = station_labels)

# Same boxplots as above, now with axis titles: APR (%) on the y axis
# and Time on the x axis.
station_labels <- rep(paste("Station", 1:4), times = 3)
boxplot(data,
        ylab = "APR (%)",
        xlab = "Time",
        las = 2,
        names = station_labels)