Adventures with R - Cricket Analysis (Clustering players based on ODI data)

Continuing on the Cricket Analysis Series

I wanted to take a deep dive into clustering. I already have the ODI database, and I thought it would be instructive to put the data to use.

The main guide I used can be found HERE
This is an excellent guide to cluster analysis in R and I highly recommend it

The main code can be found HERE
The final output file can be found HERE

The final Tableau public dashboard can be found HERE


The code walkthrough is as follows:

## Different packages that you need
library(mclust)
library(tidyverse)
library(cluster)
library(factoextra)
library(data.table)
library(reshape2)
library(sqldf)

## Load the raw ODI data, aggregate it to one row per player, filter out
## low-scoring players and standardize the numeric columns.

# NOTE(review): setwd() ties the script to one machine. It is kept because the
# rest of the script reads and writes files relative to this directory.
setwd('C:/Training/R/CricketAnalysis/')

myData <- read.csv("ODIData.csv")

## Career totals per player. SR is the strike rate (runs per 100 balls faced).
## The 100.0 literal forces floating-point division: SQLite truncates
## integer / integer, so with integer Runs and BF columns the original
## 100 * sum(Runs) / sum(BF) would silently drop the decimals of SR.
myData <- sqldf("select Player, sum(Runs) Runs, sum(Mins) Mins, sum(BF) BF, sum(Fours) Fours, sum(Sixes) Sixes, 
                  ((100.0 * sum(Runs))/sum(BF)) SR from myData
                group by Player")

## Removing players that score less than the median runs
# NOTE(review): 116 is presumably the median of Runs at the time of writing;
# re-derive it (e.g. median(myData$Runs)) if ODIData.csv changes.
min_runs <- 116L
myData <- sqldf(sprintf("Select * from myData where Runs > %d", min_runs))

## Data has to be standardized for Cluster analysis
# Column 1 (Player) is the identifier and is left untouched; every other
# column is centred and scaled to unit variance.
df <- myData
df[, -1] <- scale(myData[, -1])


## Looked at different clustering techniques
# Kmeans Clustering
# Finding optimal number of clusters using the silhouette method

# Average silhouette width of a k-means solution with k clusters.
#
# Reads the global data frame `df`. Its `Player` column is an identifier, not
# a measurement, so it is dropped before clustering; otherwise the unscaled
# ID would take part in (and can dominate) every distance computation.
#
# k: number of clusters (>= 2).
# Returns a single number: the mean silhouette width over all rows.
avg_sil <- function(k) {
  features <- df[, names(df) != "Player", drop = FALSE]
  km.res <- kmeans(features, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(features))
  # Column 3 of a silhouette object holds the per-observation width
  mean(ss[, 3])
}

# Compute and plot the average silhouette width for k = 2 to k = 15
k.values <- 2:15

# kmeans() uses random starting centres; fix the seed so the curve is
# reproducible (same seed as the rest of the script).
set.seed(123)

# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)

plot(k.values, avg_sil_values,
     type = "b", pch = 19, frame = FALSE, 
     xlab = "Number of clusters K",
     ylab = "Average Silhouettes")

# Same diagnostic via factoextra. Only the scaled numeric columns are passed:
# Player is an identifier and must not be treated as a feature.
fviz_nbclust(df[, names(df) != "Player", drop = FALSE],
             kmeans, method = "silhouette")

## Another way of determining number of clusters is the Gap statistic method
# Cluster on the scaled numeric columns only. df keeps Player in its first
# column (needed later to join the results), but an identifier is not a
# feature and would distort every distance if it were left in.
features <- df[, names(df) != "Player", drop = FALSE]

set.seed(123)
gap_stat <- clusGap(features, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 500)
plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
abline(v = 3, lty = 2)


## Look at the plot, this one provides the ideal value for Kmeans as 3 clusters in both the methods
# NOTE(review): k = 3 was read off the plots by eye; re-inspect the silhouette
# and gap plots whenever the data or the feature set changes.
# Compute kmeans with clustering k = 3
set.seed(123)
km.res <- kmeans(features, 3, nstart = 25)

# Build the cluster plot once, show it, and reuse it for the coordinates
y <- fviz_cluster(km.res, data = features)
print(y)

# Output Results for future work
# X / Y are the plot coordinates of each player, CLUST the assigned cluster
km_final <- df
km_final$X <- y$data$x
km_final$Y <- y$data$y
km_final$CLUST <- km.res$cluster

## Partitioning around Medoids - PAM Approach

# Cluster on the scaled numeric columns only; Player is an identifier kept in
# df for the later join and must not be used as a feature.
features <- df[, names(df) != "Player", drop = FALSE]

## PAM - Optimal number of clusters
fviz_nbclust(features, pam, method = "silhouette")

# Finding optimal clusters in PAM using the Gap Statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = pam,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

## Check the plot - it shows ideal is 3 clusters
# NOTE(review): 3 was read off the plots by eye; re-check if the data or the
# feature set changes.
pam.res <- pam(features, 3)

# Build the cluster plot once, show it, and reuse it for the coordinates
y <- fviz_cluster(pam.res,
                  palette = c("#00AFBB", "#FC4E07", "#CF4420"),
                  ellipse.type = "t",
                  repel = TRUE,
                  ggtheme = theme_classic()
)
print(y)

# CLUST is the assigned cluster, X / Y the plot coordinates of each player
pam_final <- df
pam_final$CLUST <- pam.res$clustering
pam_final$X <- y$data$x
pam_final$Y <- y$data$y


# CLARA - Clustering Large Applications

# Cluster on the scaled numeric columns only; Player is an identifier kept in
# df for the later join and must not be used as a feature.
features <- df[, names(df) != "Player", drop = FALSE]

fviz_nbclust(features, clara, method = "silhouette") +
  theme_classic()

## Another way of determining number of clusters is the Gap statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = clara,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

# NOTE(review): k = 3 mirrors the other methods; confirm against the two
# diagnostic plots above.
clara.res <- clara(features, 3, samples = 50, pamLike = TRUE)

# The plot object is only used as a source of per-player coordinates here
y <- fviz_cluster(clara.res, palette = c("#00AFBB", "#FC4E07", "#CF4420"), ellipse.type = "t",
                  geom = "point", pointsize = 1, ggtheme = theme_classic())

# CLUST is the assigned cluster, X / Y the plot coordinates of each player
clara_final <- df
clara_final$CLUST <- clara.res$clustering
clara_final$X <- y$data$x
clara_final$Y <- y$data$y




## Another way of creating optimal clusters without providing number of clusters is the mclust package
## This package is based on the concept Model based Clustering

# Fit on the scaled numeric columns only; Player is an identifier and must not
# be modelled as a feature.
features <- df[, names(df) != "Player", drop = FALSE]
myMclust <- Mclust(features)

# The BIC summary has to be given the same data the model was fitted on
# (previously the unscaled myData, including the Player column, was passed).
mySummary <- summary(myMclust$BIC, data = features)

# Attach the model-based cluster label and its classification uncertainty to
# the unscaled per-player table.
myData$MB_Clust <- myMclust$classification
myData$Uncertainty <- myMclust$uncertainty



# Visualizing cluster results
#
# Plot one view of a model-based clustering fit.
#
# object:        the fitted clustering object, forwarded to fviz_cluster() or
#                fviz_mclust_bic().
# what:          which view to draw: "classification" (default),
#                "uncertainty" or "BIC".
# ellipse.type,
# ellipse.level: forwarded to fviz_cluster() (not used for "BIC").
# ggtheme:       theme applied to the plot. It is now honoured; previously
#                theme_classic() was hard-coded in every branch, so a
#                caller-supplied theme was silently ignored.
# ...:           further arguments forwarded to fviz_cluster() /
#                fviz_mclust_bic().
#
# Returns the plot object built for the requested view.
fviz_mclust <- function(object,
                        what = c("classification", "uncertainty", "BIC"),
                        ellipse.type = "norm", ellipse.level = 0.4,
                        ggtheme = theme_classic(), ...) {
  # Local bindings for the bare names used inside aes() below
  uncertainty <- cluster <- NULL
  what <- match.arg(what)

  switch(what,
    classification =
      fviz_cluster(object, ellipse.type = ellipse.type,
                   ellipse.level = ellipse.level,
                   ggtheme = ggtheme, ...) +
        labs(subtitle = "Classification"),
    uncertainty =
      # geom = "none" suppresses the default points so they can be redrawn
      # with their size mapped to the classification uncertainty
      fviz_cluster(object, ellipse.type = ellipse.type,
                   ellipse.level = ellipse.level,
                   ggtheme = ggtheme, geom = "none", ...) +
        geom_point(aes(size = uncertainty, color = cluster)) +
        scale_size(range = c(0, 2)) +
        labs(subtitle = "Uncertainty") +
        # "none" replaces the deprecated guides(size = FALSE)
        guides(size = "none"),
    BIC = fviz_mclust_bic(object, ggtheme = ggtheme, ...)
  )
}

# Build the classification plot once, display it, and reuse the same object as
# the source of coordinates (the original built the identical plot twice).
# print() also makes the plot render when the script is run via source().
y <- fviz_mclust(myMclust, "classification", geom = "point")
print(y)

# Assigning x and y coordinates to the original data set
myData$X <- y$data$x
myData$Y <- y$data$y

## Merge all the clustering products together to create one final combined set
# One row per player: the unscaled stats plus the mclust columns from myData,
# the cluster label and plot coordinates from each of the three partitioning
# methods, and the player's name looked up from PlayerTable.csv.
# (An earlier version of this query without the name lookup was run first and
# then immediately overwritten; only the final query is kept.)
PlayerTable <- read.csv("PlayerTable.csv")

final <- sqldf("Select 
               PlayerTable.PlayerName, 
               myData.*, 
               km_final.CLUST as Clust_KM, km_final.X as X_KM, km_final.Y as Y_KM,
               pam_final.CLUST as Clust_PAM, pam_final.X as X_PAM, pam_final.Y as Y_PAM,
               clara_final.CLUST as Clust_clara, clara_final.X as X_Clara, clara_final.Y as Y_Clara
               from myData 
               left join km_final on myData.Player = km_final.Player
               left join pam_final on myData.Player = pam_final.Player
               left join clara_final on myData.Player = clara_final.Player
               left join PlayerTable on myData.Player  = PlayerTable.PlayerID
               ")

# Text fields are quoted (the write.csv default) so that a player name
# containing a comma cannot shift the columns of the exported file.
write.csv(final,
          file = "FinalClusterOutput.csv",
          row.names = FALSE)

Comments

Popular posts from this blog

Outsourcing - the new wave !!

The Dragon vs the Tiger

Dekh Sako to Dekh Lo, Lekin Hathoda maare bina