Adventures with R - Cricket Analysis (Clustering players based on ODI data)
Continuing on the Cricket Analysis Series
I wanted to take a deep dive into Clustering. I have the ODI database and I thought it would be instructive to put the data to use
The main guide I used can be found HERE
This is an excellent guide to cluster analysis in R and I highly recommend it
The main code can be found HERE
The final output file can be found HERE
The final Tableau public dashboard can be found HERE
The code walkthrough is as follows:
## Different packages that you need
library(mclust)
library(tidyverse)
library(cluster)
library(factoextra)
library(data.table)
library(reshape2)
library(sqldf)
# NOTE(review): setwd() ties the script to one machine; it is kept so the
# relative CSV paths used by this script keep resolving as before.
setwd('C:/Training/R/CricketAnalysis/')
myData <- read.csv("ODIData.csv")
## Collapse the raw rows to one row per Player with summed batting totals.
## SR is runs per 100 balls faced. The 100.0 literal forces floating-point
## arithmetic: SQLite truncates integer / integer, which would silently round
## every strike rate down to a whole number.
myData <- sqldf("select Player, sum(Runs) Runs, sum(Mins) Mins, sum(BF) BF, sum(Fours) Fours, sum(Sixes) Sixes,
(100.0 * sum(Runs)) / sum(BF) SR from myData
group by Player")
## Removing players that score less than the median runs
## NOTE(review): 116 is hard-coded (presumably the median of Runs when this
## was written - confirm); the strict '>' also drops players at exactly 116.
myData <- sqldf("Select * from myData where Runs > 116")
## Data has to be standardized for Cluster analysis
## Column 1 (Player) is the identifier and is left untouched; every other
## column is centred and scaled.
df <- myData
df[, -c(1)] <- scale(myData[, -c(1)])
## Looked at different clustering techniques
# Kmeans Clustering
#Finding optimal number of clusters using the silhouette method
# Average silhouette width of a k-means solution with k clusters.
#
# k: number of clusters to fit.
# Reads the global `df`; column 1 (Player) is an unscaled identifier, so it is
# dropped before clustering - otherwise it would dominate the distances.
# Returns the mean of the silhouette widths (column 3 of the silhouette matrix).
avg_sil <- function(k) {
  features <- df[, -1, drop = FALSE]
  km.res <- kmeans(features, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(features))
  mean(ss[, 3])
}
# Compute and plot the average silhouette width for k = 2 to k = 15
k.values <- 2:15
# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouettes")
# Cluster on the scaled measures only: column 1 of df is the Player
# identifier, which is not a feature and is not scaled.
features <- df[, -1, drop = FALSE]
fviz_nbclust(features, kmeans, method = "silhouette")
## Another way of determining number of clusters is the Gap statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 500)
plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
# Dashed reference line at the chosen k = 3
abline(v = 3, lty = 2)
## Look at the plot, this one provides the ideal value for Kmeans as 3 clusters in both the methods
# Compute kmeans with clustering k = 3
# Column 1 of df is the Player identifier (unscaled, not a feature), so it is
# excluded from the clustering input and from the plot data.
features <- df[, -1, drop = FALSE]
set.seed(123)
km.res <- kmeans(features, 3, nstart = 25)
# Plot the viz (built once, then reused for the exported coordinates)
y <- fviz_cluster(km.res, data = features)
print(y)
# Output Results for future work
# km_final keeps Player so it can be joined back by player later on
km_final <- df
km_final$X <- y$data$x
km_final$Y <- y$data$y
km_final$CLUST <- km.res$cluster
## Partitioning around Medoids - PAM Approach
# Column 1 of df is the Player identifier (unscaled, not a feature); cluster on
# the scaled measures only.
features <- df[, -1, drop = FALSE]
## PAM - Optimal number of clusters
fviz_nbclust(features, pam, method = "silhouette")
# Finding optimal clusters in PAM using the Gap Statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = pam,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
## Check the plot - it shows ideal is 3 clusters
pam.res <- pam(features, 3)
fviz_cluster(pam.res,
             palette = c("#00AFBB", "#FC4E07", "#CF4420"),
             ellipse.type = "t",
             repel = TRUE,
             ggtheme = theme_classic()
)
# Same plot again as an object, so its x/y plot data can be exported
y <- fviz_cluster(pam.res, ellipse.type = "t", repel = TRUE)
# pam_final keeps Player so it can be joined back by player later on
pam_final <- df
pam_final$CLUST <- pam.res$clustering
pam_final$X <- y$data$x
pam_final$Y <- y$data$y
# CLARA - Clustering Large Applications
# Column 1 of df is the Player identifier (unscaled, not a feature); cluster on
# the scaled measures only.
features <- df[, -1, drop = FALSE]
fviz_nbclust(features, clara, method = "silhouette") +
  theme_classic()
## Another way of determining number of clusters is the Gap statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = clara,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
# Fit CLARA with 3 clusters
clara.res <- clara(features, 3, samples = 50, pamLike = TRUE)
# The plot object is only kept for its x/y plot data; it is not drawn here
y <- fviz_cluster(clara.res, palette = c("#00AFBB", "#FC4E07", "#CF4420"), ellipse.type = "t",
                  geom = "point", pointsize = 1, ggtheme = theme_classic())
# clara_final keeps Player so it can be joined back by player later on
clara_final <- df
clara_final$CLUST <- clara.res$clustering
clara_final$X <- y$data$x
clara_final$Y <- y$data$y
## Another way of creating optimal clusters without providing number of clusters is the mclust package
## This package is based on the concept of model-based clustering
# Column 1 of df is the Player identifier (unscaled, not a feature); fit the
# model on the scaled measures only.
features <- df[, -1, drop = FALSE]
myMclust <- Mclust(features)
# Summarise the BIC results against the same data the model was fitted on
mySummary <- summary(myMclust$BIC, data = features)
# Attach the assigned cluster and its classification uncertainty to each player
myData$MB_Clust <- myMclust$classification
myData$Uncertainty <- myMclust$uncertainty
# Visualizing cluster results
#
# Plot an object (called here with an Mclust fit) in one of three views.
#
# object        fitted clustering object passed on to fviz_cluster() /
#               fviz_mclust_bic().
# what          which view to draw: "classification" (default), "uncertainty"
#               or "BIC".
# ellipse.type, ellipse.level
#               forwarded to fviz_cluster() for the first two views.
# ggtheme       ggplot2 theme applied to the plot (now honoured; it used to be
#               ignored in favour of a hard-coded theme_classic()).
# ...           further arguments forwarded to the underlying plotting call.
#
# Returns the ggplot object produced by the selected view.
fviz_mclust <- function(object,
                        what = c("classification", "uncertainty", "BIC"),
                        ellipse.type = "norm", ellipse.level = 0.4,
                        ggtheme = theme_classic(), ...) {
  # Declared so the bare names used inside aes() below have a local binding
  # (presumably to silence static-check notes - confirm).
  uncertainty <- cluster <- NULL
  what <- match.arg(what)
  if (what == "classification") {
    p <- fviz_cluster(object, ellipse.type = ellipse.type,
                      ellipse.level = ellipse.level,
                      ggtheme = ggtheme, ...) +
      labs(subtitle = "Classification")
  } else if (what == "uncertainty") {
    # Draw no default geom, then add points sized by classification uncertainty
    p <- fviz_cluster(object, ellipse.type = ellipse.type,
                      ellipse.level = ellipse.level,
                      ggtheme = ggtheme, geom = "none", ...) +
      geom_point(aes(size = uncertainty, color = cluster)) +
      scale_size(range = c(0, 2)) +
      labs(subtitle = "Uncertainty") +
      # "none" replaces the deprecated guides(size = FALSE)
      guides(size = "none")
  } else {
    # match.arg() guarantees the only remaining value is "BIC"
    p <- fviz_mclust_bic(object, ggtheme = ggtheme, ...)
  }
  p
}
# Draw the classification view of the model-based clustering
fviz_mclust(myMclust, what = "classification", geom = "point")
# Assigning x and y coordinates to the original data set:
# rebuild the same plot as an object and copy the x / y columns of its data
y <- fviz_mclust(myMclust, what = "classification", geom = "point")
myData[["X"]] <- y$data[["x"]]
myData[["Y"]] <- y$data[["y"]]
## Merge all the clustering products together to create one final combined set
# myData supplies the per-player rows; the k-means, PAM and CLARA cluster ids
# and plot coordinates are left-joined on Player, and PlayerName is looked up
# via PlayerTable.PlayerID.
# (An earlier version of this query without PlayerTable was dropped: its
# result was overwritten by this one before being used.)
PlayerTable <- read.csv("PlayerTable.csv")
final <- sqldf("Select
PlayerTable.PlayerName,
myData.*,
km_final.CLUST as Clust_KM, km_final.X as X_KM, km_final.Y as Y_KM,
pam_final.CLUST as Clust_PAM, pam_final.X as X_PAM, pam_final.Y as Y_PAM,
clara_final.CLUST as Clust_clara, clara_final.X as X_Clara, clara_final.Y as Y_Clara
from myData
left join km_final on myData.Player = km_final.Player
left join pam_final on myData.Player = pam_final.Player
left join clara_final on myData.Player = clara_final.Player
left join PlayerTable on myData.Player = PlayerTable.PlayerID
")
# NOTE(review): quote = FALSE writes text fields unquoted, so a PlayerName
# containing a comma would shift columns in the CSV - confirm names are safe.
write.csv(final,
          file = "FinalClusterOutput.csv",
          row.names = FALSE,
          quote = FALSE)
I wanted to take a deep dive into Clustering. I have the ODI database and I thought it would be instructive to put the data to use
The main guide I used can be found HERE
This is an excellent guide to cluster analysis in R and I highly recommend it
The main code can be found HERE
The final output file can be found HERE
The final Tableau public dashboard can be found HERE
The code walkthrough is as follows:
## Different packages that you need
library(mclust)
library(tidyverse)
library(cluster)
library(factoextra)
library(data.table)
library(reshape2)
library(sqldf)
# NOTE(review): setwd() ties the script to one machine; it is kept so the
# relative CSV paths used by this script keep resolving as before.
setwd('C:/Training/R/CricketAnalysis/')
myData <- read.csv("ODIData.csv")
## Collapse the raw rows to one row per Player with summed batting totals.
## SR is runs per 100 balls faced. The 100.0 literal forces floating-point
## arithmetic: SQLite truncates integer / integer, which would silently round
## every strike rate down to a whole number.
myData <- sqldf("select Player, sum(Runs) Runs, sum(Mins) Mins, sum(BF) BF, sum(Fours) Fours, sum(Sixes) Sixes,
(100.0 * sum(Runs)) / sum(BF) SR from myData
group by Player")
## Removing players that score less than the median runs
## NOTE(review): 116 is hard-coded (presumably the median of Runs when this
## was written - confirm); the strict '>' also drops players at exactly 116.
myData <- sqldf("Select * from myData where Runs > 116")
## Data has to be standardized for Cluster analysis
## Column 1 (Player) is the identifier and is left untouched; every other
## column is centred and scaled.
df <- myData
df[, -c(1)] <- scale(myData[, -c(1)])
## Looked at different clustering techniques
# Kmeans Clustering
#Finding optimal number of clusters using the silhouette method
# Average silhouette width of a k-means solution with k clusters.
#
# k: number of clusters to fit.
# Reads the global `df`; column 1 (Player) is an unscaled identifier, so it is
# dropped before clustering - otherwise it would dominate the distances.
# Returns the mean of the silhouette widths (column 3 of the silhouette matrix).
avg_sil <- function(k) {
  features <- df[, -1, drop = FALSE]
  km.res <- kmeans(features, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(features))
  mean(ss[, 3])
}
# Compute and plot the average silhouette width for k = 2 to k = 15
k.values <- 2:15
# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouettes")
# Cluster on the scaled measures only: column 1 of df is the Player
# identifier, which is not a feature and is not scaled.
features <- df[, -1, drop = FALSE]
fviz_nbclust(features, kmeans, method = "silhouette")
## Another way of determining number of clusters is the Gap statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 500)
plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
# Dashed reference line at the chosen k = 3
abline(v = 3, lty = 2)
## Look at the plot, this one provides the ideal value for Kmeans as 3 clusters in both the methods
# Compute kmeans with clustering k = 3
# Column 1 of df is the Player identifier (unscaled, not a feature), so it is
# excluded from the clustering input and from the plot data.
features <- df[, -1, drop = FALSE]
set.seed(123)
km.res <- kmeans(features, 3, nstart = 25)
# Plot the viz (built once, then reused for the exported coordinates)
y <- fviz_cluster(km.res, data = features)
print(y)
# Output Results for future work
# km_final keeps Player so it can be joined back by player later on
km_final <- df
km_final$X <- y$data$x
km_final$Y <- y$data$y
km_final$CLUST <- km.res$cluster
## Partitioning around Medoids - PAM Approach
# Column 1 of df is the Player identifier (unscaled, not a feature); cluster on
# the scaled measures only.
features <- df[, -1, drop = FALSE]
## PAM - Optimal number of clusters
fviz_nbclust(features, pam, method = "silhouette")
# Finding optimal clusters in PAM using the Gap Statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = pam,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
## Check the plot - it shows ideal is 3 clusters
pam.res <- pam(features, 3)
fviz_cluster(pam.res,
             palette = c("#00AFBB", "#FC4E07", "#CF4420"),
             ellipse.type = "t",
             repel = TRUE,
             ggtheme = theme_classic()
)
# Same plot again as an object, so its x/y plot data can be exported
y <- fviz_cluster(pam.res, ellipse.type = "t", repel = TRUE)
# pam_final keeps Player so it can be joined back by player later on
pam_final <- df
pam_final$CLUST <- pam.res$clustering
pam_final$X <- y$data$x
pam_final$Y <- y$data$y
# CLARA - Clustering Large Applications
# Column 1 of df is the Player identifier (unscaled, not a feature); cluster on
# the scaled measures only.
features <- df[, -1, drop = FALSE]
fviz_nbclust(features, clara, method = "silhouette") +
  theme_classic()
## Another way of determining number of clusters is the Gap statistic method
set.seed(123)
gap_stat <- clusGap(features, FUN = clara,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
# Fit CLARA with 3 clusters
clara.res <- clara(features, 3, samples = 50, pamLike = TRUE)
# The plot object is only kept for its x/y plot data; it is not drawn here
y <- fviz_cluster(clara.res, palette = c("#00AFBB", "#FC4E07", "#CF4420"), ellipse.type = "t",
                  geom = "point", pointsize = 1, ggtheme = theme_classic())
# clara_final keeps Player so it can be joined back by player later on
clara_final <- df
clara_final$CLUST <- clara.res$clustering
clara_final$X <- y$data$x
clara_final$Y <- y$data$y
## Another way of creating optimal clusters without providing number of clusters is the mclust package
## This package is based on the concept of model-based clustering
# Column 1 of df is the Player identifier (unscaled, not a feature); fit the
# model on the scaled measures only.
features <- df[, -1, drop = FALSE]
myMclust <- Mclust(features)
# Summarise the BIC results against the same data the model was fitted on
mySummary <- summary(myMclust$BIC, data = features)
# Attach the assigned cluster and its classification uncertainty to each player
myData$MB_Clust <- myMclust$classification
myData$Uncertainty <- myMclust$uncertainty
# Visualizing cluster results
#
# Plot an object (called here with an Mclust fit) in one of three views.
#
# object        fitted clustering object passed on to fviz_cluster() /
#               fviz_mclust_bic().
# what          which view to draw: "classification" (default), "uncertainty"
#               or "BIC".
# ellipse.type, ellipse.level
#               forwarded to fviz_cluster() for the first two views.
# ggtheme       ggplot2 theme applied to the plot (now honoured; it used to be
#               ignored in favour of a hard-coded theme_classic()).
# ...           further arguments forwarded to the underlying plotting call.
#
# Returns the ggplot object produced by the selected view.
fviz_mclust <- function(object,
                        what = c("classification", "uncertainty", "BIC"),
                        ellipse.type = "norm", ellipse.level = 0.4,
                        ggtheme = theme_classic(), ...) {
  # Declared so the bare names used inside aes() below have a local binding
  # (presumably to silence static-check notes - confirm).
  uncertainty <- cluster <- NULL
  what <- match.arg(what)
  if (what == "classification") {
    p <- fviz_cluster(object, ellipse.type = ellipse.type,
                      ellipse.level = ellipse.level,
                      ggtheme = ggtheme, ...) +
      labs(subtitle = "Classification")
  } else if (what == "uncertainty") {
    # Draw no default geom, then add points sized by classification uncertainty
    p <- fviz_cluster(object, ellipse.type = ellipse.type,
                      ellipse.level = ellipse.level,
                      ggtheme = ggtheme, geom = "none", ...) +
      geom_point(aes(size = uncertainty, color = cluster)) +
      scale_size(range = c(0, 2)) +
      labs(subtitle = "Uncertainty") +
      # "none" replaces the deprecated guides(size = FALSE)
      guides(size = "none")
  } else {
    # match.arg() guarantees the only remaining value is "BIC"
    p <- fviz_mclust_bic(object, ggtheme = ggtheme, ...)
  }
  p
}
# Draw the classification view of the model-based clustering
fviz_mclust(myMclust, what = "classification", geom = "point")
# Assigning x and y coordinates to the original data set:
# rebuild the same plot as an object and copy the x / y columns of its data
y <- fviz_mclust(myMclust, what = "classification", geom = "point")
myData[["X"]] <- y$data[["x"]]
myData[["Y"]] <- y$data[["y"]]
## Merge all the clustering products together to create one final combined set
# myData supplies the per-player rows; the k-means, PAM and CLARA cluster ids
# and plot coordinates are left-joined on Player, and PlayerName is looked up
# via PlayerTable.PlayerID.
# (An earlier version of this query without PlayerTable was dropped: its
# result was overwritten by this one before being used.)
PlayerTable <- read.csv("PlayerTable.csv")
final <- sqldf("Select
PlayerTable.PlayerName,
myData.*,
km_final.CLUST as Clust_KM, km_final.X as X_KM, km_final.Y as Y_KM,
pam_final.CLUST as Clust_PAM, pam_final.X as X_PAM, pam_final.Y as Y_PAM,
clara_final.CLUST as Clust_clara, clara_final.X as X_Clara, clara_final.Y as Y_Clara
from myData
left join km_final on myData.Player = km_final.Player
left join pam_final on myData.Player = pam_final.Player
left join clara_final on myData.Player = clara_final.Player
left join PlayerTable on myData.Player = PlayerTable.PlayerID
")
# NOTE(review): quote = FALSE writes text fields unquoted, so a PlayerName
# containing a comma would shift columns in the CSV - confirm names are safe.
write.csv(final,
          file = "FinalClusterOutput.csv",
          row.names = FALSE,
          quote = FALSE)
Comments