Did you know that you can navigate the posts by swiping left and right?

Virat Kohli Vs MS Dhoni -- Statistically!

21 Nov 2016 . category: math . Comments
#data science #kaggle #cricket

A statistical analysis of two of the biggest stars of the IPL

# Load the match-level and ball-by-ball tables; text columns are read as factors.
matches <- read.csv(file = "matches.csv", stringsAsFactors = TRUE)
deliveries <- read.csv(file = "deliveries.csv", stringsAsFactors = TRUE)

# Attach each match's columns to its deliveries (matches$id pairs with
# deliveries$match_id) and treat the season as a categorical variable.
df <- merge(x = matches, y = deliveries, by.x = "id", by.y = "match_id")
df[["season"]] <- as.factor(df[["season"]])

# Per-player subsets: the rows on which the player is listed as the batsman.
vk <- df[which(df$batsman == "V Kohli"), ]
msd <- df[which(df$batsman == "MS Dhoni"), ]

# Kohli vs Dhoni: runs by season
# Sum batsman_runs per season for each player, name the totals after the
# player, then join the two tables on their shared `season` column.
vk_season <- setNames(
  aggregate(batsman_runs ~ season, data = vk, FUN = sum),
  c("season", "runs_kohli")
)
msd_season <- setNames(
  aggregate(batsman_runs ~ season, data = msd, FUN = sum),
  c("season", "runs_dhoni")
)
vk_msd_season <- merge(vk_season, msd_season)

library(ggplot2)
library(reshape2)
library(RColorBrewer)

# Long format: one row per (season, runs column), so the runs column can
# drive the bar fill. melt() picks `season` as the id variable on its own.
vk_msd_season_long <- melt(data = vk_msd_season)
## Using season as id variables
ggplot(
  data = vk_msd_season_long,
  mapping = aes(x = season, y = value, fill = variable)
) +
  # bar heights come straight from `value`; "dodge" puts the bars side by side
  geom_bar(position = "dodge", stat = "identity") +
  # manual fill colours, in column order: runs_kohli, runs_dhoni
  scale_fill_manual(values = c("red", "yellow")) +
  labs(title = "Kohli vs Dhoni -- Runs by Seasons", x = "Season", y = "Runs")

# Dismissal analysis ----
# Dismissals are taken from the full merged table `df`, not from the
# striker-only subsets `vk`/`msd`: `player_dismissed` is a separate column
# from `batsman`, so a dismissal may fall on a delivery where the player was
# not the striker (e.g. a run-out at the non-striker's end) and would
# otherwise be dropped from the counts.
vk_dismissal <- df[
  which(df$player_dismissed == "V Kohli"),
  c("season", "dismissal_kind")
]
# Both columns are id variables, so melt() has nothing to reshape; the
# *_long name is kept as a plain alias for the plotting code.
vk_dismissal_long <- vk_dismissal
# geom_bar() counts rows by default, so no explicit y aesthetic is needed.
# The y axis is a number of dismissals; the kind is shown by the fill colour.
ggplot(vk_dismissal_long, aes(x = season, fill = dismissal_kind)) +
  geom_bar() +
  ggtitle("Kohli -- Dismissals by Seasons") +
  labs(x = "Season", y = "Dismissals") +
  scale_fill_brewer(palette = "Set2")

# Same selection and plot for Dhoni.
msd_dismissal <- df[
  which(df$player_dismissed == "MS Dhoni"),
  c("season", "dismissal_kind")
]
msd_dismissal_long <- msd_dismissal
ggplot(msd_dismissal_long, aes(x = season, fill = dismissal_kind)) +
  geom_bar() +
  ggtitle("Dhoni -- Dismissals by Seasons") +
  labs(x = "Season", y = "Dismissals") +
  scale_fill_brewer(palette = "Set2")

# Dismissal counts by season
# table() on the season column gives one count per season; the counts are
# renamed per player and the two tables joined on `season`.
vk_dismissal_count <- setNames(
  as.data.frame(table(vk_dismissal$season)),
  c("season", "vk_dismissal")
)
msd_dismissal_count <- setNames(
  as.data.frame(table(msd_dismissal$season)),
  c("season", "msd_dismissal")
)
vk_msd_dismissal <- merge(vk_dismissal_count, msd_dismissal_count)

# Long format so the player column can drive the bar fill.
vk_msd_dismissal_long <- melt(data = vk_msd_dismissal)
## Using season as id variables
ggplot(
  data = vk_msd_dismissal_long,
  mapping = aes(x = season, y = value, fill = variable)
) +
  geom_bar(position = "dodge", stat = "identity") +
  # manual fill colours, in column order: vk_dismissal, msd_dismissal
  scale_fill_manual(values = c("red", "yellow")) +
  labs(
    title = "Kohli vs Dhoni -- Dismissals by Seasons",
    x = "Season",
    y = "Dismissals"
  )

# Share of runs made ----
# Frequency of each `total_runs` value over Dhoni's deliveries; table() yields
# a factor column `Var1` (the run value) and a count column `Freq`.
# NOTE(review): `total_runs` presumably includes extras; `batsman_runs` may be
# the intended column -- confirm.
msd_share <- as.data.frame(table(msd$total_runs))

# Pie chart = a single stacked bar drawn in polar coordinates.
# Columns are referenced by bare name inside aes() (not msd_share$...), and
# the legend title is set through the scale's `name` argument rather than by
# passing a guide object positionally.
ggplot(msd_share, aes(x = factor(1), y = Freq, fill = Var1)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y") +
  ggtitle("Dhoni's Run Share") +
  labs(x = "", y = "") +
  scale_fill_discrete(name = "Run Color")

# Optional slice labels (disabled):
#pos = cumsum(msd_share$Freq) - msd_share$Freq/2
#geom_text(aes(x= factor(1), y=pos, label = factor(msd_share$Freq)), size=3)

# Frequency of each `total_runs` value over Kohli's deliveries; table() yields
# a factor column `Var1` (the run value) and a count column `Freq`.
# NOTE(review): `total_runs` presumably includes extras; `batsman_runs` may be
# the intended column -- confirm.
vk_share <- as.data.frame(table(vk$total_runs))

# Pie chart = a single stacked bar drawn in polar coordinates.
# Columns are referenced by bare name inside aes() (not vk_share$...), and
# the legend title is set through the scale's `name` argument rather than by
# passing a guide object positionally.
ggplot(vk_share, aes(x = factor(1), y = Freq, fill = Var1)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y") +
  ggtitle("Kohli's Run Share") +
  labs(x = "", y = "") +
  scale_fill_discrete(name = "Run Color")

# Optional slice labels (disabled):
#pos = cumsum(vk_share$Freq) - vk_share$Freq/2
#geom_text(aes(x= factor(1), y=pos, label = factor(vk_share$Freq)), size=3)

# Favorite/Worst team
# Opposition is the other side of the fixture: start from team1 and, on rows
# where team1 is Royal Challengers Bangalore, take team2 instead.
vk$opposition <- local({
  own_side <- vk$team1 == "Royal Challengers Bangalore"
  replace(vk$team1, own_side, vk$team2[own_side])
})
# Total runs against each opposition, highest first.
vk_fav_team <- aggregate(batsman_runs ~ opposition, data = vk, FUN = sum)
vk_fav_team[order(-vk_fav_team$batsman_runs), ]
##                 opposition batsman_runs
## 1      Chennai Super Kings          706
## 3         Delhi Daredevils          603
## 5          Kings XI Punjab          477
## 12     Sunrisers Hyderabad          439
## 7    Kolkata Knight Riders          386
## 8           Mumbai Indians          365
## 2          Deccan Chargers          306
## 10        Rajasthan Royals          258
## 4            Gujarat Lions          209
## 11 Rising Pune Supergiants          188
## 9            Pune Warriors          128
## 6     Kochi Tuskers Kerala           50
# Number of rows of `vk` per opposition, used as the balls-faced denominator.
vk_balls_faced <- setNames(
  as.data.frame(table(vk$opposition)),
  c("opposition", "balls_faced")
)

# Join the counts onto the run totals (shared column: opposition).
vk_fav_team <- merge(vk_fav_team, vk_balls_faced)

# Runs per ball against each opposition, highest first.
vk_fav_team$average_runs_per_ball <-
  with(vk_fav_team, batsman_runs / balls_faced)
vk_fav_team[
  order(-vk_fav_team$average_runs_per_ball),
  c("opposition", "average_runs_per_ball")
]
##                 opposition average_runs_per_ball
## 4            Gujarat Lions             1.7272727
## 11 Rising Pune Supergiants             1.4687500
## 12     Sunrisers Hyderabad             1.4070513
## 3         Delhi Daredevils             1.3735763
## 2          Deccan Chargers             1.3304348
## 9            Pune Warriors             1.2673267
## 5          Kings XI Punjab             1.2486911
## 1      Chennai Super Kings             1.2385965
## 8           Mumbai Indians             1.2046205
## 7    Kolkata Knight Riders             1.2024922
## 6     Kochi Tuskers Kerala             0.9615385
## 10        Rajasthan Royals             0.9280576
# Opposition for Dhoni: start from team1, then for each of the two sides
# named below, in turn, rows currently showing that side take team2 instead.
msd$opposition <- local({
  opp <- msd$team1
  for (own_side in c("Chennai Super Kings", "Rising Pune Supergiants")) {
    swap <- opp == own_side
    opp[swap] <- msd$team2[swap]
  }
  opp
})

# Total runs against each opposition, highest first.
msd_fav_team <- aggregate(batsman_runs ~ opposition, data = msd, FUN = sum)
msd_fav_team[order(-msd_fav_team$batsman_runs), ]
##                     opposition batsman_runs
## 10 Royal Challengers Bangalore          559
## 7               Mumbai Indians          512
## 4              Kings XI Punjab          448
## 2             Delhi Daredevils          403
## 6        Kolkata Knight Riders          359
## 1              Deccan Chargers          281
## 9             Rajasthan Royals          249
## 11         Sunrisers Hyderabad          249
## 8                Pune Warriors          135
## 3                Gujarat Lions           52
## 5         Kochi Tuskers Kerala           23
# Number of rows of `msd` per opposition, used as the balls-faced denominator.
msd_balls_faced <- setNames(
  as.data.frame(table(msd$opposition)),
  c("opposition", "balls_faced")
)

# Join the counts onto the run totals (shared column: opposition).
msd_fav_team <- merge(msd_fav_team, msd_balls_faced)

# Runs per ball against each opposition, highest first.
msd_fav_team$average_runs_per_ball <-
  with(msd_fav_team, batsman_runs / balls_faced)
msd_fav_team[
  order(-msd_fav_team$average_runs_per_ball),
  c("opposition", "average_runs_per_ball")
]
##                     opposition average_runs_per_ball
## 3                Gujarat Lions              1.733333
## 4              Kings XI Punjab              1.523810
## 8                Pune Warriors              1.500000
## 11         Sunrisers Hyderabad              1.500000
## 2             Delhi Daredevils              1.404181
## 7               Mumbai Indians              1.358090
## 10 Royal Challengers Bangalore              1.337321
## 6        Kolkata Knight Riders              1.334572
## 5         Kochi Tuskers Kerala              1.210526
## 9             Rajasthan Royals              1.180095
## 1              Deccan Chargers              1.089147
# Favorite/Worst bowler
# Runs scored off each bowler, joined with the number of rows of `vk` in
# which that bowler appears.
vk_fav_bowler <- aggregate(batsman_runs ~ bowler, data = vk, FUN = sum)

vk_balls_faced_bowler <- setNames(
  as.data.frame(table(vk$bowler)),
  c("bowler", "balls_faced")
)
vk_fav_bowler <- merge(vk_fav_bowler, vk_balls_faced_bowler)

# Keep only bowlers with at least 20 deliveries.
vk_fav_bowler <- vk_fav_bowler[which(vk_fav_bowler$balls_faced >= 20), ]

# Runs per delivery against each remaining bowler.
vk_fav_bowler$average_against_bowler <-
  with(vk_fav_bowler, batsman_runs / balls_faced)

# First rows when sorted by runs per delivery, highest first.
head(vk_fav_bowler[
  order(-vk_fav_bowler$average_against_bowler),
  c("bowler", "average_against_bowler")
])
##             bowler average_against_bowler
## 84       KV Sharma               2.121212
## 167       UT Yadav               1.826667
## 138 Sandeep Sharma               1.652174
## 117     PJ Sangwan               1.650000
## 132       RP Singh               1.589744
## 110    NLTC Perera               1.571429
# Last rows of the same ordering (lowest runs per delivery at the bottom).
tail(vk_fav_bowler[
  order(-vk_fav_bowler$average_against_bowler),
  c("bowler", "average_against_bowler")
])
##           bowler average_against_bowler
## 16  Ankit Sharma              0.8518519
## 70     JH Kallis              0.8285714
## 100     MM Patel              0.8000000
## 160  S Sreesanth              0.8000000
## 159    SR Watson              0.7307692
## 152     SK Warne              0.5714286
# Runs scored off each bowler, joined with the number of rows of `msd` in
# which that bowler appears.
msd_fav_bowler <- aggregate(batsman_runs ~ bowler, data = msd, FUN = sum)

msd_balls_faced_bowler <- setNames(
  as.data.frame(table(msd$bowler)),
  c("bowler", "balls_faced")
)
msd_fav_bowler <- merge(msd_fav_bowler, msd_balls_faced_bowler)

# Keep only bowlers with at least 20 deliveries.
msd_fav_bowler <- msd_fav_bowler[which(msd_fav_bowler$balls_faced >= 20), ]

# Runs per delivery against each remaining bowler.
msd_fav_bowler$average_against_bowler <-
  with(msd_fav_bowler, batsman_runs / balls_faced)

# First rows when sorted by runs per delivery, highest first.
head(msd_fav_bowler[
  order(-msd_fav_bowler$average_against_bowler),
  c("bowler", "average_against_bowler")
])
##        bowler average_against_bowler
## 56  IK Pathan               1.904762
## 14    A Nehra               1.900000
## 76 KA Pollard               1.781818
## 47   DW Steyn               1.775862
## 29   CH Gayle               1.760000
## 84   L Balaji               1.760000
# Last rows of the same ordering (lowest runs per delivery at the bottom).
tail(msd_fav_bowler[
  order(-msd_fav_bowler$average_against_bowler),
  c("bowler", "average_against_bowler")
])
##               bowler average_against_bowler
## 119         R Bhatia              0.8750000
## 10          A Kumble              0.8518519
## 121 RE van der Merwe              0.8421053
## 40        DL Vettori              0.8214286
## 55          HV Patel              0.8076923
## 155        SP Narine              0.4864865
# Favorite Non-Striker
# Dhoni's runs grouped by who was at the non-striker's end, highest first.
msd_fav_non_striker <- aggregate(
  batsman_runs ~ non_striker,
  data = msd,
  FUN = sum
)
head(msd_fav_non_striker[order(-msd_fav_non_striker$batsman_runs), ])
##    non_striker batsman_runs
## 31    SK Raina          522
## 25   RA Jadeja          349
## 29 S Badrinath          341
## 7     DJ Bravo          291
## 17   ML Hayden          215
## 12   JA Morkel          210
# Kohli's runs grouped by who was at the non-striker's end, highest first.
vk_fav_non_striker <- aggregate(
  batsman_runs ~ non_striker,
  data = vk,
  FUN = sum
)
head(vk_fav_non_striker[order(-vk_fav_non_striker$batsman_runs), ])
##       non_striker batsman_runs
## 1  AB de Villiers         1012
## 6        CH Gayle         1009
## 19       KL Rahul          277
## 14      JH Kallis          181
## 46      SS Tiwary          154
## 28   MC Henriques          142
# Favorite Venue
# Dhoni's runs grouped by venue, highest first.
msd_fav_venue <- aggregate(batsman_runs ~ venue, data = msd, FUN = sum)
head(msd_fav_venue[order(-msd_fav_venue$batsman_runs), ])
##                                 venue batsman_runs
## 11    MA Chidambaram Stadium, Chepauk         1144
## 13              M Chinnaswamy Stadium          281
## 6                        Eden Gardens          214
## 30                   Wankhede Stadium          177
## 9  JSCA International Stadium Complex          138
## 7                    Feroz Shah Kotla          127
# Kohli's runs grouped by venue, highest first.
vk_fav_venue <- aggregate(batsman_runs ~ venue, data = vk, FUN = sum)
head(vk_fav_venue[order(-vk_fav_venue$batsman_runs), ])
##                                        venue batsman_runs
## 11                     M Chinnaswamy Stadium         1763
## 17 Rajiv Gandhi International Stadium, Uppal          298
## 9            MA Chidambaram Stadium, Chepauk          285
## 28                          Wankhede Stadium          280
## 5                           Feroz Shah Kotla          277
## 4                               Eden Gardens          176
# Pacing the innings
# Per over number: runs scored and the number of rows (deliveries) in that
# over, giving a strike rate of runs per 100 deliveries.
vk_over_runs <- aggregate(batsman_runs ~ over, data = vk, FUN = sum)
vk_overs_faced <- setNames(
  as.data.frame(table(vk$over)),
  c("over", "vk_freq")
)
vk_over_runs <- merge(vk_over_runs, vk_overs_faced)
vk_over_runs$vk_strike_rate <-
  with(vk_over_runs, (batsman_runs / vk_freq) * 100)

msd_over_runs <- aggregate(batsman_runs ~ over, data = msd, FUN = sum)
msd_overs_faced <- setNames(
  as.data.frame(table(msd$over)),
  c("over", "msd_freq")
)
msd_over_runs <- merge(msd_over_runs, msd_overs_faced)
msd_over_runs$msd_strike_rate <-
  with(msd_over_runs, (batsman_runs / msd_freq) * 100)

# Put both players' strike rates side by side, keyed by over.
vk_msd_strike_rate <- merge(
  vk_over_runs[c("over", "vk_strike_rate")],
  msd_over_runs[c("over", "msd_strike_rate")]
)
# `over` is turned into a factor before melting; melt() then uses it as the
# id variable (see the message below).
vk_msd_strike_rate$over <- as.factor(vk_msd_strike_rate$over)
vk_msd_strike_rate_long <- melt(data = vk_msd_strike_rate)
## Using over as id variables
ggplot(
  data = vk_msd_strike_rate_long,
  mapping = aes(x = over, y = value, group = variable, colour = variable)
) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Kohli vs Dhoni -- Strike Rate by Over",
    x = "Over",
    y = "Strike Rate"
  )

# Per-over delivery counts for both players, keyed by over.
vk_msd_freq <- merge(
  vk_over_runs[c("over", "vk_freq")],
  msd_over_runs[c("over", "msd_freq")]
)
# `over` is turned into a factor before melting; melt() then uses it as the
# id variable (see the message below).
vk_msd_freq$over <- as.factor(vk_msd_freq$over)
vk_msd_freq_long <- melt(data = vk_msd_freq)
## Using over as id variables
ggplot(
  data = vk_msd_freq_long,
  mapping = aes(x = over, y = value, group = variable, colour = variable)
) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Kohli vs Dhoni -- Number of Overs faced",
    x = "Over",
    y = "Count"
  )


Me

Shubhankar is an awesome person. He's Co-Founder at Houseware, building the next generation of Analytics. In his spare time, he likes to go out on runs!