Home Artificial Intelligence Using Rare Event Classification Modelling to Predict the 2023 Stanley Cup Playoffs Introduction Training and Testing Data LASSO DT 2022–2023 Stanley Cup Playoffs Prediction Gambling Bracket Pool Conclusions

Using Rare Event Classification Modelling to Predict the 2023 Stanley Cup Playoffs Introduction Training and Testing Data LASSO DT 2022–2023 Stanley Cup Playoffs Prediction Gambling Bracket Pool Conclusions

0
Using Rare Event Classification Modelling to Predict the 2023 Stanley Cup Playoffs
Introduction
Training and Testing Data
LASSO
DT
2022–2023 Stanley Cup Playoffs Prediction
Gambling
Bracket
Pool
Conclusions

Photo by Josh Appel on Unsplash
  1. The event attempting to be predicted doesn’t occur often enough to have the ability to accurately determine relationships between the predictors and response variables.
  2. Splitting the info into training and testing data is difficult as a consequence of the imbalance between the positive and negative response values.
  1. Randomly reduce the variety of negative response values so it’s closer to the variety of positive response values.
  2. Increase the entire positive response values equally so the category balance is closer.
1 of two list of advanced NHL stats.
2 of two list of advanced NHL stats.
# Load Libraries #
library(dplyr)
library(tidyr)
library(glmnet)
library(caret)
library(rpart)

# Read in Data #
NHL_data <- read.csv("nhl_season_data.csv")
head(NHL_data)
dim(NHL_data)

First five rows of the info and the info’s dimensions of 456 rows and 73 columns.
# Replicate Winning Rows #
winners <- NHL_data %>%
filter(Winner >= 1) # Filter the winners

losers <- NHL_data %>%
filter(Winner <= 0) # Filter the losers

duplicated_winners <- winners[rep(seq_len(nrow(winners)), each = 29), ] # Replicate each winner 29 times

prepped_data <- rbind(duplicated_winners, losers)

# Remove Unnecessary Data #
df <- prepped_data[,-c(1,2,3,5,6,7,8,9)]
dim(df)

The brand new dimension of the info after replicating each season’s winning row 29 more times and removing unnecessary columns. Now our response variable is balanced.
# Arrange Training and Testing Data #
set.seed(1) # Set Seed so that very same sample might be reproduced in future also
# Now Choosing 67% of knowledge as sample from total 'n' rows of the info
sample <- sample.int(n = nrow(df), size = floor(.67*nrow(df)), replace = F)

# Training data #
train_Data <- df[sample, ]

x.train<-as.matrix(train_Data[,-65])

y.train<-as.numeric(unlist(train_Data[,65]))

# Test data #
test_Data <- df[-sample, ]

x.test<-(test_Data[,-65])

y.test<-as.numeric(unlist(test_Data[,65]))

# LASSO #
# perform k-fold cross-validation to seek out optimal lambda value #
lambda_model <- cv.glmnet(x.train, y.train, alpha = 1, family = "binomial")

# find optimal lambda value that minimizes test MSE #
best_lambda <- lambda_model$lambda.min

# produce plot of test MSE by lambda value #
plot(lambda_model)

Plot of MSE by lambda value
# find coefficients of best model #
LASSO_model <- glmnet(x.train, y.train, alpha = 1, lambda = best_lambda, family = "binomial")
coef(LASSO_model)
LASSO coefficients 1 of two.
LASSO coefficients 2 of two.
# Predict on Test Data #
LASSO_model_predict <- predict(LASSO_model, as.matrix(x.test), type = "response")
lasso_accuracy <- cbind(LASSO_model_predict, y.test)
LASSO model accurately predicted all 144 Stanley cup winners within the Test data as winning and 127 of 146 (87%) of the losers as losing for a complete accuracy of 93%
# Decision Tree #
# Model Tuning #
hyper_grid <- expand.grid(
minsplit = seq(5, 20, 1),
maxdepth = seq(8, 15, 1)
)

models <- list()

for (i in 1:nrow(hyper_grid)) {
# get minsplit, maxdepth values at row i
minsplit <- hyper_grid$minsplit[i]
maxdepth <- hyper_grid$maxdepth[i]
# train a model and store within the list
models[[i]] <- rpart(
formula = Winner ~ .,
data = train_Data,
method = "class",
control = list(minsplit = minsplit, maxdepth = maxdepth)
)
}

# function to get optimal cp
get_cp <- function(x) {
min <- which.min(x$cptable[, "xerror"])
cp <- x$cptable[min, "CP"]
}

# function to get minimum error
get_min_error <- function(x) {
min <- which.min(x$cptable[, "xerror"])
xerror <- x$cptable[min, "xerror"]
}

hyper_grid %>%
mutate(
cp = purrr::map_dbl(models, get_cp),
error = purrr::map_dbl(models, get_min_error)
) %>%
arrange(error) %>%
top_n(-5, wt = error)

Optimal DT parameters after hyperparameter tuning.
# Create Model #
tree_model <- rpart(
formula = Winner ~ .,
data = train_Data,
method = "class",
control = list(minsplit = 10, maxdepth = 12, cp = 0.01)
)

# Plot #
plot(tree_model, uniform = TRUE,
essential = "NHL Winner")
text(tree_model, use.n = TRUE, cex = .7)

Decision Tree plot trained on Training data and tested on Testing data.
DT model accurately predicted all 144 Stanley cup winners within the Test data as winning and 135 of 146 (92%) of the losers as losing for a complete accuracy of 96%

DT Full Prediction

# Decision Tree on Full data #
# Model Tuning #
hyper_grid <- expand.grid(
minsplit = seq(5, 20, 1),
maxdepth = seq(8, 15, 1)
)

models <- list()

for (i in 1:nrow(hyper_grid)) {
# get minsplit, maxdepth values at row i
minsplit <- hyper_grid$minsplit[i]
maxdepth <- hyper_grid$maxdepth[i]
# train a model and store within the list
models[[i]] <- rpart(
formula = Winner ~ .,
data = df,
method = "class",
control = list(minsplit = minsplit, maxdepth = maxdepth)
)
}

# function to get optimal cp
get_cp <- function(x) {
min <- which.min(x$cptable[, "xerror"])
cp <- x$cptable[min, "CP"]
}

# function to get minimum error
get_min_error <- function(x) {
min <- which.min(x$cptable[, "xerror"])
xerror <- x$cptable[min, "xerror"]
}

hyper_grid %>%
mutate(
cp = purrr::map_dbl(models, get_cp),
error = purrr::map_dbl(models, get_min_error)
) %>%
arrange(error) %>%
top_n(-5, wt = error)

# Create Model #
tree_model <- rpart(
formula = Winner ~ .,
data = df,
method = "class",
control = list(minsplit = 15, maxdepth = 14, cp = 0.01)
)

# Plot #
plot(tree_model, uniform = TRUE,
essential = "NHL Winner")
text(tree_model, use.n = TRUE, cex = .7)

Decision Tree created on the complete dataset.

DT Model Explainability

Glossary for the DT Plot

  • — Percentage of total Goals in games that team played which might be for that team. GF*100/(GF+GA)
  • — Rate of Medium Danger Scoring Probabilities for that team per 60 minutes of play. MDCF*60/TOI
  • — Rate of Goals for that team per 60 minutes of play. GF*60/TOI
  • — Percentage of Scoring Likelihood Shots against that team that weren’t Goals. 100-(SCGA*100/SCSA)
  • — Percentage of total Goals off of Low Danger Scoring Probabilities in games that team played which might be for that team. LDGF*100/(LDGF+LDGA)
  • — Rate of Goals off of Low Danger Scoring Probabilities against that team per 60 minutes of play. LDGA*60/TOI
  • — Rate of Goals off of scoring probabilities for that team per 60 minutes of play. SCGF*60/TOI
  • — Rate of Goals against that team per 60 minutes of play. GA*60/TOI

Prediction

# Predict on 2022/2023 Data #
current_data <- read.csv("...202220223season.csv")
decision_tree_predict <- predict(tree_model, as.data.frame(current_data), method = "class")
playoff_prediction <- cbind(decision_tree_predict, current_data)
The DT model predicted the Colorado Avalanche, Dallas Stars, Recent York Rangers and Tampa Bay Lightning would win.

LASSO Full Prediction

# Isolate x and y values on Historical Data #
x <- as.matrix(df[,-65])
y <- as.numeric(unlist(df[,65]))

# LASSO on Full Data #
# perform k-fold cross-validation to seek out optimal lambda value #
lambda_model <- cv.glmnet(x, y, alpha = 1, family = "binomial")

# find optimal lambda value that minimizes test MSE #
best_lambda <- lambda_model$lambda.min

# produce plot of MSE by lambda value #
plot(lambda_model)

Optimal lambda plot for LASSO on full data.
# find coefficients of best model #
LASSO_model <- glmnet(x, y, alpha = 1, lambda = best_lambda, family = "binomial")

# Model Explainability #
coefficients(LASSO_model)
exp(coefficients(LASSO_model))

Full LASSO Model’s Coefficients 1 of two.
Full LASSO Model’s Coefficients 2 of two.

LASSO Model Explainability

# Predict on 2022/2023 Data #
x <- current_data[-c(1,2,3,5,6,7,8,9)] #Remove unneeded data
LASSO_model_predict <- predict(LASSO_model, as.matrix(x), type = "response")
lasso_23 <- cbind(LASSO_model_predict, current_data)
LASSO Predicted winners.
  • Dallas Stars (+1500)
  • Colorado Avalanche (+650)
  • Recent York Rangers (+1200)
  • Tampa Bay Lightning (+1400)
Dallas over the Rags within the finals.
Dallas Stars value probably the most points.

LEAVE A REPLY

Please enter your comment!
Please enter your name here