Statistic Abbreviation | Statistic Name |
---|---|
POM | Ken Pomeroy’s Team Rankings |
Seed | Tournament Seed |
Score | Points Scored |
FGA | Field Goals Attempted |
FGM2 | 2pt Shots Made |
FGM3 | 3pt Shots Made |
Avg3pt | 3pt Average Per Game |
EFG | Effective Field Goal Percentage |
FTA | Free Throws Attempted |
OR | Offensive Rebounds |
TO | Turnovers |
POSS | Possessions |
OFF | Offensive Rating |
OPP_Score | Opponent’s Score |
DEF_EFF | Defensive Efficiency |
BLK | Blocks |
DR | Defensive Rebounds |
AST | Assists |
STL | Steals |
PF | Personal Fouls |
\[n = \frac{x - \min(x)}{\max(x) - \min(x)}\]
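As a quick illustration, this min-max scaling can be applied to any numeric vector in R. The normalize() helper below is only a sketch and is not part of the original preparation code:

#Min-max scale a numeric vector to the [0, 1] range (illustrative helper, not from the original code)
normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
#For example, normalize(c(2, 5, 10)) returns 0.000, 0.375, 1.000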
library(tidyverse)
library(caret)
library(data.table)
library(Metrics)
library(knitr)
#Read in the dataset of statistical differences between the two teams in each tournament matchup
TourneyDiff <- read_csv("TourneyDiff.csv")
#Preview the first five rows
head(TourneyDiff, 5)
## # A tibble: 5 x 29
## Season ID Results TeamID.x TeamID.y TeamName.x TeamName.y Conference.x
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 2003 2003… NA 1328 1448 Oklahoma Wake Fore… Big 12 Conf…
## 2 2003 2003… 0 1328 1393 Oklahoma Syracuse Big 12 Conf…
## 3 2003 2003… NA 1328 1329 Oklahoma Oklahoma … Big 12 Conf…
## 4 2003 2003… NA 1328 1386 Oklahoma St Joseph… Big 12 Conf…
## 5 2003 2003… NA 1328 1335 Oklahoma Penn Big 12 Conf…
## # … with 21 more variables: Conference.y <chr>, POM_diff <dbl>,
## # Seed_diff <dbl>, Score_diff <dbl>, FGA_diff <dbl>, FGM2_diff <dbl>,
## # FGM3_dff <dbl>, Avg3pt_diff <dbl>, EFG_diff <dbl>, FTA_diff <dbl>,
## # OR_diff <dbl>, TO_diff <dbl>, POSS_diff <dbl>, OFF_diff <dbl>,
## # OPP_SCORE_diff <dbl>, DEF_EFF_diff <dbl>, BLK_diff <dbl>, DR_diff <dbl>,
## # AST_diff <dbl>, STL_diff <dbl>, PF_diff <dbl>
#Keep only the games that were actually played (Results is not NA)
RealGames <- TourneyDiff %>%
  filter(!is.na(Results))
Again, the head() function can be used to check that the dataframe now only contains real games.
head(RealGames,5)
## # A tibble: 5 x 29
## Season ID Results TeamID.x TeamID.y TeamName.x TeamName.y Conference.x
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 2003 2003… 0 1328 1393 Oklahoma Syracuse Big 12 Conf…
## 2 2003 2003… 1 1328 1354 Oklahoma S Carolin… Big 12 Conf…
## 3 2003 2003… 1 1393 1400 Syracuse Texas Big East Co…
## 4 2003 2003… 0 1329 1393 Oklahoma … Syracuse Big 12 Conf…
## 5 2003 2003… 1 1329 1335 Oklahoma … Penn Big 12 Conf…
## # … with 21 more variables: Conference.y <chr>, POM_diff <dbl>,
## # Seed_diff <dbl>, Score_diff <dbl>, FGA_diff <dbl>, FGM2_diff <dbl>,
## # FGM3_dff <dbl>, Avg3pt_diff <dbl>, EFG_diff <dbl>, FTA_diff <dbl>,
## # OR_diff <dbl>, TO_diff <dbl>, POSS_diff <dbl>, OFF_diff <dbl>,
## # OPP_SCORE_diff <dbl>, DEF_EFF_diff <dbl>, BLK_diff <dbl>, DR_diff <dbl>,
## # AST_diff <dbl>, STL_diff <dbl>, PF_diff <dbl>
#Training data will be from 2003 to 2018
Train1 <- RealGames %>%
  filter(Season <= 2018) %>%
  select(Results, POM_diff:PF_diff)
#Make Results a factor
Train1$Results <- as.factor(Train1$Results)
#Testing data will be 2019
Test1 <- RealGames %>%
  filter(Season == 2019) %>%
  select(Results, POM_diff:PF_diff)
#Make Results a factor
Test1$Results <- as.factor(Test1$Results)
#Use 10-fold cross-validation repeated 5 times, and select models by accuracy
control <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
metric <- "Accuracy"
Remember to use set.seed() to ensure that results are reproducible!
# kNN
set.seed(3142)
fit.knn <- train(Results ~ ., data = Train1, method = "knn", metric = metric, trControl = control)
# SVM
set.seed(3142)
fit.svm <- train(Results ~ ., data = Train1, method = "svmRadial", metric = metric, trControl = control)
# Random Forest
set.seed(3142)
fit.rf <- train(Results ~ ., data = Train1, method = "rf", metric = metric, trControl = control)
# Linear Discriminant Analysis
set.seed(3142)
fit.lda <- train(Results ~ ., data = Train1, method = "lda", metric = metric, trControl = control)
# Boosted Logistic Regression
set.seed(3142)
fit.blr <- train(Results ~ ., data = Train1, method = "LogitBoost", metric = metric, trControl = control)
# eXtreme Gradient Boosting Trees
set.seed(3142)
fit.xgbTree <- train(Results ~ ., data = Train1, method = "xgbTree", metric = metric, trControl = control)
# eXtreme Gradient Boosting Linear
set.seed(3142)
fit.xgbLM <- train(Results ~ ., data = Train1, method = "xgbLinear", metric = metric, trControl = control)
# Neural Network (nnet)
set.seed(3142)
fit.nnet <- train(Results ~ ., data = Train1, method = "nnet", metric = metric, trControl = control)
# summarize accuracy of models
results <- resamples(list(lda=fit.lda, knn=fit.knn, svm=fit.svm, rf=fit.rf, blr=fit.blr, xgbTree=fit.xgbTree, xgbLM=fit.xgbLM, nnet=fit.nnet))
dotplot(results)
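Alongside the dot plot, the resampled accuracies can also be summarized numerically. This is a small sketch using caret's summary() method on the resamples object created above:

#Print the distribution of Accuracy and Kappa for each model across the resamples
summary(results)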
The neural network model is examined in more detail below.
print(fit.nnet)
## Neural Network
##
## 1048 samples
## 20 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 943, 943, 943, 944, 943, 944, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0e+00 0.6990183 0.3978856
## 1 1e-04 0.6984451 0.3968017
## 1 1e-01 0.7089634 0.4177459
## 3 0e+00 0.6602839 0.3207143
## 3 1e-04 0.6612930 0.3223530
## 3 1e-01 0.6875952 0.3745876
## 5 0e+00 0.6343242 0.2687897
## 5 1e-04 0.6398681 0.2806304
## 5 1e-01 0.6803278 0.3603912
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.1.
prednnet <- predict(fit.nnet, newdata = Test1, type = "raw")
confusionMatrix(prednnet, Test1$Results)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 7
## 1 7 22
##
## Accuracy : 0.791
## 95% CI : (0.6743, 0.8808)
## No Information Rate : 0.5672
## P-Value [Acc > NIR] : 0.000104
##
## Kappa : 0.5744
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.8158
## Specificity : 0.7586
## Pos Pred Value : 0.8158
## Neg Pred Value : 0.7586
## Prevalence : 0.5672
## Detection Rate : 0.4627
## Detection Prevalence : 0.5672
## Balanced Accuracy : 0.7872
##
## 'Positive' Class : 0
##
Importance1 <- varImp(fit.nnet)
plot(Importance1)
Based on the figure above, the neural network model deemed the Pomeroy Rankings to be the most important factor in determining who won each game. This is logical, and it should be the most important variable based on the results from the data visualization report, found here. There are no real surprises in how the neural network ranked the importance of the various features. However, the Effective Field Goal statistic, which many people rely on when trying to pick winners, was not weighted heavily by the neural network. The neural network prioritized 2pt and 3pt field goals made over all other offensive statistics, and on the defensive side of the ball, steals and blocks were relied on the most.
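For reference, the lowest-ranked features can also be pulled directly out of the importance object. The snippet below is only a sketch and assumes that varImp() stored the scores in a column named Overall:

#Sort the importance scores and show the seven lowest-ranked features (illustrative sketch)
imp_scores <- Importance1$importance
head(imp_scores[order(imp_scores$Overall), , drop = FALSE], 7)

Those seven lowest-ranked features are then dropped in the next step.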
#Create new dataframe without the bottom seven features
Train2 <- RealGames %>%
  filter(Season <= 2018) %>%
  select(Results, POM_diff:Avg3pt_diff, OPP_SCORE_diff, BLK_diff:PF_diff)
#Make Results a factor
Train2$Results <- as.factor(Train2$Results)
#Neural Network nnet2
set.seed(3142)
fit2.nnet <- train(Results ~ ., data = Train2, method = "nnet", metric = metric, trControl = control)
Now, the performance of the new model (fit2.nnet) will be compared to the original model that contained all of the features.
#Compare accuracy of the two neural networks
results2 <- resamples(list(nnet1 = fit.nnet, nnet2 = fit2.nnet))
dotplot(results2)
Importance2 <- varImp(fit2.nnet)
plot(Importance2)
Based on the updated importance rankings, the three lowest-ranked features are dropped and a third model is trained.
#Create new dataframe without the bottom three features
Train3 <- RealGames %>%
  filter(Season <= 2018) %>%
  select(Results, POM_diff:Seed_diff, FGA_diff:Avg3pt_diff, BLK_diff, AST_diff:PF_diff)
#Make Results a factor
Train3$Results <- as.factor(Train3$Results)
#Neural Network nnet3
set.seed(3142)
fit3.nnet <- train(Results ~ ., data = Train3, method = "nnet", metric = metric, trControl = control)
#Compare accuracy of the three neural networks
results3 <- resamples(list(nnet1 = fit.nnet, nnet2 = fit2.nnet, nnet3 = fit3.nnet))
dotplot(results3)
Again, the newest neural network has the highest accuracy, though only by a very small margin. The variable importance rankings can be seen below.
Importance3 <- varImp(fit3.nnet)
plot(Importance3)
print(fit3.nnet)
## Neural Network
##
## 1048 samples
## 10 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 943, 943, 943, 944, 943, 944, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0e+00 0.7102857 0.4204235
## 1 1e-04 0.7106667 0.4212246
## 1 1e-01 0.7129524 0.4256111
## 3 0e+00 0.6906612 0.3811915
## 3 1e-04 0.6879872 0.3755584
## 3 1e-01 0.7068608 0.4131938
## 5 0e+00 0.6629872 0.3256704
## 5 1e-04 0.6751905 0.3498147
## 5 1e-01 0.7026575 0.4048880
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.1.
The default search only tried a handful of values for size and decay, so a custom tuning grid is used next to search more finely around the best combination found above (size = 1, decay = 0.1).
nnetgrid <- expand.grid(size = seq(from = 1, to = 5, by = 1),
                        decay = c(0.25, 0.2, 0.15, 0.1, 0.075))
set.seed(3142)
nnet_tune1 <- train(Results ~ .,
                    data = Train3,
                    method = "nnet",
                    metric = metric,
                    trControl = control,
                    tuneGrid = nnetgrid)
print(nnet_tune1)
## Neural Network
##
## 1048 samples
## 10 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 943, 943, 943, 944, 943, 944, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0.075 0.7125696 0.4249327
## 1 0.100 0.7129524 0.4256200
## 1 0.150 0.7144908 0.4286659
## 1 0.200 0.7144927 0.4286484
## 1 0.250 0.7131538 0.4259513
## 2 0.075 0.7093370 0.4181478
## 2 0.100 0.7097198 0.4189925
## 2 0.150 0.7125824 0.4247271
## 2 0.200 0.7118168 0.4232750
## 2 0.250 0.7112491 0.4221146
## 3 0.075 0.7057106 0.4108962
## 3 0.100 0.7072381 0.4140187
## 3 0.150 0.7104908 0.4206053
## 3 0.200 0.7121978 0.4240488
## 3 0.250 0.7104799 0.4206176
## 4 0.075 0.7024670 0.4046463
## 4 0.100 0.7049267 0.4094115
## 4 0.150 0.7104853 0.4205838
## 4 0.200 0.7112418 0.4221527
## 4 0.250 0.7112454 0.4221228
## 5 0.075 0.6954158 0.3905087
## 5 0.100 0.7068535 0.4133919
## 5 0.150 0.7120110 0.4236825
## 5 0.200 0.7121960 0.4240294
## 5 0.250 0.7114341 0.4224985
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.2.
The tuned model can now be evaluated on the 2019 test data, which must contain the same features that were used in Train3.
Test2 <- Test1 %>%
  select(Results, POM_diff:Seed_diff, FGA_diff:Avg3pt_diff, BLK_diff, AST_diff:PF_diff)
prednnet2 <- predict(nnet_tune1, newdata = Test2, type = "raw")
confusionMatrix(prednnet2, Test2$Results)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 6
## 1 7 23
##
## Accuracy : 0.806
## 95% CI : (0.6911, 0.8924)
## No Information Rate : 0.5672
## P-Value [Acc > NIR] : 3.393e-05
##
## Kappa : 0.6064
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8158
## Specificity : 0.7931
## Pos Pred Value : 0.8378
## Neg Pred Value : 0.7667
## Prevalence : 0.5672
## Detection Rate : 0.4627
## Detection Prevalence : 0.5522
## Balanced Accuracy : 0.8044
##
## 'Positive' Class : 0
##
#Get the probabilities for nnet_tune1 (keep only the probability of class "1")
probnnet <- predict(nnet_tune1, newdata = Test2, type = "prob") %>%
  select(2)
#Combine the game IDs, actual results, predictions, and probabilities
EffectiveNNET <- RealGames %>%
  filter(Season == 2019) %>%
  mutate(Predictions = prednnet2) %>%
  select(ID, Results, Predictions)
EffectiveNNET <- cbind(EffectiveNNET, probnnet) %>%
  rename(Probability = "1")
With the dataframe containing everything that is needed, the log loss of the final model can be calculated.
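For reference, the binary log loss computed by logLoss() from the Metrics package follows the standard definition, where y_i is the actual result (0 or 1) and p_i is the corresponding predicted probability of class 1:

\[\text{LogLoss} = -\frac{1}{n}\sum_{i=1}^{n}\left[y_i\log(p_i) + (1-y_i)\log(1-p_i)\right]\]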
#Log loss for the NNET model
EffectiveNNET <- EffectiveNNET %>%
  summarise(LogLoss = logLoss(Results, Probability),
            Errors = sum((Probability < 0.0) + (Probability > 1.0) + is.na(Probability)))
print(EffectiveNNET)
## LogLoss Errors
## 1 0.4921599 0