Predict Diabetes Diagnosis

#detach all loaded packages
invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload = F))

library(caret)
library(MASS)
library(dplyr)
library(randomForest)
library(kernlab)
library(corrplot)
library(GGally)
library(pROC)

diabetes0 = read.csv("https://raw.githubusercontent.com/melissaalejandro/Class-Projects/main/diabetes.csv")

# ====================================================
# Data Overview  ----
# ====================================================

# Predictors include number of pregnancies, glucose, Diastolic blood pressure (mm HG), skin fold thickness on triceps (mm), 2 hour serum insulin 
# level (mu U/ml), body mass index, diabetes pedigree function, and age.  The Target variable is labeled as 'Outcome' where 0 means 'no' (does not 
# have diabetes) & 1 means yes (does have diabetes). The data consists of all females, age 21 and older who are of Pima Indian heritage. 

# Summary statistics
summary(diabetes0)

# check the number of 0's in each column
cbind(lapply(colSums(diabetes0[1:8] == 0), sum))

## Filtering the data

# The variables BMI, Insulin, SkinThickness, BloodPressure, and Glucose have minimum values of zero. It's important to note that there are certain 
# medical conditions where diastolic blood pressure, insulin levels, and glucose levels can be at zero. However, cases where a patient has a 0 BMI or 
# 0 mm of skin thickness are extremely unlikely. Therefore, any rows with zeroes in SkinThickness and BMI will be removed. 

# keep only rows where bmi and skin thickness are >0
diabetes = diabetes0 %>% 
  filter(diabetes0$BMI > 0 & diabetes0$SkinThickness > 0)

summary(diabetes)
str(diabetes)


# ====================================================
# Data Exploration ----
# ====================================================

# Variable relationship 
ggpairs(diabetes, 
        lower = list(continuous = wrap('smooth', color='red', alpha=0.7, size=0.8)),
        diag = list(continuous = wrap("densityDiag", fill='deepskyblue', alpha=0.9)),
        upper = list(continuous = wrap('cor', color='black'))) # moderate correlation with glucose (0.464)

# correlation plot
corrplot(cor(diabetes), method="square", type = 'full', order = 'hclust', tl.col="black", tl.cex = .35, tl.srt=45, addCoef.col = 'black',number.cex=0.6)

# Boxplot Outliers 
boxplot(diabetes, col = "deepskyblue")

summary(diabetes)

# Histogram: distribution of data 
par(mfrow = c(3, 3))
for (i in 1:9) {
  hist(diabetes[ ,i], xlab = names(diabetes[i]), main = paste(names(diabetes[i]), "Histogram"), col="deepskyblue")  
}

# ====================================================
# Build the Model ----
# ====================================================

## Change labels
##### Label 0 -> No and 1 -> Yes #####
diabetes$Outcome[diabetes$Outcome==0] = "No"
diabetes$Outcome[diabetes$Outcome==1] = "Yes"
diabetes$Outcome = as.factor(diabetes$Outcome)

# split data 
set.seed(100)
inTrain <- createDataPartition(as.matrix(diabetes[,9]), p = .8, list=FALSE)
diab.train = diabetes[inTrain,]
diab.test = diabetes[-inTrain,]

# preprocess 
diab.TrainPP <- preProcess(diab.train, method = c("BoxCox", "scale", "center","nzv","spatialSign"))
diab.TestPP <- preProcess(diab.test, method = c("BoxCox", "scale", "center","nzv","spatialSign"))

# transform the dataset using the parameters ('predict' applies the transformations)
diabTrainTrans <- predict(diab.TrainPP, diab.train)
diabTestTrans <- predict(diab.TestPP, diab.test)

# check dist after transformation
par(mfrow = c(3, 3))
for (i in 1:8) {
  hist(diabetes[ ,i], xlab = names(diabetes[i]), main = paste(names(diabetes[i]), "Histogram"), col="deepskyblue")  
}                      

#control the computational nuances of the train function
set.seed(100)
ctrl <- trainControl(method = "LGOCV",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE,
                     savePredictions = TRUE)

# ====================================================
# Test Results ----
# ====================================================

############################## 
# Logistic Regression ----
##############################
set.seed(5)
logisticTune <- train(x = diabTrainTrans[,1:8], 
                      y = diabTrainTrans$Outcome,
                      method = "glm",
                      metric = "ROC",
                      trControl = ctrl)
logisticTune 

# variable importance 
plot(varImp(logisticTune, scale =FALSE))

test_results <- data.frame(obs = diabTestTrans$Outcome,
                           logistic = predict(logisticTune, diabTestTrans))


##############################
# PLSDA Regression ----
##############################
set.seed(476)
plsdaTune <- train(x = diabTrainTrans[,1:8],
                   y = diabTrainTrans$Outcome,
                   method = "pls", 
                   metric = "ROC",
                   tuneGrid = expand.grid(.ncomp = 1:5),
                   trControl = ctrl)

plsdaTune

test_results$plsda <- predict(plsdaTune, newdata=diab.test[,-9])

plot(plsdaTune)
plot(varImp(plsdaTune, scale =FALSE))

##############################
# SVM Radial ----
##############################
set.seed(476)
#library(kernlab)
sigmaRangeReduced <- sigest(as.matrix(diabTrainTrans[,1:8]))
svmRGridReduced <- expand.grid(.sigma = sigmaRangeReduced[1],
                               .C = 2^(seq(-4, 4)))
svmRTune <- train(x = as.matrix(diabTrainTrans[,1:8]),
                  y = diabTrainTrans$Outcome,
                  method = "svmRadial", #svmLinear #svmPoly
                  metric = "ROC",
                  tuneGrid = svmRGridReduced,
                  fit = FALSE,
                  trControl = ctrl)
svmRTune

test_results$svm <- predict(svmRTune, newdata=diabTestTrans[,-9])

plot(varImp(svmRTune, scale =FALSE))

##############################
# Random Forest ----
##############################

mtryGrid <- data.frame(mtry = 1:8) #since we only have 8 predictors

### Tune the model using cross-validation
set.seed(476)
rfTune <- train(x = as.matrix(diabTrainTrans[,1:8]),
                y = diabTrainTrans$Outcome,
                method = "rf",
                metric = "ROC",
                tuneGrid = mtryGrid,
                ntree = 150,
                importance = TRUE,
                trControl = ctrl)
rfTune

test_results$rf <- predict(rfTune, newdata=diab.test)

plot(rfTune)

plot(varImp(rfTune, scale =FALSE))

# ====================================================
# Predict the test set based on four models ---- 
# ====================================================

#logistic 
diabTestTrans$logistic <- predict(logisticTune,diabTestTrans, type = "prob")[,1]

#PLSDA
diabTestTrans$plsda <- predict(plsdaTune,diabTestTrans, type = "prob")[,1]

#SVM
diabTestTrans$SVMR <- predict(svmRTune, diabTestTrans[,1:8], type = "prob")[,1]

#Random Forest
diabTestTrans$RF <- predict(rfTune,diabTestTrans, type = "prob")[,1]

# ====================================================
# ROC curves ----
# ====================================================

#library(pROC)
#ROC for logistic model
par(mfrow=c(1,1))
logisticROC <- roc(diabTestTrans$Outcome, diabTestTrans$logistic)
plot(logisticROC, col=1, lty=1, lwd=2)

#ROC for PLSDA
plsdaROC <- roc(diabTestTrans$Outcome, diabTestTrans$plsda)
lines(plsdaROC, col=3, lty=3, lwd=2)

#ROC for SVM
SVMROC <- roc(diabTestTrans$Outcome, diabTestTrans$SVMR)
lines(SVMROC, col=8, lty=8, lwd=2)

#ROC for Random Forest
RFROC <- roc(diabTestTrans$Outcome, diabTestTrans$RF)
lines(RFROC, col=4, lty=4, lwd=2)

legend('bottomright', c('logistic','plsda','SVM', 'Random Forest'), col=1:6, lty=1:5,lwd=2)

# ====================================================
# Create the confusion matrix from the test set ----
# ====================================================

#Confusion matrix of logistic model
a=confusionMatrix(data = predict(logisticTune, diabTestTrans), reference = diabTestTrans$Outcome)

#Confusion matrix of partial least squares discriminant analysis
b=confusionMatrix(data = predict(plsdaTune, diabTestTrans), reference = diabTestTrans$Outcome)

#Confusion matrix of SVM
c=confusionMatrix(data = predict(svmRTune, diabTestTrans[,1:8]), reference = diabTestTrans$Outcome)

#Confusion matrix of Random Forest
d=confusionMatrix(data = predict(rfTune, diabTestTrans), reference = diabTestTrans$Outcome)

# put results of confusion matrix into a df
results2 <- cbind(data.frame(logistic = c(a$overall[1:2], a$byClass[1:2])),
                 data.frame(plsda = c(b$overall[1:2], b$byClass[1:2])),
                 data.frame(svmR = c(c$overall[1:2], c$byClass[1:2])),
                 data.frame(rf = c(d$overall[1:2], d$byClass[1:2])))

# transpose df
(results2=t(results2))


# Create dot plot
res1 = resamples(list(Logistic = logisticTune, PLSDA = plsdaTune, 
                      SVM = svmRTune, RandomForest = rfTune ))
dotplot(res1)