-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPredict Diabetes Diagnosis
259 lines (196 loc) · 8.71 KB
/
Predict Diabetes Diagnosis
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#detach all loaded packages
invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload = F))
library(caret)
library(MASS)
library(dplyr)
library(randomForest)
library(kernlab)
library(corrplot)
library(GGally)
library(pROC)
diabetes0 = read.csv("https://raw.githubusercontent.com/melissaalejandro/Class-Projects/main/diabetes.csv")
# ====================================================
# Data Overview ----
# ====================================================
# Predictors include number of pregnancies, glucose, Diastolic blood pressure (mm HG), skin fold thickness on triceps (mm), 2 hour serum insulin
# level (mu U/ml), body mass index, diabetes pedigree function, and age. The Target variable is labeled as 'Outcome' where 0 means 'no' (does not
# have diabetes) & 1 means yes (does have diabetes). The data consists of all females, age 21 and older who are of Pima Indian heritage.
# Summary statistics
summary(diabetes0)
# check the number of 0's in each column
cbind(lapply(colSums(diabetes0[1:8] == 0), sum))
## Filtering the data
# The variables BMI, Insulin, SkinThickness, BloodPressure, and Glucose have minimum values of zero. It's important to note that there are certain
# medical conditions where diastolic blood pressure, insulin levels, and glucose levels can be at zero. However, cases where a patient has a 0 BMI or
# 0 mm of skin thickness are extremely unlikely. Therefore, any rows with zeroes in SkinThickness and BMI will be removed.
# keep only rows where bmi and skin thickness are >0
diabetes = diabetes0 %>%
filter(diabetes0$BMI > 0 & diabetes0$SkinThickness > 0)
summary(diabetes)
str(diabetes)
# ====================================================
# Data Exploration ----
# ====================================================
# Variable relationship
ggpairs(diabetes,
lower = list(continuous = wrap('smooth', color='red', alpha=0.7, size=0.8)),
diag = list(continuous = wrap("densityDiag", fill='deepskyblue', alpha=0.9)),
upper = list(continuous = wrap('cor', color='black'))) # moderate correlation with glucose (0.464)
# correlation plot
corrplot(cor(diabetes), method="square", type = 'full', order = 'hclust', tl.col="black", tl.cex = .35, tl.srt=45, addCoef.col = 'black',number.cex=0.6)
# Boxplot Outliers
boxplot(diabetes, col = "deepskyblue")
summary(diabetes)
# Histogram: distribution of data
par(mfrow = c(3, 3))
for (i in 1:9) {
hist(diabetes[ ,i], xlab = names(diabetes[i]), main = paste(names(diabetes[i]), "Histogram"), col="deepskyblue")
}
# ====================================================
# Build the Model ----
# ====================================================
## Change labels
##### Label 0 -> No and 1 -> Yes #####
diabetes$Outcome[diabetes$Outcome==0] = "No"
diabetes$Outcome[diabetes$Outcome==1] = "Yes"
diabetes$Outcome = as.factor(diabetes$Outcome)
# split data
set.seed(100)
inTrain <- createDataPartition(as.matrix(diabetes[,9]), p = .8, list=FALSE)
diab.train = diabetes[inTrain,]
diab.test = diabetes[-inTrain,]
# preprocess
diab.TrainPP <- preProcess(diab.train, method = c("BoxCox", "scale", "center","nzv","spatialSign"))
diab.TestPP <- preProcess(diab.test, method = c("BoxCox", "scale", "center","nzv","spatialSign"))
# transform the dataset using the parameters ('predict' applies the transformations)
diabTrainTrans <- predict(diab.TrainPP, diab.train)
diabTestTrans <- predict(diab.TestPP, diab.test)
# check dist after transformation
par(mfrow = c(3, 3))
for (i in 1:8) {
hist(diabetes[ ,i], xlab = names(diabetes[i]), main = paste(names(diabetes[i]), "Histogram"), col="deepskyblue")
}
#control the computational nuances of the train function
set.seed(100)
ctrl <- trainControl(method = "LGOCV",
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
# ====================================================
# Test Results ----
# ====================================================
##############################
# Logistic Regression ----
##############################
set.seed(5)
logisticTune <- train(x = diabTrainTrans[,1:8],
y = diabTrainTrans$Outcome,
method = "glm",
metric = "ROC",
trControl = ctrl)
logisticTune
# variable importance
plot(varImp(logisticTune, scale =FALSE))
test_results <- data.frame(obs = diabTestTrans$Outcome,
logistic = predict(logisticTune, diabTestTrans))
##############################
# PLSDA Regression ----
##############################
set.seed(476)
plsdaTune <- train(x = diabTrainTrans[,1:8],
y = diabTrainTrans$Outcome,
method = "pls",
metric = "ROC",
tuneGrid = expand.grid(.ncomp = 1:5),
trControl = ctrl)
plsdaTune
test_results$plsda <- predict(plsdaTune, newdata=diab.test[,-9])
plot(plsdaTune)
plot(varImp(plsdaTune, scale =FALSE))
##############################
# SVM Radial ----
##############################
set.seed(476)
#library(kernlab)
sigmaRangeReduced <- sigest(as.matrix(diabTrainTrans[,1:8]))
svmRGridReduced <- expand.grid(.sigma = sigmaRangeReduced[1],
.C = 2^(seq(-4, 4)))
svmRTune <- train(x = as.matrix(diabTrainTrans[,1:8]),
y = diabTrainTrans$Outcome,
method = "svmRadial", #svmLinear #svmPoly
metric = "ROC",
tuneGrid = svmRGridReduced,
fit = FALSE,
trControl = ctrl)
svmRTune
test_results$svm <- predict(svmRTune, newdata=diabTestTrans[,-9])
plot(varImp(svmRTune, scale =FALSE))
##############################
# Random Forest ----
##############################
mtryGrid <- data.frame(mtry = 1:8) #since we only have 8 predictors
### Tune the model using cross-validation
set.seed(476)
rfTune <- train(x = as.matrix(diabTrainTrans[,1:8]),
y = diabTrainTrans$Outcome,
method = "rf",
metric = "ROC",
tuneGrid = mtryGrid,
ntree = 150,
importance = TRUE,
trControl = ctrl)
rfTune
test_results$rf <- predict(rfTune, newdata=diab.test)
plot(rfTune)
plot(varImp(rfTune, scale =FALSE))
# ====================================================
# Predict the test set based on four models ----
# ====================================================
#logistic
diabTestTrans$logistic <- predict(logisticTune,diabTestTrans, type = "prob")[,1]
#PLSDA
diabTestTrans$plsda <- predict(plsdaTune,diabTestTrans, type = "prob")[,1]
#SVM
diabTestTrans$SVMR <- predict(svmRTune, diabTestTrans[,1:8], type = "prob")[,1]
#Random Forest
diabTestTrans$RF <- predict(rfTune,diabTestTrans, type = "prob")[,1]
# ====================================================
# ROC curves ----
# ====================================================
#library(pROC)
#ROC for logistic model
par(mfrow=c(1,1))
logisticROC <- roc(diabTestTrans$Outcome, diabTestTrans$logistic)
plot(logisticROC, col=1, lty=1, lwd=2)
#ROC for PLSDA
plsdaROC <- roc(diabTestTrans$Outcome, diabTestTrans$plsda)
lines(plsdaROC, col=3, lty=3, lwd=2)
#ROC for SVM
SVMROC <- roc(diabTestTrans$Outcome, diabTestTrans$SVMR)
lines(SVMROC, col=8, lty=8, lwd=2)
#ROC for Random Forest
RFROC <- roc(diabTestTrans$Outcome, diabTestTrans$RF)
lines(RFROC, col=4, lty=4, lwd=2)
legend('bottomright', c('logistic','plsda','SVM', 'Random Forest'), col=1:6, lty=1:5,lwd=2)
# ====================================================
# Create the confusion matrix from the test set ----
# ====================================================
#Confusion matrix of logistic model
a=confusionMatrix(data = predict(logisticTune, diabTestTrans), reference = diabTestTrans$Outcome)
#Confusion matrix of partial least squares discriminant analysis
b=confusionMatrix(data = predict(plsdaTune, diabTestTrans), reference = diabTestTrans$Outcome)
#Confusion matrix of SVM
c=confusionMatrix(data = predict(svmRTune, diabTestTrans[,1:8]), reference = diabTestTrans$Outcome)
#Confusion matrix of Random Forest
d=confusionMatrix(data = predict(rfTune, diabTestTrans), reference = diabTestTrans$Outcome)
# put results of confusion matrix into a df
results2 <- cbind(data.frame(logistic = c(a$overall[1:2], a$byClass[1:2])),
data.frame(plsda = c(b$overall[1:2], b$byClass[1:2])),
data.frame(svmR = c(c$overall[1:2], c$byClass[1:2])),
data.frame(rf = c(d$overall[1:2], d$byClass[1:2])))
# transpose df
(results2=t(results2))
# Create dot plot
res1 = resamples(list(Logistic = logisticTune, PLSDA = plsdaTune,
SVM = svmRTune, RandomForest = rfTune ))
dotplot(res1)