-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRunStudy.R
211 lines (173 loc) · 7.46 KB
/
RunStudy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# table names----
outcome_table_name<-paste0(outcome_table_stem,"_o") # for incidence
prevalent_table_name<-paste0(outcome_table_stem,"_p") # for prevalence
feature_disease_table_name <- paste0(outcome_table_stem,"_fdisease") # for disease features
feature_disease_liver_table_name <- paste0(outcome_table_stem,"_fdiseaseliver") # for disease features
feature_medication_table_name <- paste0(outcome_table_stem,"_fmed") # for med features
if (grepl("CPRD", db.name) == TRUE) {
outcome_table_name_han<-paste0(outcome_table_stem,"_o_han") # for incidence
prevalent_table_name_han<-paste0(outcome_table_stem,"_p_han") # for prevalence
}
# QC plot folders ----
qcfolder <- here::here("3_QC",db.name)
if (!file.exists(qcfolder)){
dir.create(qcfolder, recursive = TRUE)}
# output files ----
if (!file.exists(output.folder)){
dir.create(output.folder, recursive = TRUE)}
# add functions ----
# risk table
RiskSetCount <- function(timeindex, survivaltime) {
atrisk <- NULL
for (t in timeindex)
atrisk <- c(atrisk, sum(survivaltime >= t))
return(atrisk)
}
#exporting the survival results
exportSurvivalResults <- function(result, zipName, outputFolder) {
tempDir <- zipName
tempDirCreated <- FALSE
if (!dir.exists(tempDir)) {
dir.create(tempDir)
tempDirCreated <- TRUE
}
# write results to disk
lapply(names(result), FUN = function(checkResultName) {
checkResult <- result[[checkResultName]]
utils::write.csv(checkResult,
file = file.path(
tempDir,
paste0(checkResultName, ".csv")
),
row.names = FALSE
)
})
zip::zip(zipfile = file.path(outputFolder, paste0(zipName, ".zip")),
files = list.files(tempDir, full.names = TRUE))
if (tempDirCreated) {
unlink(tempDir, recursive = TRUE)
}
invisible(result)
}
#formatting for table 1
nice.num<-function(x) {
trimws(format(round(x,1),
big.mark=",", nsmall = 1, digits=1, scientific=FALSE))}
# printing numbers with 2 decimal place and commas
nice.num2<-function(x) {
trimws(format(round(x,2),
big.mark=",", nsmall = 2, digits=2, scientific=FALSE))}
# for counts- without decimal place
nice.num.count<-function(x) {
trimws(format(x,
big.mark=",", nsmall = 0, digits=1, scientific=FALSE))}
#FUNCTION to extract the data and calculate the correct observation time and event (death) for different calender strata
DataExtraction <- function(dataset){
data <-dataset
#for whole dataset
#make new end of observation period to studyEndDate parameter ----
# data <-dataset %>%
# mutate(endOfObservation = ifelse(observation_period_end_date >= studyEndDate, studyEndDate, NA)) %>%
# mutate(endOfObservation = as.Date(endOfObservation) ) %>%
# mutate(endOfObservation = coalesce(endOfObservation, observation_period_end_date))
#
# # binary death outcome (for survival) ---
# # need to take into account follow up
# # if death date is > database end data set death to 0
# data <-data %>%
# mutate(status= ifelse(!is.na(death_date), 2, 1 )) %>%
# mutate(status= ifelse(death_date > endOfObservation , 1, status )) %>%
# mutate(status= ifelse(is.na(status), 1, status ))
#
# # calculate follow up in years
# data <-data %>%
# mutate(time_days=as.numeric(difftime(endOfObservation,
# outcome_start_date,
# units="days"))) %>%
# mutate(time_years=time_days/365.25)
#
# # take "dataset" and do the code for each calender year
# carry out for calender year
# take year and split into groups based on the data available
#grid <- rev(seq(max(lubridate::year(lubridate::ymd(dataset$outcome_start_date))), min(lubridate::year(lubridate::ymd(dataset$cohort_start_date))),by=-5))
grid <- rev(seq(2019, min(lubridate::year(lubridate::ymd(dataset$cohort_start_date))),by=-5))
# create a tool which creates 5 year age gaps but truncates at last year of study period
# now need to create the start and end dates for each one
startYear <- paste0(grid-4,"-01-01") # first days
endYear <- paste0(grid,"-12-31") # end days (plus 4 to create 5 year bands)
#add on extra times for 2020-21
startYear <- c(startYear , "2020-01-01")
endYear <- c(endYear , "2021-12-31")
# split data into groups of calender year and put it into a list. This will create 4 groups split by calender year
calenderSplitData <- list()
for(w in 1:length(endYear)){
calenderdata <- dataset %>%
filter( outcome_start_date >= startYear[w] &
outcome_start_date <= endYear[w] )
calenderdata <- calenderdata %>%
mutate(endOfObservation = ifelse(observation_period_end_date >= endYear[w], endYear[w], NA)) %>%
mutate(endOfObservation = as.Date(endOfObservation) ) %>%
mutate(endOfObservation = coalesce(endOfObservation, observation_period_end_date))
# binary death outcome (for survival) ---
# need to take into account follow up
# if death date is > database end data set death to 0
calenderdata <- calenderdata %>%
mutate(status= ifelse(!is.na(death_date), 2, 1 )) %>%
mutate(status= ifelse(death_date > endOfObservation , 1, status )) %>%
mutate(status= ifelse(is.na(status), 1, status ))
# calculate follow up in years
calenderdata <- calenderdata %>%
mutate(time_days=as.numeric(difftime(endOfObservation,
outcome_start_date,
units="days"))) %>%
mutate(time_years=time_days/365.25)
calenderSplitData[[w]] <- calenderdata
}
return(c(list(data),calenderSplitData))
}
start<-Sys.time()
# start log ----
log_file <- paste0(output.folder, "/log.txt")
logger <- create.logger()
logfile(logger) <- log_file
level(logger) <- "INFO"
# instantiate study cohorts ----
info(logger, 'INSTANTIATING STUDY COHORTS')
source(here("1_InstantiateCohorts","InstantiateStudyCohorts.R"))
info(logger, 'GOT STUDY COHORTS')
# Run incidence rate analysis ----
info(logger, 'RUNNING INCIDENCE RATE ANALYSIS')
source(here("2_Analysis","IncidenceAnalysis1.R"))
info(logger, 'INCIDENCE RATE ANALYSIS RAN')
#
# Run cohort characterisation analysis ----
info(logger, 'RUNNING COHORT CHARACTERISATION ANALYSIS')
source(here("2_Analysis","CohortCharacteristics2.R"))
info(logger, 'COHORT CHARACTERISATION ANALYSIS RAN')
if(runSurvial == TRUE){
# Run survival analysis -----
info(logger, 'RUNNING SURVIVAL ANALYSIS')
source(here("2_Analysis","SurvivalAnalysis1.R"))
info(logger, 'SURVIVAL ANALYSIS RAN')
}
# Run extra analysis ----
info(logger, 'RUNNING EXTRA ANALYSIS')
source(here("2_Analysis","extra_analysis_lung.R"))
info(logger, 'EXTRA ANALYSIS RAN')
# Run extra analysis ----
info(logger, 'RUNNING EXTRA ANALYSIS')
source(here("2_Analysis","extra_analysis_liver.R"))
info(logger, 'EXTRA ANALYSIS RAN')
# Run extra analysis ----
info(logger, 'RUNNING EXTRA ANALYSIS')
source(here("2_Analysis","extra_analysis_oesophageal.R"))
info(logger, 'EXTRA ANALYSIS RAN')
# Run extra analysis ----
info(logger, 'RUNNING EXTRA ANALYSIS')
source(here("2_Analysis","extra_analysis_crc.R"))
info(logger, 'EXTRA ANALYSIS RAN')
print("Done!")
print("-- If all has worked, there should now be three zip folders with the incidence/prevalence and survival results for whole datasets and calender years in the output folder to share")
print("-- Thank you for running the study! :)")
Sys.time()-start
readLines(log_file)