Skip to content

Commit 1e16802

Browse files
authored
Merge pull request #225 from kgoldfeld/216-nonrandom-distribution-returns-a-single-value-when-repeated-values-are-expected
216 nonrandom distribution returns a single value when repeated values are expected
2 parents 93e72f8 + aa853cc commit 1e16802

11 files changed

+254
-20
lines changed

CITATION.cff

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
# -----------------------------------------------------------
2-
# CITATION file created with {cffr} R package, v1.0.0
1+
# --------------------------------------------
2+
# CITATION file created with {cffr} R package
33
# See also: https://docs.ropensci.org/cffr/
4-
# -----------------------------------------------------------
4+
# --------------------------------------------
55

66
cff-version: 1.2.0
77
message: 'To cite package "simstudy" in publications use:'
@@ -49,7 +49,7 @@ preferred-citation:
4949
repository: https://CRAN.R-project.org/package=simstudy
5050
repository-code: https://github.com/kgoldfeld/simstudy
5151
url: https://kgoldfeld.github.io/simstudy/
52-
date-released: '2023-11-22'
52+
date-released: '2024-05-10'
5353
contact:
5454
- family-names: Goldfeld
5555
given-names: Keith
@@ -568,3 +568,4 @@ references:
568568
identifiers:
569569
- type: url
570570
value: https://kgoldfeld.github.io/simstudy/dev/
571+

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ Type: Package
22
Package: simstudy
33
Title: Simulation of Study Data
44
Version: 0.7.1.9000
5-
Date: 2023-11-22
5+
Date: 2024-05-10
66
Authors@R:
77
c(person(given = "Keith",
88
family = "Goldfeld",

NEWS.md

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# simstudy (development version)
22

33
## New features
4+
* added the option to specify a customized distribution in `defData` and `defDataAdd` by
5+
specifying `dist = "custom"`.
46
*`addPeriods` now includes a new argument `periodVec` that allows users to designate
57
specific measurement time periods using vector.
68

R/define_data.R

+1
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,7 @@ defSurv <- function(dtDefs = NULL,
727727
.isValidArithmeticFormula(newform, defVars)
728728
.isValidArithmeticFormula(variance, defVars)
729729
},
730+
custom = {},
730731
stop("Unknown distribution.")
731732
)
732733

R/generate_dist.R

+35
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#' @return A data.frame with the updated simulated data
1313
#' @noRd
1414
.generate <- function(args, n, dfSim, idname, envir = parent.frame()) {
15+
1516
newColumn <- switch(args$dist,
1617
beta = .genbeta(
1718
n = n,
@@ -54,6 +55,13 @@
5455
variance = args$variance,
5556
envir = envir
5657
),
58+
custom = .gencustom(
59+
n = n,
60+
fn = args$formula,
61+
args = args$variance,
62+
dtSim = copy(dfSim),
63+
envir = envir
64+
),
5765
exponential = .genexp(
5866
n = n,
5967
formula = args$formula,
@@ -327,6 +335,7 @@
327335
# @return A data.frame column with the updated simulated data
328336

329337
.gendeterm <- function(n, formula, link, dtSim, envir) {
338+
330339
new <- .evalWith(formula, .parseDotVars(formula, envir), dtSim, n)
331340

332341
if (link == "log") {
@@ -336,6 +345,32 @@
336345
new
337346
}
338347

348+
# Internal function called by .generate - returns non-random data
349+
#
350+
# @param n The number of observations required in the data set
351+
# @param fn String that specifies the custom function
352+
# @param args String of comma delimited arguments that will be passed to fn
353+
# @param dtSim Incomplete simulated data.table
354+
# @return A data.frame column with the updated simulated data
355+
356+
.gencustom <- function(n, fn, args, dtSim, envir) {
357+
358+
args <- gsub("\\s+", "", args) # remove any spaces
359+
arg_l <- as.list(unlist(strsplit(args, ",")))
360+
arg_l <- lapply(arg_l, function(a) as.list(unlist(strsplit(a, "="))))
361+
362+
var_vec <- unlist(lapply(arg_l, function(a) a[[1]]))
363+
arg_list <- lapply(arg_l,
364+
function(a) .evalWith(a[[2]], .parseDotVars( a[[2]], envir ), dtSim, n)
365+
)
366+
names(arg_list) <- var_vec
367+
assertNotInVector("n", names(arg_list))
368+
arg_list <- c(n = n, arg_list)
369+
new <- do.call(fn, arg_list, envir = envir)
370+
371+
new
372+
}
373+
339374
# Internal function called by .generate - returns exp data
340375
#
341376
# @param n The number of observations required in the data set

R/internal_utility.R

+3-1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
extVars,
5252
dtSim = data.table(),
5353
n = nrow(dtSim)) {
54+
5455
if (missing(dtSim) && missing(n)) {
5556
n <- 1
5657
}
@@ -198,7 +199,8 @@
198199
"exponential",
199200
"mixture",
200201
"trtAssign",
201-
"clusterSize"
202+
"clusterSize",
203+
"custom"
202204
)
203205
}
204206

man/defData.Rd

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-generate_dist.R

+29
Original file line numberDiff line numberDiff line change
@@ -170,3 +170,32 @@ test_that("clusterSize data is generated as expected.", {
170170
expect_true(dt2[, var(test)] > dt1[, var(test)])
171171

172172
})
173+
174+
175+
# .gencustom ----
176+
test_that("custom data is generated as expected.", {
177+
skip_on_cran()
178+
179+
trunc_norm <- function(n, lower, upper, mu = 0, s = 1.5) {
180+
181+
F.a <- pnorm(lower, mean = mu, sd = s)
182+
F.b <- pnorm(upper, mean = mu, sd = s)
183+
184+
u <- runif(n, min = F.a, max = F.b)
185+
qnorm(u, mean = mu, sd = s)
186+
187+
}
188+
189+
def <-
190+
defData(varname = "x", formula = 5, dist = "poisson") |>
191+
defData(varname = "y", formula = "trunc_norm",
192+
variance = "s = 100, lower = x - 1, upper = x + 1",
193+
dist = "custom"
194+
)
195+
196+
dd <- genData(10000, def)
197+
198+
expect_true( dd[, min(y)] > dd[, min(x-1)])
199+
expect_true( dd[, max(y)] < dd[, max(x+1)])
200+
201+
})

tests/testthat/test-internal_utility.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ test_that("probabilities (matrix) are adjusted as documented.", {
8080
# .getDists ----
8181
test_that("number of Dists is up to date.", {
8282
skip_on_cran()
83-
expect_length(.getDists(), 16)
83+
expect_length(.getDists(), 17)
8484
})
8585

8686
# .isFormulaScalar ----

vignettes/customdist.Rmd

+159
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
---
2+
title: "Customized Distributions"
3+
output: rmarkdown::html_vignette
4+
vignette: >
5+
%\VignetteIndexEntry{Customized Distributions}
6+
%\VignetteEngine{knitr::rmarkdown}
7+
\usepackage[utf8]{inputenc}
8+
---
9+
10+
```{r chunkname, echo=-1}
11+
data.table::setDTthreads(2)
12+
```
13+
14+
```{r, echo = FALSE, message = FALSE}
15+
library(simstudy)
16+
library(ggplot2)
17+
library(scales)
18+
library(grid)
19+
library(gridExtra)
20+
library(survival)
21+
library(gee)
22+
library(data.table)
23+
library(ordinal)
24+
25+
odds <- function (p) p/(1 - p) # TODO temporary remove when added to package
26+
plotcolors <- c("#B84226", "#1B8445", "#1C5974")
27+
28+
cbbPalette <- c("#B84226","#B88F26", "#A5B435", "#1B8446",
29+
"#B87326","#B8A526", "#6CA723", "#1C5974")
30+
31+
ggtheme <- function(panelback = "white") {
32+
33+
ggplot2::theme(
34+
panel.background = element_rect(fill = panelback),
35+
panel.grid = element_blank(),
36+
axis.ticks = element_line(colour = "black"),
37+
panel.spacing =unit(0.25, "lines"), # requires package grid
38+
panel.border = element_rect(fill = NA, colour="gray90"),
39+
plot.title = element_text(size = 8,vjust=.5,hjust=0),
40+
axis.text = element_text(size=8),
41+
axis.title = element_text(size = 8)
42+
)
43+
44+
}
45+
46+
```
47+
48+
Custom distributions can be specified in `defData` and `defDataAdd` by setting the argument *dist* to "custom". When defining a custom distribution, you provide the name of the user-defined function as a string in the *formula* argument. The arguments of the custom function are listed in the *variance* argument, separated by commas and formatted as "**arg_1 = val_form_1, arg_2 = val_form_2, $\dots$, arg_K = val_form_K**".
49+
50+
Here, the *arg_k's* represent the names of the arguments passed to the customized function, where $k$ ranges from $1$ to $K$. You can use values or formulas for each *val_form_k*. If formulas are used, ensure that the variables have been previously generated. Double dot notation is available in specifying *value_formula_k*. One important requirement of the custom function is that the parameter list used to define the function must include an argument"**n = n**", but do not include $n$ in the definition as part of `defData` or `defDataAdd`.
51+
52+
### Example 1
53+
54+
Here is an example where we would like to generate data from a zero-inflated beta distribution. In this case, there is a user-defined function `zeroBeta` that takes on shape parameters $a$ and $b$, as well as $p_0$, the proportion of the sample that is zero. Note that the function also takes an argument $n$ that will not to be be specified in the data definition; $n$ will represent the number of observations being generated:
55+
56+
```{r}
57+
zeroBeta <- function(n, a, b, p0) {
58+
betas <- rbeta(n, a, b)
59+
is.zero <- rbinom(n, 1, p0)
60+
betas*!(is.zero)
61+
}
62+
```
63+
64+
The data definition specifies a new variable $zb$ that sets $a$ and $b$ to 0.75, and $p_0 = 0.02$:
65+
66+
```{r}
67+
def <- defData(
68+
varname = "zb",
69+
formula = "zeroBeta",
70+
variance = "a = 0.75, b = 0.75, p0 = 0.02",
71+
dist = "custom"
72+
)
73+
```
74+
75+
The data are generated:
76+
77+
```{r}
78+
set.seed(1234)
79+
dd <- genData(100000, def)
80+
```
81+
82+
```{r, echo = FALSE}
83+
dd
84+
```
85+
86+
A plot of the data reveals dis-proportion of zero's:
87+
88+
```{r, fig.width = 6, fig.height = 3, echo = FALSE}
89+
ggplot(data = dd, aes(x = zb)) +
90+
geom_histogram(binwidth = 0.01, boundary = 0, fill = "grey60") +
91+
theme(panel.grid = element_blank())
92+
```
93+
94+
### Example 2
95+
96+
In this second example, we are generating sets of truncated Gaussian distributions with means ranging from $-1$ to $1$. The limits of the truncation vary across three different groups. `rnormt` is a customized (user-defined) function that generates the truncated Gaussiandata. The function requires four arguments (the left truncation value, the right truncation value, the distribution average and the standard deviation).
97+
98+
```{r}
99+
rnormt <- function(n, min, max, mu, s) {
100+
101+
F.a <- pnorm(min, mean = mu, sd = s)
102+
F.b <- pnorm(max, mean = mu, sd = s)
103+
104+
u <- runif(n, min = F.a, max = F.b)
105+
qnorm(u, mean = mu, sd = s)
106+
107+
}
108+
```
109+
110+
111+
In this example, truncation limits vary based on group membership. Initially, three groups are created, followed by the generation of truncated values. For Group 1, truncation occurs within the range of $-1$ to $1$, for Group 2, it's $-2$ to $2$ and for Group 3, it's $-3$ to $3$. We'll generate three data sets, each with a distinct mean denoted by M, using the double-dot notation to implement these different means.
112+
113+
```{r}
114+
def <-
115+
defData(
116+
varname = "limit",
117+
formula = "1/4;1/2;1/4",
118+
dist = "categorical"
119+
) |>
120+
defData(
121+
varname = "tn",
122+
formula = "rnormt",
123+
variance = "min = -limit, max = limit, mu = ..M, s = 1.5",
124+
dist = "custom"
125+
)
126+
```
127+
128+
The data generation requires three calls to `genData`. The output is a list of three data sets:
129+
130+
```{r}
131+
mus <- c(-1, 0, 1)
132+
dd <-lapply(mus, function(M) genData(100000, def))
133+
```
134+
135+
Here are the first six observations from each of the three data sets:
136+
137+
```{r, echo=FALSE}
138+
lapply(dd, function(D) head(D))
139+
```
140+
141+
A plot highlights the group differences.
142+
143+
```{r, fig.width = 8, fig.height = 6, echo = FALSE}
144+
pfunc <- function(dx, i) {
145+
ggplot(data = dx, aes(x = tn)) +
146+
geom_histogram(aes(fill = factor(limit)), binwidth = 0.05, boundary = 0, alpha = .8) +
147+
facet_grid( ~ limit) +
148+
theme(panel.grid = element_blank(),
149+
legend.position = "none") +
150+
scale_fill_manual(values = plotcolors) +
151+
scale_x_continuous(breaks = seq(-3, 3, by =1)) +
152+
scale_y_continuous(limits = c(0, 1000)) +
153+
ggtitle(paste("mu =", mus[i]))
154+
}
155+
156+
plist <- lapply(seq_along(dd), function(a) pfunc(dd[[a]], a))
157+
grid.arrange(grobs = plist, nrow = 3)
158+
```
159+

vignettes/simstudy.Rmd

+17-12
Original file line numberDiff line numberDiff line change
@@ -190,17 +190,18 @@ d[[2]] <- data.table("binary", "probability", "both", "-", "-", "X", "-", "X")
190190
d[[3]] <- data.table("binomial", "probability", "both", "-", "# of trials", "X", "-", "X")
191191
d[[4]] <- data.table("categorical", "probability", "string", "p_1;p_2;...;p_n", "a;b;c", "X", "-", "-")
192192
d[[5]] <- data.table("clusterSize", "total N", "both", "-", "dispersion", "X", "-", "-")
193-
d[[6]] <- data.table("exponential", "mean", "both", "-", "-", "X", "X", "-")
194-
d[[7]] <- data.table("gamma", "mean", "both", "-", "dispersion", "X", "X", "-")
195-
d[[8]] <- data.table("mixture", "formula", "string", "x_1 | p_1 + ... + x_n | p_n", "-", "X", "-", "-")
196-
d[[9]] <- data.table("negBinomial", "mean", "both", "-", "dispersion", "X", "X", "-")
197-
d[[10]] <- data.table("nonrandom", "formula", "both", "-", "-", "X", "-", "-")
198-
d[[11]] <- data.table("normal", "mean", "both", "-", "variance", "X", "-", "-")
199-
d[[12]] <- data.table("noZeroPoisson", "mean", "both", "-", "-", "X", "X", "-")
200-
d[[13]] <- data.table("poisson", "mean", "both", "-", "-", "X", "X", "-")
201-
d[[14]] <- data.table("trtAssign", "ratio", "string", "r_1;r_2;...;r_n", "stratification", "X", "X", "-")
202-
d[[15]] <- data.table("uniform", "range", "string", "from ; to", "-", "X", "-", "-")
203-
d[[16]] <- data.table("uniformInt", "range", "string", "from ; to", "-", "X", "-", "-")
193+
d[[6]] <- data.table("custom", "function", "string", "-", "arguments", "X", "-", "-")
194+
d[[7]] <- data.table("exponential", "mean", "both", "-", "-", "X", "X", "-")
195+
d[[8]] <- data.table("gamma", "mean", "both", "-", "dispersion", "X", "X", "-")
196+
d[[9]] <- data.table("mixture", "formula", "string", "x_1 | p_1 + ... + x_n | p_n", "-", "X", "-", "-")
197+
d[[10]] <- data.table("negBinomial", "mean", "both", "-", "dispersion", "X", "X", "-")
198+
d[[11]] <- data.table("nonrandom", "formula", "both", "-", "-", "X", "-", "-")
199+
d[[12]] <- data.table("normal", "mean", "both", "-", "variance", "X", "-", "-")
200+
d[[13]] <- data.table("noZeroPoisson", "mean", "both", "-", "-", "X", "X", "-")
201+
d[[14]] <- data.table("poisson", "mean", "both", "-", "-", "X", "X", "-")
202+
d[[15]] <- data.table("trtAssign", "ratio", "string", "r_1;r_2;...;r_n", "stratification", "X", "X", "-")
203+
d[[16]] <- data.table("uniform", "range", "string", "from ; to", "-", "X", "-", "-")
204+
d[[17]] <- data.table("uniformInt", "range", "string", "from ; to", "-", "X", "-", "-")
204205
205206
206207
d <- rbindlist(d)
@@ -226,7 +227,11 @@ A *categorical* distribution is a discrete data distribution taking on values fr
226227

227228
#### clusterSize
228229

229-
The *clusterSize* distribution allocates a total sample size *N* (specified in the *formula* argument) across the *k* rows of the data.table such that the sum of the rows equals *N*. If the *dispersion* argument is set to 0, the allocation to each row is *N/k*, with some rows getting an increment of 1 to ensure that the sum is N. It is possible to relax the assumption of balanced cluster sizes by setting *dispersion > 0*. As the dispersion, the variability of cluster sizes across clusters increases.
230+
The *clusterSize* distribution allocates a total sample size *N* (specified in the *formula* argument) across the *k* rows of the data.table such that the sum of the rows equals *N*. If the *dispersion* argument is set to 0, the allocation to each row is *N/k*, with some rows getting an increment of 1 to ensure that the sum is N. It is possible to relax the assumption of balanced cluster sizes by setting *dispersion > 0*. As the dispersion increases, the variability of cluster sizes across clusters increases.
231+
232+
#### custom
233+
234+
Custom distributions can be specified in `defData` and `defDataAdd` by setting the argument *dist* to "custom". When defining a custom distribution, provide the name of the user-defined function as a string in the *formula* argument. The arguments of the custom function are listed in the *variance* argument, separated by commas and formatted as "**arg_1 = val_form_1, arg_2 = val_form_2, $\dots$, arg_K = val_form_K**". The *arg_k's* represent the names of the arguments passed to the customized function, where $k$ ranges from $1$ to $K$. Values or formulas can be used for each *val_form_k*. If formulas are used, ensure that the variables have been previously generated. Double dot notation is available in specifying *value_formula_k*. One important requirement of the custom function is that the parameter list used to define the function must include an argument"**n = n**", but do not include $n$ in the definition as part of `defData` or `defDataAdd`.
230235

231236
#### exponential
232237

0 commit comments

Comments
 (0)