1
1
# ' Define a theoretical distribution
2
- # '
3
- # ' @description
4
- # '
2
+ # '
3
+ # ' @description
4
+ # '
5
5
# ' This function allows the user to define a null distribution based on
6
6
# ' theoretical methods. In many infer pipelines, `assume()` can be
7
7
# ' used in place of [generate()] and [calculate()] to create a null
8
- # ' distribution. Rather than outputting a data frame containing a
9
- # ' distribution of test statistics calculated from resamples of the observed
8
+ # ' distribution. Rather than outputting a data frame containing a
9
+ # ' distribution of test statistics calculated from resamples of the observed
10
10
# ' data, `assume()` outputs a more abstract type of object just containing
11
11
# ' the distributional details supplied in the `distribution` and `df` arguments.
12
12
# ' However, `assume()` output can be passed to [visualize()], [get_p_value()],
13
13
# ' and [get_confidence_interval()] in the same way that simulation-based
14
14
# ' distributions can.
15
- # '
15
+ # '
16
16
# ' To define a theoretical null distribution (for use in hypothesis testing),
17
17
# ' be sure to provide a null hypothesis via [hypothesize()]. To define a
18
- # ' theoretical sampling distribution (for use in confidence intervals),
18
+ # ' theoretical sampling distribution (for use in confidence intervals),
19
19
# ' provide the output of [specify()]. Sampling distributions (only
20
20
# ' implemented for `t` and `z`) lie on the scale of the data, and will be
21
21
# ' recentered and rescaled to match the corresponding `stat` given in
22
22
# ' [calculate()] to calculate the observed statistic.
23
- # '
23
+ # '
24
24
# ' @param x The output of [specify()] or [hypothesize()], giving the
25
25
# ' observed data, variable(s) of interest, and (optionally) null hypothesis.
26
26
# ' @param distribution The distribution in question, as a string. One of
27
27
# ' `"F"`, `"Chisq"`, `"t"`, or `"z"`.
28
28
# ' @param df Optional. The degrees of freedom parameter(s) for the `distribution`
29
29
# ' supplied, as a numeric vector. For `distribution = "F"`, this should have
30
- # ' length two (e.g. `c(10, 3)`). For `distribution = "Chisq"` or
31
- # ' `distribution = "t"`, this should have length one. For
32
- # ' `distribution = "z"`, this argument is not required. The package
33
- # ' will supply a message if the supplied `df` argument is different from
30
+ # ' length two (e.g. `c(10, 3)`). For `distribution = "Chisq"` or
31
+ # ' `distribution = "t"`, this should have length one. For
32
+ # ' `distribution = "z"`, this argument is not required. The package
33
+ # ' will supply a message if the supplied `df` argument is different from
34
34
# ' recognized values. See the Details section below for more information.
35
35
# ' @param ... Currently ignored.
36
- # '
36
+ # '
37
37
# ' @return An infer theoretical distribution that can be passed to helpers
38
- # ' like [visualize()], [get_p_value()], and [get_confidence_interval()].
39
- # '
40
- # ' @details
41
- # '
38
+ # ' like [visualize()], [get_p_value()], and [get_confidence_interval()].
39
+ # '
40
+ # ' @details
41
+ # '
42
42
# ' Note that the assumption being expressed here, for use in theory-based
43
- # ' inference, only extends to _distributional_ assumptions: the null
44
- # ' distribution in question and its parameters. Statistical inference with
45
- # ' infer, whether carried out via simulation (i.e. based on pipelines
46
- # ' using [generate()] and [calculate()]) or theory (i.e. with `assume()`),
43
+ # ' inference, only extends to _distributional_ assumptions: the null
44
+ # ' distribution in question and its parameters. Statistical inference with
45
+ # ' infer, whether carried out via simulation (i.e. based on pipelines
46
+ # ' using [generate()] and [calculate()]) or theory (i.e. with `assume()`),
47
47
# ' always involves the condition that observations are independent of
48
48
# ' each other.
49
- # '
50
- # ' `infer` only supports theoretical tests on one or two means via the
49
+ # '
50
+ # ' `infer` only supports theoretical tests on one or two means via the
51
51
# ' `t` distribution and one or two proportions via the `z`.
52
- # '
53
- # ' For tests comparing two means, if `n1` is the group size for one level of
52
+ # '
53
+ # ' For tests comparing two means, if `n1` is the group size for one level of
54
54
# ' the explanatory variable, and `n2` is that for the other level, `infer`
55
- # ' will recognize the following degrees of freedom (`df`) arguments:
56
- # '
55
+ # ' will recognize the following degrees of freedom (`df`) arguments:
56
+ # '
57
57
# ' * `min(n1 - 1, n2 - 1)`
58
58
# ' * `n1 + n2 - 2`
59
59
# ' * The `"parameter"` entry of the analogous `stats::t.test()` call
60
60
# ' * The `"parameter"` entry of the analogous `stats::t.test()` call with `var.equal = TRUE`
61
- # '
62
- # ' By default, the package will use the `"parameter"` entry of the analogous
61
+ # '
62
+ # ' By default, the package will use the `"parameter"` entry of the analogous
63
63
# ' `stats::t.test()` call with `var.equal = FALSE` (the default).
64
- # '
65
- # ' @examples
64
+ # '
65
+ # ' @examples
66
66
# ' # construct theoretical distributions ---------------------------------
67
- # '
67
+ # '
68
68
# ' # F distribution
69
69
# ' # with the `partyid` explanatory variable
70
- # ' gss %>%
71
- # ' specify(age ~ partyid) %>%
70
+ # ' gss %>%
71
+ # ' specify(age ~ partyid) %>%
72
72
# ' assume(distribution = "F")
73
- # '
73
+ # '
74
74
# ' # Chi-squared goodness of fit distribution
75
75
# ' # on the `finrela` variable
76
76
# ' gss %>%
83
83
# ' "far above average" = 1/6,
84
84
# ' "DK" = 1/6)) %>%
85
85
# ' assume("Chisq")
86
- # '
86
+ # '
87
87
# ' # Chi-squared test of independence
88
88
# ' # on the `finrela` and `sex` variables
89
89
# ' gss %>%
90
90
# ' specify(formula = finrela ~ sex) %>%
91
91
# ' assume(distribution = "Chisq")
92
- # '
92
+ # '
93
93
# ' # T distribution
94
- # ' gss %>%
94
+ # ' gss %>%
95
95
# ' specify(age ~ college) %>%
96
96
# ' assume("t")
97
- # '
97
+ # '
98
98
# ' # Z distribution
99
99
# ' gss %>%
100
100
# ' specify(response = sex, success = "female") %>%
101
101
# ' assume("z")
102
- # '
102
+ # '
103
103
# ' \dontrun{
104
104
# ' # each of these distributions can be passed to infer helper
105
105
# ' # functions alongside observed statistics!
106
- # '
106
+ # '
107
107
# ' # for example, a 1-sample t-test -------------------------------------
108
- # '
109
- # ' # calculate the observed statistic
108
+ # '
109
+ # ' # calculate the observed statistic
110
110
# ' obs_stat <- gss %>%
111
111
# ' specify(response = hours) %>%
112
112
# ' hypothesize(null = "point", mu = 40) %>%
113
113
# ' calculate(stat = "t")
114
- # '
114
+ # '
115
115
# ' # construct a null distribution
116
116
# ' null_dist <- gss %>%
117
117
# ' specify(response = hours) %>%
118
118
# ' assume("t")
119
- # '
119
+ # '
120
120
# ' # juxtapose them visually
121
- # ' visualize(null_dist) +
121
+ # ' visualize(null_dist) +
122
122
# ' shade_p_value(obs_stat, direction = "both")
123
- # '
123
+ # '
124
124
# ' # calculate a p-value
125
125
# ' get_p_value(null_dist, obs_stat, direction = "both")
126
- # '
126
+ # '
127
127
# ' # or, an F test ------------------------------------------------------
128
- # '
129
- # ' # calculate the observed statistic
130
- # ' obs_stat <- gss %>%
128
+ # '
129
+ # ' # calculate the observed statistic
130
+ # ' obs_stat <- gss %>%
131
131
# ' specify(age ~ partyid) %>%
132
132
# ' hypothesize(null = "independence") %>%
133
133
# ' calculate(stat = "F")
134
- # '
134
+ # '
135
135
# ' # construct a null distribution
136
- # ' null_dist <- gss %>%
136
+ # ' null_dist <- gss %>%
137
137
# ' specify(age ~ partyid) %>%
138
138
# ' assume(distribution = "F")
139
- # '
139
+ # '
140
140
# ' # juxtapose them visually
141
- # ' visualize(null_dist) +
141
+ # ' visualize(null_dist) +
142
142
# ' shade_p_value(obs_stat, direction = "both")
143
- # '
143
+ # '
144
144
# ' # calculate a p-value
145
145
# ' get_p_value(null_dist, obs_stat, direction = "both")
146
146
# ' }
147
- # '
147
+ # '
148
148
# ' @export
149
149
assume <- function (x , distribution , df = NULL , ... ) {
150
150
if (! inherits(x , " infer" )) {
@@ -153,11 +153,11 @@ assume <- function(x, distribution, df = NULL, ...) {
153
153
" likely `specify()` or `hypothesize()`."
154
154
)
155
155
}
156
-
156
+
157
157
# check that `distribution` aligns with what is expected from
158
158
# `x` and that `distribution` and `df` are consistent with each other
159
159
df <- check_distribution(x , distribution , df , ... )
160
-
160
+
161
161
structure(
162
162
glue_null(
163
163
" {distribution_desc(distribution)} distribution{df_desc(df)}." ,
@@ -184,19 +184,19 @@ assume <- function(x, distribution, df = NULL, ...) {
184
184
# check that the distribution is well-specified
185
185
check_distribution <- function (x , distribution , df , ... ) {
186
186
dist <- tolower(distribution )
187
-
187
+
188
188
if (! dist %in% c(" f" , " chisq" , " t" , " z" )) {
189
189
stop_glue(
190
190
' The distribution argument must be one of "Chisq", "F", "t", or "z".'
191
191
)
192
192
}
193
-
193
+
194
194
if ((dist == " f" && attr(x , " theory_type" ) != " ANOVA" ) ||
195
- (dist == " chisq" && ! attr(x , " theory_type" ) %in% c(" Chi-square test of indep" ,
195
+ (dist == " chisq" && ! attr(x , " theory_type" ) %in% c(" Chi-square test of indep" ,
196
196
" Chi-square Goodness of Fit" )) ||
197
- (dist == " t" && ! attr(x , " theory_type" ) %in% c(" One sample t" ,
197
+ (dist == " t" && ! attr(x , " theory_type" ) %in% c(" One sample t" ,
198
198
" Two sample t" )) ||
199
- (dist == " z" && ! attr(x , " theory_type" ) %in% c(" One sample prop z" ,
199
+ (dist == " z" && ! attr(x , " theory_type" ) %in% c(" One sample prop z" ,
200
200
" Two sample props z" ))) {
201
201
if (has_explanatory(x )) {
202
202
msg_tail <- glue_null(
@@ -206,33 +206,33 @@ check_distribution <- function(x, distribution, df, ...) {
206
206
} else {
207
207
msg_tail <- " no explanatory variable."
208
208
}
209
-
209
+
210
210
stop_glue(
211
211
' The supplied distribution "{distribution}" is not well-defined for a ' ,
212
212
" {get_stat_type_desc(attr(x, 'type_desc_response'))} response " ,
213
213
" variable ({response_name(x)}) and " , msg_tail
214
214
)
215
215
}
216
-
216
+
217
217
if (! is.numeric(df ) && ! is.null(df )) {
218
218
stop_glue(
219
219
" `assume()` expects the `df` argument to be a numeric vector, " ,
220
220
" but you supplied a {list(class(df))} object."
221
221
)
222
222
}
223
-
223
+
224
224
if (length(list (... )) != 0 ) {
225
225
plural <- length(list (... )) != 1
226
226
dots <- list (... )
227
-
227
+
228
228
stop_glue(
229
229
" `assume()` ignores the dots `...` argument, though the " ,
230
230
" argument{if (plural) 's' else ''} `{list(dots)}` " ,
231
231
" {if (plural) 'were' else 'was'} supplied. Did you forget to " ,
232
232
" concatenate the `df` argument with `c()`?"
233
233
)
234
234
}
235
-
235
+
236
236
if (dist_df_length(distribution ) != length(df ) && ! is.null(df )) {
237
237
plural <- length(df ) != 1
238
238
stop_glue(
@@ -242,9 +242,9 @@ check_distribution <- function(x, distribution, df, ...) {
242
242
' {if (plural) "were" else "was"} supplied.'
243
243
)
244
244
}
245
-
245
+
246
246
df <- determine_df(x , dist , df )
247
-
247
+
248
248
return (df )
249
249
}
250
250
@@ -286,12 +286,12 @@ df_desc <- function(df) {
286
286
" "
287
287
} else {
288
288
plural <- length(df ) != 1
289
-
289
+
290
290
paste0(
291
- ' with ' ,
292
- if (plural ) {paste0(round(df ), collapse = " and " )} else {round(df )},
293
- ' degree' ,
294
- if (! plural && df == 1 ) {' ' } else {' s' },
291
+ ' with ' ,
292
+ if (plural ) {paste0(round(df ), collapse = " and " )} else {round(df )},
293
+ ' degree' ,
294
+ if (! plural && df == 1 ) {' ' } else {' s' },
295
295
' of freedom' )
296
296
}
297
297
}
@@ -317,18 +317,18 @@ determine_df <- function(x, dist, df) {
317
317
" for `df` is correct (see `?assume` for recognized values) or " ,
318
318
" supply `df = NULL` to `assume()`."
319
319
)
320
-
320
+
321
321
return (df )
322
322
}
323
-
323
+
324
324
if (is.null(df )) {
325
325
df <- acceptable_dfs(x )
326
326
}
327
-
327
+
328
328
if (attr(x , " theory_type" ) == " Two sample t" ) {
329
329
df <- df [1 ]
330
330
}
331
-
331
+
332
332
df
333
333
}
334
334
@@ -339,16 +339,16 @@ acceptable_dfs <- function(x) {
339
339
# t.test param with var.equal = FALSE
340
340
unname(
341
341
unlist(
342
- attr(x , " distr_param" ) <-
343
- stats :: t.test(response_variable(x ) ~
342
+ attr(x , " distr_param" ) <-
343
+ stats :: t.test(response_variable(x ) ~
344
344
explanatory_variable(x ))[[" parameter" ]]
345
345
)
346
346
),
347
347
# t.test param with var.equal = TRUE
348
348
unname(
349
349
unlist(
350
- attr(x , " distr_param" ) <-
351
- stats :: t.test(response_variable(x ) ~
350
+ attr(x , " distr_param" ) <-
351
+ stats :: t.test(response_variable(x ) ~
352
352
explanatory_variable(x ),
353
353
var.equal = TRUE )[[" parameter" ]]
354
354
)
@@ -368,7 +368,7 @@ acceptable_dfs <- function(x) {
368
368
)
369
369
} else {
370
370
c(
371
- unname(unlist(attr(x , " distr_param" ))),
371
+ unname(unlist(attr(x , " distr_param" ))),
372
372
unname(unlist(attr(x , " distr_param2" )))
373
373
)
374
374
}
0 commit comments