library(tidyverse)
12 Simulations
12.1 The Components of a Simulation
12.1.1 Setting up Xs (exogenous variables)
Typically we use mvrnorm() in the MASS package to produce a set of \(X\) that have some specified population means, variances, and covariances. In our example, we will simulate 3 variables with differing means, variances = 1, and covariances = 0. Of course, you could do whatever you need here.
- Specify population means
means <- c(4, 6, 10)
- Variance/covariance matrix. Can create this for uncorrelated Xs with variances of 1 using diag(). Or can specify specific variances and covariances
sigma <- diag(3)

sigma
[,1] [,2] [,3]
[1,] 1 0 0
[2,] 0 1 0
[3,] 0 0 1
- Here we demonstrate use of matrix() to specify sigma directly (but with the same values as above)
sigma <- matrix(c(1, 0, 0,
                  0, 1, 0,
                  0, 0, 1),
                nrow = 3, byrow = TRUE)
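If we wanted correlated Xs, we could instead put non-zero values in the off-diagonal cells. Here is a minimal sketch; the name sigma_corr and the specific values are arbitrary and only for illustration (with variances of 1 on the diagonal, the off-diagonal covariances are also the correlations):

sigma_corr <- matrix(c(1.0, 0.3, 0.2,
                       0.3, 1.0, 0.4,
                       0.2, 0.4, 1.0),
                     nrow = 3, byrow = TRUE)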
Now we can generate data for a specific \(N\) (in this case, 1000)
n_obs <- 1000

x <- MASS::mvrnorm(n = n_obs, mu = means, Sigma = sigma) |>
  magrittr::set_colnames(str_c("x", 1:length(means))) |>
  as_tibble()

x |> head()
# A tibble: 6 × 3
x1 x2 x3
<dbl> <dbl> <dbl>
1 2.35 6.40 9.32
2 5.01 4.64 10.4
3 3.07 8.07 9.19
4 2.86 8.09 8.98
5 2.70 6.43 9.34
6 3.26 4.82 9.17
x |> summarize(across(everything(), list(mean = mean, var = var)))
# A tibble: 1 × 6
x1_mean x1_var x2_mean x2_var x3_mean x3_var
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 3.95 1.01 5.96 1.01 9.99 1.00
cor(x)
x1 x2 x3
x1 1.00000000 0.03712225 -0.01892683
x2 0.03712225 1.00000000 0.03761251
x3 -0.01892683 0.03761251 1.00000000
12.1.2 Defining a Data Generating Process (DGP) for a Quantitative Y
We start with dataframe \(x\)
- Add an intercept as first column
x <- x |>
  mutate(x0 = 1) |>
  relocate(x0)
head(x)
# A tibble: 6 × 4
x0 x1 x2 x3
<dbl> <dbl> <dbl> <dbl>
1 1 2.35 6.40 9.32
2 1 5.01 4.64 10.4
3 1 3.07 8.07 9.19
4 1 2.86 8.09 8.98
5 1 2.70 6.43 9.34
6 1 3.26 4.82 9.17
- Set the population parameters (regression coefficients). This includes \(b_0\) as the first entry
b <- c(100, 2, 5, 1)
- Set standard deviation of error
e <- 10
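Putting these together, the DGP we are simulating is \(y = b_0 x_0 + b_1 x_1 + b_2 x_2 + b_3 x_3 + \epsilon\), where \(x_0 = 1\) and \(\epsilon\) is normally distributed with mean 0 and standard deviation \(e\).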
- Calculate Y and put it all in a dataframe
d <- x |>
  mutate(y = rowSums(t(t(x) * b)) + rnorm(n_obs,
                                          mean = 0,
                                          sd = e))
head(d)
# A tibble: 6 × 5
x0 x1 x2 x3 y
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2.35 6.40 9.32 147.
2 1 5.01 4.64 10.4 136.
3 1 3.07 8.07 9.19 149.
4 1 2.86 8.09 8.98 130.
5 1 2.70 6.43 9.34 154.
6 1 3.26 4.82 9.17 156.
12.1.3 Wrap it all in a function
get_data <- function(s, n_obs, means, sigma, b, e) {

  # simulate the Xs and add the intercept column
  x <- MASS::mvrnorm(n = n_obs, mu = means, Sigma = sigma) |>
    magrittr::set_colnames(str_c("x", 1:length(means))) |>
    as_tibble() |>
    mutate(x0 = 1) |>
    relocate(x0)

  # apply the DGP for y and label the simulation number
  x |>
    mutate(y = rowSums(t(t(x) * b)) + rnorm(n_obs, 0, e)) |>
    mutate(s = s) |>
    relocate(s)
}
Now we can use this function to simulate a data set with any properties
get_data(1,
         n_obs = 1000,
         means = c(4, 6, 10),
         sigma = diag(3),
         b = c(100, 2, 5, 1),
         e = 10)
# A tibble: 1,000 × 6
s x0 x1 x2 x3 y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 4.47 7.32 8.19 153.
2 1 1 4.24 4.82 8.17 151.
3 1 1 4.32 5.28 11.5 135.
4 1 1 3.86 5.55 9.71 137.
5 1 1 4.70 5.88 9.92 134.
6 1 1 1.83 5.81 9.87 137.
7 1 1 3.81 6.39 10.0 139.
8 1 1 4.50 6.38 11.0 168.
9 1 1 3.21 7.71 11.2 153.
10 1 1 3.87 6.70 10.0 159.
# ℹ 990 more rows
12.1.4 Iterate over simulations
Let's pretend we want 100 simulated datasets
- It is important to set a seed to make these datasets reproducible
- We can use map() (or future_map(); see the sketch after the output below) to iterate over get_data()
- We can use list columns for this so we get a dataframe with a row for each simulated dataset and the full dataset in each entry of the data column
set.seed(123456) # any fixed value works; it just makes the simulated datasets reproducible

n_sims <- 100 # this would generally be much higher!

n_obs <- 1000
means <- c(4, 6, 10)
sigma <- diag(3)
b <- c(100, 2, 5, 1)
e <- 10
sims <- 1:n_sims |>
  map(\(s) get_data(s, n_obs, means, sigma, b, e)) |>
  list_rbind() |>
  nest(.by = s,
       .key = "data")
head(sims)
# A tibble: 6 × 2
s data
<int> <list>
1 1 <tibble [1,000 × 5]>
2 2 <tibble [1,000 × 5]>
3 3 <tibble [1,000 × 5]>
4 4 <tibble [1,000 × 5]>
5 5 <tibble [1,000 × 5]>
6 6 <tibble [1,000 × 5]>
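If the simulations were slow, the map() call could be swapped for furrr::future_map() with essentially the same code. Here is a minimal sketch, assuming the furrr package is installed; the name sims_parallel and the worker count are placeholders, and the .options argument gives each simulation its own reproducible random seed:

library(furrr)
plan(multisession, workers = 4)

sims_parallel <- 1:n_sims |>
  future_map(\(s) get_data(s, n_obs, means, sigma, b, e),
             .options = furrr_options(seed = TRUE)) |>
  list_rbind() |>
  nest(.by = s,
       .key = "data")

plan(sequential)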
We can now use this df with list columns to calculate whatever we are testing for our simulation.
In this trivial example, let's show that the parameter estimate for \(x_1\) matches what we set. We will also use list columns to store intermediate results (e.g., the lm)
- Write a function that uses our simulated data to estimate \(b_1\) (Note that we could have used an anonymous function for this, but I think it's cleaner to have a named function)
fit_lm <- function(d){
  lm(y ~ x1 + x2 + x3, data = d)
}
- Now fit this model for each simulated dataset and save model in list column
- Notice we now have a column that stores these models
sims <- sims |>
  mutate(model = map(data, fit_lm))
head(sims)
# A tibble: 6 × 3
s data model
<int> <list> <list>
1 1 <tibble [1,000 × 5]> <lm>
2 2 <tibble [1,000 × 5]> <lm>
3 3 <tibble [1,000 × 5]> <lm>
4 4 <tibble [1,000 × 5]> <lm>
5 5 <tibble [1,000 × 5]> <lm>
6 6 <tibble [1,000 × 5]> <lm>
Now let's extract \(b_1\) from the models and save the estimates in their own column. We can write a function for this and use broom::tidy()
get_b1 <- function(model){
  model |>
    broom::tidy() |>
    filter(term == "x1") |>
    pull(estimate)
}
sims <- sims |>
  mutate(b1 = map(model, get_b1))
head(sims)
# A tibble: 6 × 4
s data model b1
<int> <list> <list> <list>
1 1 <tibble [1,000 × 5]> <lm> <dbl [1]>
2 2 <tibble [1,000 × 5]> <lm> <dbl [1]>
3 3 <tibble [1,000 × 5]> <lm> <dbl [1]>
4 4 <tibble [1,000 × 5]> <lm> <dbl [1]>
5 5 <tibble [1,000 × 5]> <lm> <dbl [1]>
6 6 <tibble [1,000 × 5]> <lm> <dbl [1]>
Now we can unnest the column for b1 and do analyses. Of course, we could unnest other columns if we needed access to the simulated data or the linear models we fit
sims <- sims |>
  unnest(b1)
head(sims)
# A tibble: 6 × 4
s data model b1
<int> <list> <list> <dbl>
1 1 <tibble [1,000 × 5]> <lm> 2.42
2 2 <tibble [1,000 × 5]> <lm> 2.52
3 3 <tibble [1,000 × 5]> <lm> 1.87
4 4 <tibble [1,000 × 5]> <lm> 1.86
5 5 <tibble [1,000 × 5]> <lm> 2.09
6 6 <tibble [1,000 × 5]> <lm> 1.75
# mean and standard error of sampling distribution
sims |>
  summarize(mean = mean(b1), sd = sd(b1))
# A tibble: 1 × 2
mean sd
<dbl> <dbl>
1 2.03 0.315
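As a rough sanity check, with uncorrelated Xs that each have variance 1, the analytic standard error of \(b_1\) is approximately \(e / \sqrt{n_{obs}} = 10 / \sqrt{1000} \approx 0.32\), which lines up with the standard deviation of the sampling distribution above.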
# plot of the sampling distribution
sims |>
  ggplot(mapping = aes(x = b1)) +
  geom_histogram(bins = 30)
12.2 Other Extensions and Loose Ends
12.2.1 Binary Y
In the example above, we simulated a quantitative Y that was a linear function of 3 Xs. Often, we need to simulate binary outcomes. This is easy too.
We typically simulate binary outcomes using the logistic function. You could use other functional forms, but this one works well for most of our needs.
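Concretely, for our three Xs the probability of the positive class is
\[ p = \frac{e^{b_0 + b_1 x_1 + b_2 x_2 + b_3 x_3}}{1 + e^{b_0 + b_1 x_1 + b_2 x_2 + b_3 x_3}} \]
which is exactly what the calc_p() function below computes.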
We can use our same X
head(x)
# A tibble: 6 × 4
x0 x1 x2 x3
<dbl> <dbl> <dbl> <dbl>
1 1 2.35 6.40 9.32
2 1 5.01 4.64 10.4
3 1 3.07 8.07 9.19
4 1 2.86 8.09 8.98
5 1 2.70 6.43 9.34
6 1 3.26 4.82 9.17
- But now the DGP for Y is different. First we need to calculate the probability of the positive class as a function of X
calc_p <- function(d, b0, b1, b2, b3){
  exp(b0 + b1*d$x1 + b2*d$x2 + b3*d$x3) /
    (1 + exp(b0 + b1*d$x1 + b2*d$x2 + b3*d$x3))
}
- Now we can apply this to the rows of x to get d
d <- x |>
  mutate(p = calc_p(x, 1, 2, 3, 4))
head(d)
# A tibble: 6 × 5
x0 x1 x2 x3 p
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2.35 6.40 9.32 1
2 1 5.01 4.64 10.4 1
3 1 3.07 8.07 9.19 1
4 1 2.86 8.09 8.98 1
5 1 2.70 6.43 9.34 1
6 1 3.26 4.82 9.17 1
- And now we can use the binomial distribution to get a binary outcome based on these probabilities
d <- d |>
  mutate(y = rbinom(1000, 1, p))
head(d)
# A tibble: 6 × 6
x0 x1 x2 x3 p y
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 2.35 6.40 9.32 1 1
2 1 5.01 4.64 10.4 1 1
3 1 3.07 8.07 9.19 1 1
4 1 2.86 8.09 8.98 1 1
5 1 2.70 6.43 9.34 1 1
6 1 3.26 4.82 9.17 1 1
- And of course, we can wrap this all up into one function to simulate these data
get_binary_data <- function(s, n_obs, means, sigma, b) {

  # simulate the Xs and add the intercept column
  x <- MASS::mvrnorm(n = n_obs, mu = means, Sigma = sigma) |>
    magrittr::set_colnames(str_c("x", 1:length(means))) |>
    as_tibble() |>
    mutate(x0 = 1) |>
    relocate(x0)

  # logistic function: probability of the positive class
  calc_p <- function(x, b){
    exp(b[1] + b[2]*x$x1 + b[3]*x$x2 + b[4]*x$x3) /
      (1 + exp(b[1] + b[2]*x$x1 + b[3]*x$x2 + b[4]*x$x3))
  }

  # binary y from the binomial distribution; label the simulation number
  x |>
    mutate(p = calc_p(x, b)) |>
    mutate(y = rbinom(nrow(x), 1, p)) |>
    mutate(s = s) |>
    relocate(s)
}
From here, you can now pick up as before for the quantitative example
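For instance, here is a minimal sketch of that workflow; the name d_bin is just for illustration, and the coefficients are chosen so that p is not pinned at 1 (as it was with b = c(1, 2, 3, 4) above):

d_bin <- get_binary_data(1,
                         n_obs = 1000,
                         means = c(4, 6, 10),
                         sigma = diag(3),
                         b = c(-26, 1, 2, 1))

glm(y ~ x1 + x2 + x3, data = d_bin, family = binomial) |>
  broom::tidy()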
12.2.2 Fixing the variance of Y to a set value
- Insert notes from Markus
- Talk with Markus about the need for a scaled Y and the possible use of mvrnorm() for all variables
12.2.3 Converting between correlation and covariance matrices
MASS::mvrnorm() takes sigma (the variance/covariance matrix). Sometimes, we might want to think in terms of correlations.
Here is a function to convert a correlation matrix to a variance/covariance matrix
cor2cov <- function(r, sd){
  diag(sd) %*% r %*% diag(sd)
}
r <- matrix(c(1.0, 0.4, 0.3,
              0.4, 1.0, 0.5,
              0.3, 0.5, 1.0),
            nrow = 3, byrow = TRUE)

sd <- c(5, 10, 2)
cor2cov(r, sd)
[,1] [,2] [,3]
[1,] 25 20 3
[2,] 20 100 10
[3,] 3 10 4
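As a quick usage sketch, this covariance matrix can be passed straight to MASS::mvrnorm() (or to get_data()); the name x_corr is just for illustration, and cor() on the result should approximately recover r:

x_corr <- MASS::mvrnorm(n = 1000,
                        mu = c(4, 6, 10),
                        Sigma = cor2cov(r, sd)) |>
  magrittr::set_colnames(str_c("x", 1:3)) |>
  as_tibble()

cor(x_corr)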