-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_impute_missing_data.R
186 lines (132 loc) · 5.5 KB
/
02_impute_missing_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#*******************************************************************************
#
# Project: Medical University Innsbruck - Intrauterine insemination (IUI)
# Date: 2020-04-15
# Author: Patrick Rockenschaub
# Purpose: Inspect and impute missing data using MICE
#
#*******************************************************************************
#' ---
#' title: "Inspect and impute missing data"
#' subtitle: "Medical University Innsbruck - Intrauterine insemination (IUI)"
#' output: pdf_document
#' ---
#+ setup, message = FALSE
source("00_init_workspace.R")
source(glue("{.dir_src}/imputation_helpers.R"))
library(nnet)
#+ load-data
data <- read_rds(glue("{.dir_der}/data.rds"))
#' # Describe data distribution and missingness
#+ overall-pattern-of-missingness
invisible(
md.pattern(
data[, names(data)[names(data) != "drug"]],
rotate.names = TRUE,
)
)
#+ define-models
lin_reg <- function(f) glm(f, data = data, family = gaussian) %>% summary()
log_reg <- function(f) glm(f, data = data, family = binomial) %>% summary()
mnl_reg <- function(f) multinom(substitute(f), data = data)
#' ## Diagnoses
#'
#+ describe-diagnosis
summary(data$diagnosis)
(is.na(diagnosis) ~ hcg + age + log(bmi) + log(sperm) + log(amh) + cycle) %>%
log_reg() # Only outcome slightly predictive of missingness
multinom(
diagnosis ~ hcg + age + log(bmi) + log(sperm) + log(amh) + cycle,
data = data
) %>%
summary()
#' ## Anti-mueller hormone (AMH)
#'
summary(data$amh)
hist(data$amh)
hist(log(data$amh))
#' For AMH, no variable predicts whether or not the value is missing. This
#' strengthens the case for it being missing completely at random.
(is.na(amh) ~ hcg + age + log(bmi) + diagnosis + log(sperm) + cycle) %>%
log_reg() # No variable predicts missingness
#' There were however a couple of variables that predicted the value of
#' AMH were it was missing. These included the outcome (+), higher age (-),
#' higher BMI (-), and a diagnosis of anovulation (+).
(log(amh) ~ hcg + age + log(bmi) + diagnosis + log(sperm) + cycle) %>%
lin_reg()
#' ## Sperm count
#'
summary(data$sperm)
hist(data$sperm)
hist(log(data$sperm))
#' For sperm counts, no variable predicts whether or not the value is missing. This
#' strengthens the case for it being missing completely at random.
(is.na(sperm) ~ hcg + age + log(bmi) + diagnosis + log(amh) + cycle) %>%
log_reg()
#' There were however a couple of variables that predicted the value of
#' spermm count were it was missing. These included the outcome (+), higher
#' age of the female patient (+), higher BMI (-), and a female diagnosis of
#' endometriosis or tubal obstruction (+). All other diagnosis also showed a
#' positive but non-significant relationship compared to those patients where
#' no female infertility is known. This reflects the large proportion of
#' patients were male infertility is the underlying problem, as confirmed by
#' additional analyses below.
(log(sperm) ~ hcg + age + log(bmi) + diagnosis + log(amh) + cycle) %>%
lin_reg() # Positive: Age, hcgpositive and all diagnoses compared to no known reason
(log(sperm) ~ hcg + age + log(bmi) + log(amh) + cycle +
I(diagnosis != "no_known_female_inf")) %>%
lin_reg()
(log(sperm) ~ hcg + age + log(bmi) + log(amh) + cycle +
I(diagnosis != "no_known_female_inf") +
I(!is.na(female_male) & female_male %in% c("male", "female and male"))) %>%
lin_reg()
#' ## BMI
#'
summary(data$bmi)
hist(data$bmi)
hist(log(data$bmi))
#' For sperm counts, no variable predicts whether or not the value is missing. This
#' strengthens the case for it being missing completely at random.
(is.na(bmi) ~ hcg + age + log(sperm) + diagnosis + log(amh) + cycle) %>%
log_reg()
#' For BMI, a diagnosis of anovulation (+) and AMH (-) were associated with
#' the observed value.
(log(bmi) ~ hcg + age + log(sperm) + diagnosis + log(amh) + cycle) %>%
lin_reg()
# Follicles
summary(data$follicles)
hist(data$follicles)
#' For follicles, there was only a weak association with cycle number.
(is.na(follicles) ~ hcg + age + log(sperm) + diagnosis + log(amh) + cycle) %>%
log_reg()
#' and no association with any of the other variables
(follicles > 1 ~ hcg + age + log(sperm) + diagnosis + log(amh) + log(bmi) + cycle) %>%
log_reg()
#' # Impute missing values
#'
#' Before imputation, we create separate variables for AMH, sperm count, and
#' BMI that are log-transformed versions of the original variables in order
#' to make more normally distributed. These logged versions will be imputed
#' via the MICE algorithm (using PMM) and the original variables will then
#' be passively imputed via an inverse transformation (= exp()). In addition,
#' a new variable known_male_factor is created with is only used in the
#' imputation of sperm count.
#'
#+ transform-for-imputation
data <- data %>%
add_imputation_variables()
#' Using the newly added variables, we perform multiple imputation of 10
#' imputed datasets with 20 iterations of the algorithm for convergence.
#+ impute
imp <- run_imputation(data, m = 10, maxit = 20, print = TRUE, seed = 42)
#' After the algorithm finishes, we inspect the means and standard deviations
#' of each actively imputed variable for convergence and ensure ourselves that
#' the imputed values appear sensible.
#+ inspect-convergence
plot(imp)
#+ inspect-imputed-value
bwplot(imp, log_bmi + log_sperm + log_amh ~ .imp)
densityplot(imp, .imp ~ log_bmi + log_sperm + log_amh)
#' Finally, we save the imputed datasets.
#+ save-data
write_rds(imp, glue("{.dir_der}/imp.rds"))