3.4 Missing data

3.4.1 Missing at random

3.4.2 Dropping data points

# Drop every row whose motheduc is NA, then inspect what is left of fatheduc.
# NOTE(review): the bare column name inside `[` presumably relies on bwght
# being a data.table -- confirm against where bwght is loaded.
bwght <- bwght[!is.na(motheduc)]
summary(bwght[["fatheduc"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   12.00   12.00   13.19   16.00   18.00
# Complete-case fit: regress log(bwght) on all six predictors, using only the
# rows where fatheduc is present.
model1 <- lm(
  log(bwght) ~ cigs + motheduc + fatheduc + faminc + male + white,
  data = bwght[!is.na(fatheduc)]
)
summary(model1)
## 
## Call:
## lm(formula = log(bwght) ~ cigs + motheduc + fatheduc + faminc + 
##     male + white, data = bwght[!is.na(fatheduc)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61595 -0.08566  0.01800  0.11777  0.83197 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.7115639  0.0346771 135.870  < 2e-16 ***
## cigs        -0.0051484  0.0010297  -5.000  6.6e-07 ***
## motheduc    -0.0036268  0.0029740  -1.219  0.22290    
## fatheduc     0.0033968  0.0026363   1.288  0.19783    
## faminc       0.0004177  0.0003470   1.204  0.22893    
## male         0.0332363  0.0107222   3.100  0.00198 ** 
## white        0.0440468  0.0150772   2.921  0.00355 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1844 on 1184 degrees of freedom
## Multiple R-squared:  0.0429, Adjusted R-squared:  0.03805 
## F-statistic: 8.846 on 6 and 1184 DF,  p-value: 1.859e-09

3.4.3 Dropping variables

# Let step() prune model1, then report the model it settles on.
model1_stepwise <- step(model1)
summary(model1_stepwise)
## 
## Call:
## lm(formula = log(bwght) ~ cigs + male + white, data = bwght[!is.na(fatheduc)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61536 -0.08562  0.01974  0.11813  0.83437 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.718154   0.014876 317.171  < 2e-16 ***
## cigs        -0.005332   0.001001  -5.325 1.21e-07 ***
## male         0.032697   0.010709   3.053 0.002314 ** 
## white        0.049594   0.014733   3.366 0.000787 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1845 on 1187 degrees of freedom
## Multiple R-squared:  0.03962,    Adjusted R-squared:  0.03719 
## F-statistic: 16.32 on 3 and 1187 DF,  p-value: 2.124e-10
# Same regression as model1 but with fatheduc left out of the formula, fitted
# on bwght without the extra row filter.
summary(
  lm(
    log(bwght) ~ cigs + motheduc + faminc + male + white,
    data = bwght
  )
)
## 
## Call:
## lm(formula = log(bwght) ~ cigs + motheduc + faminc + male + white, 
##     data = bwght)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61609 -0.08326  0.01931  0.11961  0.83173 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.7244296  0.0332176 142.227  < 2e-16 ***
## cigs        -0.0052005  0.0010292  -5.053 5.03e-07 ***
## motheduc    -0.0015150  0.0024823  -0.610  0.54176    
## faminc       0.0005228  0.0003374   1.549  0.12155    
## male         0.0335698  0.0107220   3.131  0.00179 ** 
## white        0.0449487  0.0150651   2.984  0.00291 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1845 on 1185 degrees of freedom
## Multiple R-squared:  0.04156,    Adjusted R-squared:  0.03752 
## F-statistic: 10.28 on 5 and 1185 DF,  p-value: 1.166e-09

3.4.4 Imputation

# Mean imputation: fill each missing fatheduc with the mean of the observed
# values so that no row has to be dropped from the regression.
# na.rm = TRUE is essential here -- without it a single NA in fatheduc makes
# mean() return NA, and there would be nothing usable to impute with.
meanfe <- mean(bwght$fatheduc, na.rm = TRUE)

# predfe = fatheduc where it is observed, meanfe where it is missing.
# (Filling with the literal 0 instead would leave meanfe unused and plant an
# out-of-range value in every imputed row.)
bwght$predfe <- ifelse(is.na(bwght$fatheduc), meanfe, bwght$fatheduc)

# Refit the full specification with the imputed column in place of fatheduc.
summary(
  lm(
    log(bwght) ~ cigs + faminc + male + white + motheduc + predfe,
    data = bwght
  )
)
## 
## Call:
## lm(formula = log(bwght) ~ cigs + faminc + male + white + motheduc + 
##     predfe, data = bwght)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61595 -0.08566  0.01800  0.11777  0.83197 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.7115639  0.0346771 135.870  < 2e-16 ***
## cigs        -0.0051484  0.0010297  -5.000  6.6e-07 ***
## faminc       0.0004177  0.0003470   1.204  0.22893    
## male         0.0332363  0.0107222   3.100  0.00198 ** 
## white        0.0440468  0.0150772   2.921  0.00355 ** 
## motheduc    -0.0036268  0.0029740  -1.219  0.22290    
## predfe       0.0033968  0.0026363   1.288  0.19783    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1844 on 1184 degrees of freedom
## Multiple R-squared:  0.0429, Adjusted R-squared:  0.03805 
## F-statistic: 8.846 on 6 and 1184 DF,  p-value: 1.859e-09

3.4.5 Distributional modeling

3.4.6 EM Algorithms

# Iterative re-imputation of predfe.
# Each pass (at most 100):
#   1. fit the regression using the current predfe,
#   2. solve the fitted equation for the predfe value that would reproduce
#      each row's log(bwght) exactly,
#   3. clamp that value to [10, 18],
#   4. keep it only for rows where fatheduc is NA (observed values are
#      restored everywhere else),
#   5. stop once the coefficients have moved by less than 1e-12 (sum of
#      squared changes) since the previous pass.
# clast holds the previous pass's coefficients; 7 = intercept + 6 predictors.
clast <- numeric(7)
for (i in seq_len(100)) {
  mdl <- lm(
    log(bwght) ~ predfe + cigs + faminc + motheduc + male + white,
    data = bwght
  )
  beta <- coef(mdl)
  # beta[2] is the predfe coefficient because predfe is the first term.
  pred <- (log(bwght$bwght) - predict(mdl) + beta[2] * bwght$predfe) / beta[2]
  pred <- pmax(pmin(pred, 18), 10)
  pred <- ifelse(is.na(bwght$fatheduc), pred, bwght$fatheduc)
  bwght$predfe <- pred
  # The check runs after predfe is updated, so the final refit below always
  # sees the latest imputed values.
  if (sum((beta - clast)^2) < 1e-12) {
    break
  }
  clast <- beta
}

# Final fit on the converged (or last-iteration) predfe.
model2 <- lm(
  log(bwght) ~ cigs + faminc + male + white + motheduc + predfe,
  data = bwght
)
summary(model2)
## 
## Call:
## lm(formula = log(bwght) ~ cigs + faminc + male + white + motheduc + 
##     predfe, data = bwght)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61595 -0.08566  0.01800  0.11777  0.83197 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.7115639  0.0346771 135.870  < 2e-16 ***
## cigs        -0.0051484  0.0010297  -5.000  6.6e-07 ***
## faminc       0.0004177  0.0003470   1.204  0.22893    
## male         0.0332363  0.0107222   3.100  0.00198 ** 
## white        0.0440468  0.0150772   2.921  0.00355 ** 
## motheduc    -0.0036268  0.0029740  -1.219  0.22290    
## predfe       0.0033968  0.0026363   1.288  0.19783    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1844 on 1184 degrees of freedom
## Multiple R-squared:  0.0429, Adjusted R-squared:  0.03805 
## F-statistic: 8.846 on 6 and 1184 DF,  p-value: 1.859e-09
# Let step() prune model2, then report the model it settles on.
model2_stepwise <- step(model2)
summary(model2_stepwise)
## 
## Call:
## lm(formula = log(bwght) ~ cigs + male + white, data = bwght)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61536 -0.08562  0.01974  0.11813  0.83437 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.718154   0.014876 317.171  < 2e-16 ***
## cigs        -0.005332   0.001001  -5.325 1.21e-07 ***
## male         0.032697   0.010709   3.053 0.002314 ** 
## white        0.049594   0.014733   3.366 0.000787 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1845 on 1187 degrees of freedom
## Multiple R-squared:  0.03962,    Adjusted R-squared:  0.03719 
## F-statistic: 16.32 on 3 and 1187 DF,  p-value: 2.124e-10