Missing data
Missing at random
Dropping data points
# Keep only the rows where mother's education is observed.
# NOTE(review): the bare column name inside `[` presumably relies on bwght
# being a data.table -- confirm against the chunk that loads the data.
bwght <- bwght[!is.na(motheduc), ]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 12.00 12.00 13.19 16.00 18.00
# Complete-case regression: log birth weight on the full set of regressors,
# fitted only on the rows where father's education is observed.
model1 <- lm(
  log(bwght) ~ cigs + motheduc + fatheduc + faminc + male + white,
  data = bwght[!is.na(fatheduc)]
)
summary(model1)
##
## Call:
## lm(formula = log(bwght) ~ cigs + motheduc + fatheduc + faminc +
## male + white, data = bwght[!is.na(fatheduc)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61595 -0.08566 0.01800 0.11777 0.83197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7115639 0.0346771 135.870 < 2e-16 ***
## cigs -0.0051484 0.0010297 -5.000 6.6e-07 ***
## motheduc -0.0036268 0.0029740 -1.219 0.22290
## fatheduc 0.0033968 0.0026363 1.288 0.19783
## faminc 0.0004177 0.0003470 1.204 0.22893
## male 0.0332363 0.0107222 3.100 0.00198 **
## white 0.0440468 0.0150772 2.921 0.00355 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1844 on 1184 degrees of freedom
## Multiple R-squared: 0.0429, Adjusted R-squared: 0.03805
## F-statistic: 8.846 on 6 and 1184 DF, p-value: 1.859e-09
Dropping variables
##
## Call:
## lm(formula = log(bwght) ~ cigs + male + white, data = bwght[!is.na(fatheduc)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61536 -0.08562 0.01974 0.11813 0.83437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.718154 0.014876 317.171 < 2e-16 ***
## cigs -0.005332 0.001001 -5.325 1.21e-07 ***
## male 0.032697 0.010709 3.053 0.002314 **
## white 0.049594 0.014733 3.366 0.000787 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1845 on 1187 degrees of freedom
## Multiple R-squared: 0.03962, Adjusted R-squared: 0.03719
## F-statistic: 16.32 on 3 and 1187 DF, p-value: 2.124e-10
# Alternative to dropping rows: leave fatheduc out of the specification and
# fit on every row of bwght.
summary(
  lm(
    log(bwght) ~ cigs + motheduc + faminc + male + white,
    data = bwght
  )
)
##
## Call:
## lm(formula = log(bwght) ~ cigs + motheduc + faminc + male + white,
## data = bwght)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61609 -0.08326 0.01931 0.11961 0.83173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7244296 0.0332176 142.227 < 2e-16 ***
## cigs -0.0052005 0.0010292 -5.053 5.03e-07 ***
## motheduc -0.0015150 0.0024823 -0.610 0.54176
## faminc 0.0005228 0.0003374 1.549 0.12155
## male 0.0335698 0.0107220 3.131 0.00179 **
## white 0.0449487 0.0150651 2.984 0.00291 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1845 on 1185 degrees of freedom
## Multiple R-squared: 0.04156, Adjusted R-squared: 0.03752
## F-statistic: 10.28 on 5 and 1185 DF, p-value: 1.166e-09
Imputation
# Mean imputation for father's education.
# na.rm = TRUE is required: without it mean() returns NA as soon as the column
# contains a single missing value, which is exactly the case handled here.
meanfe <- mean(bwght$fatheduc, na.rm = TRUE)
# predfe equals fatheduc where it is observed and the observed mean elsewhere
# (previously the missing entries were filled with a literal 0 and meanfe was
# never used).
bwght$predfe <- ifelse(is.na(bwght$fatheduc), meanfe, bwght$fatheduc)
# Refit the full specification with the imputed column in place of fatheduc.
summary(
  lm(
    log(bwght) ~ cigs + faminc + male + white + motheduc + predfe,
    data = bwght
  )
)
##
## Call:
## lm(formula = log(bwght) ~ cigs + faminc + male + white + motheduc +
## predfe, data = bwght)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61595 -0.08566 0.01800 0.11777 0.83197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7115639 0.0346771 135.870 < 2e-16 ***
## cigs -0.0051484 0.0010297 -5.000 6.6e-07 ***
## faminc 0.0004177 0.0003470 1.204 0.22893
## male 0.0332363 0.0107222 3.100 0.00198 **
## white 0.0440468 0.0150772 2.921 0.00355 **
## motheduc -0.0036268 0.0029740 -1.219 0.22290
## predfe 0.0033968 0.0026363 1.288 0.19783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1844 on 1184 degrees of freedom
## Multiple R-squared: 0.0429, Adjusted R-squared: 0.03805
## F-statistic: 8.846 on 6 and 1184 DF, p-value: 1.859e-09
Distributional modeling
EM Algorithms
# Iterative (EM-style) imputation of father's education.
# Each pass (1) refits the regression using the current predfe and
# (2) replaces predfe, for rows with missing fatheduc only, by the value that
# would make the fitted log(bwght) equal the observed one, limited to [10, 18].
# clast holds the previous pass's coefficients (7 = intercept + 6 regressors).
clast <- rep(0, 7)
for (i in 1:100) {
  mdl <- lm(
    log(bwght) ~ predfe + cigs + faminc + motheduc + male + white,
    data = bwght
  )
  # predfe is the first regressor, so its slope is the second coefficient.
  # Solve the fitted equation for predfe with the residual set to zero.
  pred <- (log(bwght$bwght) - predict(mdl) + coef(mdl)[2] * bwght$predfe) /
    coef(mdl)[2]
  # Clamp the implied values to the range 10..18.
  pred <- pmin(pmax(pred, 10), 18)
  # Observed fatheduc always wins; only missing entries take the implied value.
  pred <- ifelse(is.na(bwght$fatheduc), pred, bwght$fatheduc)
  bwght$predfe <- pred
  # Stop once the coefficients have essentially stopped moving between passes.
  if (sum((coef(mdl) - clast)^2) < 1e-12) {
    break
  }
  clast <- coef(mdl)
}
# Final fit on the imputed column left by the last pass of the loop.
model2 <- lm(
  log(bwght) ~ cigs + faminc + male + white + motheduc + predfe,
  data = bwght
)
summary(model2)
##
## Call:
## lm(formula = log(bwght) ~ cigs + faminc + male + white + motheduc +
## predfe, data = bwght)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61595 -0.08566 0.01800 0.11777 0.83197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7115639 0.0346771 135.870 < 2e-16 ***
## cigs -0.0051484 0.0010297 -5.000 6.6e-07 ***
## faminc 0.0004177 0.0003470 1.204 0.22893
## male 0.0332363 0.0107222 3.100 0.00198 **
## white 0.0440468 0.0150772 2.921 0.00355 **
## motheduc -0.0036268 0.0029740 -1.219 0.22290
## predfe 0.0033968 0.0026363 1.288 0.19783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1844 on 1184 degrees of freedom
## Multiple R-squared: 0.0429, Adjusted R-squared: 0.03805
## F-statistic: 8.846 on 6 and 1184 DF, p-value: 1.859e-09
##
## Call:
## lm(formula = log(bwght) ~ cigs + male + white, data = bwght)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61536 -0.08562 0.01974 0.11813 0.83437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.718154 0.014876 317.171 < 2e-16 ***
## cigs -0.005332 0.001001 -5.325 1.21e-07 ***
## male 0.032697 0.010709 3.053 0.002314 **
## white 0.049594 0.014733 3.366 0.000787 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1845 on 1187 degrees of freedom
## Multiple R-squared: 0.03962, Adjusted R-squared: 0.03719
## F-statistic: 16.32 on 3 and 1187 DF, p-value: 2.124e-10