##------------------------------------------------------------------------## ## Script for web appendix on nonparametric regression ## ## An R and S-PLUS Companion to Applied Regression ## ## John Fox ## ## Sage Publications, 2002 ## ##------------------------------------------------------------------------## # preliminaries options(width=65) options(digits=5) library(car) # for data sets ## Local polynomial regression # How it works tricube <- function(z) { ifelse (abs(z) < 1, (1 - (abs(z))^3)^3, 0) } data(Prestige) attach(Prestige) par(mfrow=c(2,2)) plot(income, prestige, xlab="Average Income", ylab="Prestige", type='n', main="(a)") ord <- order(income) inc <- income[ord] pre <- prestige[ord] x0 <- inc[80] diffs <- abs(inc - x0) which.diff <- sort(diffs)[50] abline(v=c(x0-which.diff, x0+which.diff), lty=2) abline(v=x0) points(inc[diffs > which.diff], pre[diffs > which.diff], pch=16, col=gray(.75)) points(inc[diffs <= which.diff], pre[diffs <= which.diff]) x.n <- inc[diffs <= which.diff] y.n <- pre[diffs <= which.diff] plot(range(income), c(0,1), xlab="Average Income", ylab="Tricube Weight", type='n', main="(b)") abline(v=c(x0-which.diff, x0+which.diff), lty=2) abline(v=x0) xwts <- seq(x0-which.diff, x0+which.diff, len=250) lines(xwts, tricube((xwts-x0)/which.diff), lty=1, lwd=2) points(x.n, tricube((x.n - x0)/which.diff)) plot(income, prestige, xlab="Average Income", ylab="Prestige", type='n', main="(c)") abline(v=c(x0-which.diff, x0+which.diff), lty=2) abline(v=x0) points(x.n, y.n) mod <- lm(y.n ~ x.n, weights=tricube((x.n-x0)/which.diff)) reg.line(mod, lwd=2, col=1) points(x0, predict(mod, data.frame(x.n=x0)), pch=16, cex=1.8) plot(income, prestige, xlab="Average Income", ylab="Prestige", main="(d)") lines(lowess(income, prestige, f=0.5, iter=0), lwd=2) # multiple regression library(modreg) mod.lo <- loess(prestige ~ income + education, span=.5, degree=1) summary(mod.lo) inc <- seq(min(income), max(income), len=25) ed <- seq(min(education), max(education), len=25) newdata <- expand.grid(income=inc,education=ed) fit.prestige <- matrix(predict(mod.lo, newdata), 25, 25) persp(inc, ed, fit.prestige, theta=45, phi=30, ticktype='detailed', xlab='Income', ylab='Education', zlab='Prestige', expand=2/3, shade=0.5) mod.lo.inc <- loess(prestige ~ income, span=.7, degree=1) mod.lo.ed <- loess(prestige ~ education, span=.7, degree=1) anova(mod.lo.inc, mod.lo) anova(mod.lo.ed, mod.lo) # smoothing splines plot(income, prestige) inc.100 <- seq(min(income), max(income), len=100) pres <- predict(mod.lo.inc, data.frame(income=inc.100)) lines(inc.100, pres, lty=2, lwd=2) mod.lo.inc lines(smooth.spline(income, prestige, df=3.85), lwd=2) # additive regression models library(mgcv) mod.gam <- gam(prestige ~ s(income) + s(education)) mod.gam fit.prestige <- matrix(predict(mod.gam, newdata), 25, 25) persp(inc, ed, fit.prestige, theta=45, phi=30, ticktype='detailed', xlab='Income', ylab='Education', zlab='Prestige', expand=2/3, shade=0.5) par(mfrow=c(1,2)) plot(mod.gam) fit.prestige <- matrix(predict(mod.gam, newdata), 25, 25) persp(inc, ed, fit.prestige, theta=45, phi=30, ticktype='detailed', xlab='Income', ylab='Education', zlab='Prestige', expand=2/3, shade=0.5) # generalized nonparametric regression detach(Prestige) remove(list=objects()) # clean up everything data(Mroz) attach(Mroz) k5f <- factor(k5) k618f <- factor(k618) mod.1 <- gam(lfp ~ s(age) + s(inc) + k5f + k618f + wc + hc, family=binomial) summary(mod.1) par(mfrow=c(1,2)) plot(mod.1) mod.2 <- gam(lfp ~ age + s(inc) + k5f + k618f + wc + hc, family=binomial) anova(mod.2, mod.1, test='Chisq') mod.3 <- gam(lfp ~ s(age) + inc + k5f + k618f + wc + hc, family=binomial) anova(mod.3, mod.1, test='Chisq') mod.4 <- gam(lfp ~ s(inc) + k5f + k618f + wc + hc, # omits age family=binomial) anova(mod.4, mod.1, test='Chisq') ## Some methods for gam objects in the R mgcv package summary.gam <- function(object, smooth.coef=FALSE, ...){ result <- list() result$call <- object$call result$family <- object$family terms <- terms(formula(object)) trms <- attr(terms, "term.labels") vars <- attr(terms, "variables") sm.posn <- grep("^s\\(.*\\)", trms) nonsm.posn <- setdiff(1:length(trms), sm.posn) environ <- if (is.null(object$call$data)) attr(terms(object$formula), ".Environment") else eval(object$call$data) vars.eval <- eval(vars, envir=environ)[-1] factors <- sapply(vars.eval, is.factor) levs <- sapply(sapply(vars.eval[factors], levels), length) if (object$nsdf > 0){ coefs <- coefficients(object)[1:object$nsdf] se <- sqrt(diag(object$Vp)[1:length(coefs)]) z <- coefs/se table <- cbind(coefs, se, z, 2*(1-pnorm(abs(z)))) rownames(table) <- names(coefs) colnames(table) <- c('Estimate', 'Std. Error', 'z value', 'Pr(>|z|)') result$table <- table } if (smooth.coef){ sel <- (object$nsdf + 1):length(coefficients(object)) s.coefs <- coefficients(object)[sel] s.se <- sqrt(diag(object$Vp)[sel]) z.s <- s.coefs/s.se s.table <- cbind(s.coefs, s.se, z.s, 2*(1-pnorm(abs(z.s)))) rownames(s.table) <- names(s.coefs) colnames(s.table) <- c('Estimate', 'Std. Error', 'z value', 'Pr(>|z|)') result$s.table <- s.table } df.terms <- rep(1, length(trms)) df.terms[sm.posn] <- object$edf if (length(levs) > 0) df.terms[factors] <- levs - 1 if (attr(terms, "intercept") == 1) { df.terms <- c(1, df.terms) names(df.terms) <- c('(Intercept)', trms) } else names(df.terms) <- trms result$df.terms <- df.terms n <- object$df.null result$n <- n df.residual <- n - sum(object$edf) - object$nsdf result$df.residual <- df.residual result$gcv <- n * object$sig2/df.residual result$deviance <- object$deviance result$null.deviance <- object$null.deviance result$dispersion <- object$sig2 class(result) <- 'summary.gam' result } print.summary.gam <- function (x, digits = max(3, getOption("digits") - 3), signif.stars = getOption("show.signif.stars"), ...){ x$dispersion <- if (is.element(x$family[[1]], c('binomial', 'Poisson'))) "taken as 1" else x$dispersion cat('Call:\n') print(x$call) print(x$family) if (!is.null(x$table)){ cat('Parametric Coefficients\n') print.coefmat(x$table, digits = digits, signif.stars = signif.stars, ...) cat('\n') } if (!is.null(x$s.table)){ cat('Smooth Coefficients\n') print.coefmat(x$s.table, digits = digits, signif.stars = signif.stars, ...) cat('\n') } cat('\nDegrees of freedom for terms in the model:\n') print(x$df.terms) cat("\nNull Deviance: ", x$null.deviance) cat("\nDeviance: ", x$deviance) cat("\nGCV score: ", x$gcv) cat("\nResidual Degrees of Freedom :", x$df.residual) cat("\nDispersion: ", x$dispersion) cat("\nNumber of Observations: ", x$n, "\n") invisible(x) } anova.gam <- function (object, ..., dispersion = NULL, test = NULL) # adapted from anova.glm { object$df.residual <- object$df.null - sum(object$edf) - object$nsdf dotargs <- list(...) named <- if (is.null(names(dotargs))) rep(FALSE, length(dotargs)) else (names(dotargs) != "") if (any(named)) warning(paste("The following arguments to anova.glm(..)", "are invalid and dropped:", paste(deparse(dotargs[named]), collapse = ", "))) dotargs <- dotargs[!named] is.gam <- unlist(lapply(dotargs, function(x) inherits(x, "gam"))) dotargs <- dotargs[is.gam] if (length(dotargs) == 0) stop('need two or more models to compare') for (i in 1:length(dotargs)) { class(dotargs[[i]]) <- c(class(dotargs[[i]]), 'glm') dotargs[[i]]$df.residual <- dotargs[[i]]$df.null - sum(dotargs[[i]]$edf) - dotargs[[i]]$nsdf } anova.glmlist(c(list(object), dotargs), test = test) }