## R-code for regression analysis of statistics against the other covariates ## read in data MM <- read.table("mathmarks.txt") ## set the dependent variable and the independent variable y <- MM[,5] X <- MM[,1:4] ## make y the first column MM <- cbind(y, X) ## note that we want the name of y to be statistics names(MM) <- c("statistics", names(MM)[2:5]) ## look at the histograms par(mfrow=c(3,2)) boxplot(MM, col="yellow") for(i in 1:5){ hist(MM[,i], main=names(MM)[i], col="yellow", xlab="", cex.main=1.5) } ## make a pairs plot of the data pairs(MM, pch=16) ## change the way the points look ## just look at scatter plots of statistics against the other variables and add a smoother to each plot par(mfrow=c(2,2)) for(i in 2:5){ plot(MM[,i], MM[,1], pch=16, xlab=names(MM)[i], ylab=names(MM)[1], cex.lab=1.5) ## cex.lab increases the font size of the labels lines(lowess(MM[,i], MM[,1]), col="red", lwd=2) ## lwd is for line thickness } ## fit the first set of regression models lm.1 <- lm(statistics ~ mechanics + vectors + algebra + analysis, data=MM) summary(lm.1) lm.2 <- lm(statistics ~ algebra + analysis, data=MM) summary(lm.2) ## conduct an F-test RSS.big <- sum(lm.1$res^2) RSS.small <- sum(lm.2$res^2) n <- 88 p <- 3 q <- 5 df.small <- n-p df.big <- n-q F.obs.a <- (RSS.small-RSS.big)/(df.small-df.big) F.obs.b <- (RSS.big)/(df.big) F.obs <- F.obs.a/F.obs.b F.obs 1-pf(F.obs, 2, 88-5) ## or anova(lm.2, lm.1) ## more models lm.3 <- lm(statistics ~ mechanics + vectors + algebra + I(algebra^2) + analysis + I(analysis^2), data=MM) summary(lm.3) lm.4 <- lm(statistics ~ algebra + I(algebra^2) + analysis + I(analysis^2), data=MM) summary(lm.4) lm.5 <- lm(statistics ~ algebra + analysis + I(analysis^2), data=MM) summary(lm.5) lm.6 <- lm(statistics ~ algebra + analysis, data=MM) summary(lm.6) anova(lm.4, lm.3) anova(lm.4, lm.5) anova(lm.5, lm.3) anova(lm.5, lm.6) ## plot the residuals against the fitted values par(mfrow=c(2,1)) plot(lm.3$fit, lm.3$res, pch=16, xlab="fitted", ylab="residuals", cex.lab=1.5) hist(lm.3$res, col="yellow", xlab="", main="residuals") ## how sensitive is the 3rd model to each data point (a way to look for outliers)? for fun I will calculate by-hand par(mfrow=c(2,2)) Beta.lse.one.out <- matrix(0, nrow=88, ncol=4) X <- cbind(rep(1,88), MM[,4:5],MM[,5]^2) for(i in 1:88){ y.i <- matrix(y[-i], ncol=1) X.i <- as.matrix(X[-i,]) Beta.i <- solve(t(X.i)%*%X.i)%*%t(X.i)%*%y.i Beta.lse.one.out[i,] <- t(Beta.i) } ## plot the results beta.names <- c("intercept", "algebra", "analysis", "analysis^2") for(i in 1:4){ plot(Beta.lse.one.out[,i], main=beta.names[i], pch=16, xlab="Leave One Out Betas") } MM[88,] ## cooks distance for model cook <- cooks.distance(lm.5) plot(cook, ylab="Cooks distances", pch=16) identify(1:88,cook) ## plot the scatter plots again and look for individual 88 ## just look at scatter plots of statistics against the other variables and add a smoother to each plot par(mfrow=c(2,2)) for(i in 2:5){ plot(MM[,i], MM[,1], pch=16, xlab=names(MM)[i], ylab=names(MM)[1], cex.lab=1.5) ## cex.lab increases the font size of the labels points(MM[88,i], MM[88,1], pch=22, col="red", cex=1.5) points(MM[56,i], MM[56,1], pch=22, col="green", cex=1.5) points(MM[28,i], MM[28,1], pch=22, col="brown", cex=1.5) } ## more on identify plot(MM[,2], MM[,1]) identify(MM[,2], MM[,1]) ## click on the points in the plot you want to identify then click back on the main input window. ## add names that way x <- rnorm(5) y <- rnorm(5) names.xy <- c("a", "b", "c", "d", "e") ## or names.xy <- a:e plot(x,y) identify(x,y, labels=names.xy)