Module 3: Exercise Results
# Load the BreastCancer data set and work on a copy named `bc`.
# NOTE(review): BreastCancer is presumably the mlbench data set; the package
# providing it must already be attached for data() to find it - confirm.
data(BreastCancer)
bc <- BreastCancer

# Convert columns 2 to 10 to numeric in place; column 1 (Id) and the last
# column (Class) are left untouched.
# NOTE(review): as.numeric() on a factor returns the internal level codes,
# not the level labels. If any of these columns is a factor whose levels are
# not exactly 1..n in order, the codes differ from the recorded scores;
# as.numeric(as.character(...)) would be the safe form - verify.
for (k in 2:10) { # altered for current lab
  bc[, k] <- as.numeric(bc[, k])
}

# Preview the first rows of the converted data.
head(bc)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei Bl.cromatin
## 1 1000025 5 1 1 1 2 1 3
## 2 1002945 5 4 4 5 7 10 3
## 3 1015425 3 1 1 1 2 2 3
## 4 1016277 6 8 8 1 3 4 3
## 5 1017023 4 1 1 3 2 1 3
## 6 1017122 8 10 10 8 7 10 9
## Normal.nucleoli Mitoses Class
## 1 1 1 benign
## 2 2 1 benign
## 3 1 1 benign
## 4 7 1 benign
## 5 1 1 benign
## 6 7 1 malignant
Define a helper function to visualize missingness:
suppressMessages(require(plotrix))
#' Show data missingness as a chequered matrix
#'
#' Converts `x` into a 0/1 matrix (1 = value present, 0 = value is `NA`) and
#' draws it with `color2D.matplot()` on the current graphics device.
#'
#' @param x (matrix or data.frame) data to inspect for missing values.
#' @param xlab (char) label for the x axis (columns of `x`).
#' @param ylab (char) label for the y axis (rows of `x`).
#' @param border colour of the cell borders; `NA` (default) is passed through
#'   to `color2D.matplot()` unchanged.
#' @return Called for its side effect of plotting the missingness matrix; the
#'   value is whatever `color2D.matplot()` returns.
#' @import plotrix
#' @export
plotMissMat <- function(x, xlab = "columns",
                        ylab = "rows", border = NA) {
  # TRUE where a value is present, FALSE where it is missing.
  x <- !is.na(x)
  # Coerce the logical matrix to numeric 1/0 while keeping its dimensions.
  class(x) <- "numeric"
  # The same reversed c(1, 0) ramp is used for all three colour channels,
  # giving a two-tone grey image.
  # NOTE(review): presumably present cells end up dark and missing cells
  # light - confirm against the plotrix documentation.
  color2D.matplot(x,
    show.values = FALSE, axes = FALSE,
    cs1 = c(1, 0), cs2 = c(1, 0), cs3 = c(1, 0), border = border,
    cex = 0.8,
    xlab = xlab, ylab = ylab
  )
}
Explore missingness:
# Draw the missingness matrix for the whole data frame.
plotMissMat(x = bc)
# Count the missing (NA) entries in each column.
colSums(x = is.na(bc))
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 0 0 0 0 0 0
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 16 0 0 0 0
Plot relationship between variables
require(ggplot2)
# Scatter plot of two of the numeric scores against each other.
ggplot(data = bc, mapping = aes(Cell.size, Normal.nucleoli)) +
  geom_point()

# Box plots of individual scores split by diagnosis class.
ggplot(data = bc, mapping = aes(factor(Class), Normal.nucleoli)) +
  geom_boxplot()
ggplot(data = bc, mapping = aes(factor(Class), Cl.thickness)) +
  geom_boxplot()
# Bare.nuclei contains missing values, so this plot emits a warning about
# removed rows.
ggplot(data = bc, mapping = aes(factor(Class), Bare.nuclei)) +
  geom_boxplot()
## Warning: Removed 16 rows containing non-finite values (`stat_boxplot()`).
Fit a binary outcome model:
# Fit a logistic regression (binomial family) of Class on five of the
# numeric scores. Rows with a missing predictor are dropped by glm(), which
# the summary reports as observations deleted due to missingness.
mod <- glm(
  Class ~ Cl.thickness + Bare.nuclei + Normal.nucleoli + Mitoses + Bl.cromatin,
  data = bc, family = "binomial"
)
# Print coefficient estimates, deviances and AIC for the fitted model.
summary(mod)
##
## Call:
## glm(formula = Class ~ Cl.thickness + Bare.nuclei + Normal.nucleoli +
## Mitoses + Bl.cromatin, family = "binomial", data = bc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7840 -0.1459 -0.0768 0.0324 2.7798
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.69635 0.98763 -9.818 < 2e-16 ***
## Cl.thickness 0.63413 0.12267 5.169 2.35e-07 ***
## Bare.nuclei 0.50988 0.08228 6.197 5.76e-10 ***
## Normal.nucleoli 0.35765 0.10040 3.562 0.000368 ***
## Mitoses 0.52677 0.27923 1.887 0.059223 .
## Bl.cromatin 0.61398 0.14225 4.316 1.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 884.35 on 682 degrees of freedom
## Residual deviance: 120.81 on 677 degrees of freedom
## (16 observations deleted due to missingness)
## AIC: 132.81
##
## Number of Fisher Scoring iterations: 8