#SIBS 2005: July 19, 2005 #1) Required packages: # Need R>=1.9.0 and the libraries Biobase, affy, hgu95av2cdf # Reference: compdiag.molgen.mpg.de/ngfn/docs/2004/jun/estrogen.pdf (Robert Gentleman, Wolfgang Huber) source("http://www.bioconductor.org/biocLite.R") biocLite(c("estrogen", "Biobase", "affy", "hgu95av2", "hgu95av2cdf")) library(affy) library(vsn) library(estrogen) #2) Load the data: Find the directory where example cel files are. #find the subdirectory extdata of the estrogen package on the #harddisk of the computer. datadir = system.file("extdata", package = "estrogen") datadir dir(datadir) setwd(datadir) #3) The file estrogen.txt contains information on the samples # that were hybridized onto the arrays. system("more C:/PROGRA~1/R/rw2011/library/estrogen/extdata/estrogen.txt") pd = read.phenoData("estrogen.txt", header = TRUE, row.names = 1) > pd phenoData object with 2 variables and 8 cases varLabels estrogen: read from file time.h: read from file > pData(pd) estrogen time.h low10-1.cel absent 10 low10-2.cel absent 10 high10-1.cel present 10 high10-2.cel present 10 low48-1.cel absent 48 low48-2.cel absent 48 high48-1.cel present 48 high48-2.cel present 48 > #4) Load the data from the CEL files as well as the phenotypic data into an AffyBatch object. celdata = ReadAffy(filenames = row.names(pData(pd)), phenoData = pd, verbose = T) 1 reading low10-1.cel ...instantiating an AffyBatch (intensity a 409600x8 matrix)...done. Reading in : low10-1.cel Reading in : low10-2.cel Reading in : high10-1.cel Reading in : high10-2.cel Reading in : low48-1.cel Reading in : low48-2.cel Reading in : high48-1.cel Reading in : high48-2.cel > celdata AffyBatch object size of arrays=640x640 features (25606 kb) cdf=HG_U95Av2 (12625 affyids) number of samples=8 number of genes=12625 annotation=hgu95av2 > pmvals = pm(celdata) > mmvals = mm(celdata) > class(pmvals) [1] "matrix" > dim(pmvals) [1] 201800 8 > dim(mmvals) [1] 201800 8 #Total number of probes 201800 x 2 and we have 8 measurements from 8 different chips #for each probe. #5) Let's look at the cel file image. The image function allows us to look at the #spatial distribution of the intensities on a chip. Useful for quality control. image(celdata[, 1]) #6) Example of a "bad" cel file badcel = ReadAffy("bad.cel") image(badcel) #7) Histograms hist(log2(pmvals[, 1]), col = "red") hist(log2(mmvals[, 1]), col = "red") #8) MA plots #mva.pairs(pm(celdata)[, c(1, 2)]) mva.pairs(pm(celdata)[, c(1, 3, 5, 7)]) #Compare time 10 hours with time 48 hours? #Experimenters think that there is no possible biological #reason why every gene on the chip #would be expressed so differently at time point 48 hrs. #This must be an artifact due to microarray technology. #Normalization should remove this. #5) Using the Robust Multichip Averaging (RMA) method to compute expression levels. edata = rma(celdata) Background correcting Normalizing Calculating Expression > > class(edata) [1] "exprSet" attr(,"package") [1] "Biobase" > dim(exprs(edata)) [1] 12625 8 #Note that we are down to 12625 genes from 201800 probes. #Expression levels across probes sets representing genes are combined. #6) Let's view the expression data: Intensity distributions across several chips. #Recall celdat has raw intensities, edata has normalized probeset values. par(mfcol = c(1, 2)) #Before RMA normalization boxplot(celdata, col = "red") #After RMA normalization boxplot(data.frame(exprs(edata)), col ="blue") #Note the intensity values in these boxplots have been transformed to log2 scale. #7) MA plots after normalization mva.pairs(exprs(edata)[, c(1, 3, 5, 7)], log.it = FALSE) #8) Scatter plots of the data. #Plotting raw intensities for two replicates of the same biological condition. plot(exprs(celdata)[, 1:2], log = "xy", pch = ".", main = "all") #Plotting normalized intensities of probesets for two replicates #of the same biological condition. plot(exprs(edata)[, 1:2], log = "xy", pch = ".", main = "all") #9) Select 50 genes with with the highest variation (standard deviation) across chips. rsd = apply(exprs(edata), 1, rowSds <- function(x){sqrt(var(x))}) glist = order(rsd, decreasing = TRUE)[1:50] heatmap(exprs(edata)[glist, ]) #yellow high, red low > colnames(exprs(edata)) [1] "low10-1.cel" "low10-2.cel" "high10-1.cel" "high10-2.cel" "low48-1.cel" "low48-2.cel" "high48-1.cel" "high48-2.cel" #What do we observe? #Genes that are consistently less expressed with higher levels of estrogen as compared to #lower levels of estrogen? #Genes differentially expressed between time points? library(hgu95av2) ls("package:hgu95av2") slist = sort(rsd, decreasing = TRUE)[46:50] mget(names(slist), hgu95av2GENENAME) > mget(names(slist), hgu95av2GENENAME) $"39677_at" [1] NA $"36681_at" [1] "apolipoprotein D" $"1822_at" [1] NA $"40767_at" [1] "tissue factor pathway inhibitor (lipoprotein-associated coagulation inhibitor)" $"1515_at" [1] "flap structure-specific endonuclease 1"