-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathInvestigatingTheRResidualMatrixFromTheSVAFunctionExample.Rmd
124 lines (75 loc) · 2.81 KB
/
InvestigatingTheRResidualMatrixFromTheSVAFunctionExample.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
---
title: "Investigating the R Residual Matrix from the sva() function"
output: html_document
date: "2024-10-28"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
#get residual matrix function that works on the entire dataset as opposed to 10%, like in the original Leek function from the SVA package.
#input is the same as the input for the num.sv() function. i.e. the edata object in the sva vingette.
#example input data creation can be found in the next chunk.
getResid <- function(dat){
dats <- as.matrix(dat)
dims <- dim(dats)
# a <- seq(0, 2, length = 100)
# n <- floor(dims[1]/10)
# rhat <- matrix(0, nrow = 100, ncol = 10)
P <- (diag(dims[2]) - mod %*% solve(t(mod) %*% mod) %*%
t(mod))
# for (j in 1:10) {
#dats <- dat[1:(j * n), ]
#ee <- eigen(t(dats) %*% dats)
#sigbar <- ee$values
R <- dats %*% P
return(R)}
```
```{r}
#Example of plotting the pca and scree plot for the R residual matrix for TCGA brca------
load("~/TCGA/GDC_TCGA_BRCA/RData/GDC_TCGA_BRCA_450k.Rda")
rm(betaSWAN, mValFUNNORM, mValSWAN, RGset)
sampleSheet <- targets[,c(1,3,4,5,6,13,14,15,18)]
row.names(sampleSheet) <- sampleSheet$Sample_Name
pheno = sampleSheet
betaFUNNORM[,colnames(betaFUNNORM) %in% pheno[pheno$sampleTypeCode!="TM",1]] -> betaFUNNORM
pheno <- pheno[pheno$sampleTypeCode!="TM",1:4]
pheno$Sample_Name <- as.factor(pheno$Sample_Name)
pheno$Array <- as.factor(pheno$Array)
pheno$Slide <- as.factor(pheno$Slide)
pheno$sampleTypeCode <- as.factor(pheno$sampleTypeCode)
edata = betaFUNNORM
#full model
mod = model.matrix(~as.factor(sampleTypeCode), data=pheno)
mod1 = model.matrix(~as.factor(sampleTypeCode)+Array+Slide, data=pheno)
#null model
mod0 = model.matrix(~1+Array+Slide, data=pheno)
```
```{r}
#dat = edata
getResid(edata) -> brcaR #it's defined line 203
#skree plot
results <- prcomp(t(brcaR), scale = TRUE)
pcabrcax <- results$x
pcabrcax <- as.data.frame(pcabrcax)
#calculate total variance explained by each principal component
var_explained = results$sdev^2 / sum(results$sdev^2)
#create scree plot
library(ggplot2)
qplot(c(1:890), var_explained) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Skree Plot of R object brca") +
ylim(0, 0.15)
```
```{r}
library(reshape2)
rownames(pcabrcax) -> pcabrcax$Sample_Name
melt(pcabrcax, id.vars=c("Sample_Name", "PC1", "PC2", "PC3")) -> pcabrcaxMelt
pcabrcaxMelt[,c(1:4)] -> pcabrcaxMelt
library(dplyr)
inner_join(pcabrcaxMelt, sampleSheet, by="Sample_Name") -> pcabrcaxMelt
ggplot(pcabrcaxMelt, aes(x=PC1, y=PC2, color=sampleTypeCode)) + geom_point() + ggtitle("TCGA brca PC1/PC2 - residual matrix")
ggplot(pcabrcaxMelt, aes(x=PC2, y=PC3, color=sampleTypeCode)) + geom_point() + ggtitle("TCGA brca PC2/PC3 - residual matrix")
```