TCGAbiolinks – Case study no. 2: Pan-Cancer downstream analysis (BRCA)
开始接触TCGA数据,想学习如何下载、处理、分析这些数据。在目前常用的分析包中,我选择了R包TCGAbiolinks。下面记录学习过程。实际上,TCGAbiolinks的官方教程本身就比较完整,我按照官方教程学习如何处理数据,并结合weinfo提供的docker封装环境进行分析。
library(openxlsx)
# Read the case list from the spreadsheet and extract the barcodes that
# restrict the GDC query further down.
data <- read.xlsx("barcode.xlsx")

# Fail early with a clear message when the expected column is absent,
# instead of silently handing an empty barcode filter to the query.
if (!"cases" %in% names(data)) {
  stop("barcode.xlsx must contain a 'cases' column", call. = FALSE)
}

# `[[` matches the column name exactly (`$` partially matches on data frames).
cc <- as.character(data[["cases"]])

# Blank spreadsheet cells are read as NA and are not usable barcodes.
cc <- cc[!is.na(cc)]
# 1
library(TCGAbiolinks)
library(SummarizedExperiment)
# Describe the files wanted from GDC: legacy-archive RNA-Seq gene expression
# quantification ("results" files, Illumina HiSeq) for TCGA-BRCA, restricted
# to primary-tumor samples whose barcodes are listed in `cc`.
query.exp <- GDCquery(
  project = "TCGA-BRCA",
  barcode = cc,
  sample.type = "Primary Tumor",
  experimental.strategy = "RNA-Seq",
  platform = "Illumina HiSeq",
  data.category = "Gene expression",
  data.type = "Gene expression quantification",
  file.type = "results",
  legacy = TRUE
)

# Fetch the files matched by the query.
GDCdownload(query.exp)

# Assemble the downloaded files into a single object and also save it to
# "BRCAExp.rda". The rest of the script reads it through colData().
BRCA.exp <- GDCprepare(
  save.filename = "BRCAExp.rda",
  save = TRUE,
  query = query.exp
)
# 2
library(dplyr)
# Preprocess the expression object; 0.6 is the correlation cutoff handed to
# TCGAanalyze_Preprocessing.
# NOTE(review): with a non-zero cutoff, samples below it may be removed from
# the returned matrix, which is why clusters are matched by name below.
dataPrep <- TCGAanalyze_Preprocessing(object = BRCA.exp, cor.cut = 0.6)

# Normalize with the "gcContent" method. `geneInfo` is not defined in this
# script; presumably it is the annotation table shipped with TCGAbiolinks.
dataNorm <- TCGAanalyze_Normalization(
  tabDF = dataPrep,
  geneInfo = geneInfo,
  method = "gcContent"
)

# Apply the three filters in sequence, one step per line.
datFilt <- dataNorm %>%
  TCGAanalyze_Filtering(method = "varFilter") %>%
  TCGAanalyze_Filtering(method = "filter1") %>%
  TCGAanalyze_Filtering(method = "filter2", foldChange = 0.2)

# Consensus clustering with Ward (ward.D2) linkage.
data_Hc2 <- TCGAanalyze_Clustering(
  tabDF = datFilt,
  method = "consensus",
  methodHC = "ward.D2"
)

# Add cluster information to Summarized Experiment.
# Element 4 of the result is used (presumably the k = 4 solution, matching
# the four "EC" colours used by the plots later on -- confirm).
consensusClass <- data_Hc2[[4]]$consensusClass
if (is.null(names(consensusClass))) {
  # Without sample names only a positional assignment is possible, which is
  # safe only when no sample was dropped along the way.
  stopifnot(length(consensusClass) == ncol(BRCA.exp))
  names(consensusClass) <- colnames(BRCA.exp)
}

# Align labels to the samples of BRCA.exp by name rather than by position,
# so samples removed before clustering get NA instead of causing a length
# mismatch or receiving another sample's label.
sampleIdx <- match(colnames(BRCA.exp), names(consensusClass))
groupsHC <- paste0("EC", consensusClass[sampleIdx])
groupsHC[is.na(sampleIdx)] <- NA_character_
colData(BRCA.exp)$groupsHC <- groupsHC
# 3
# Kaplan-Meier survival plot stratified by the expression clusters stored in
# the "groupsHC" column; the figure is written to the PNG named below.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
TCGAanalyze_survival(
  data = colData(BRCA.exp),
  clusterCol = "groupsHC",
  main = "TCGA kaplan meier survival plot from consensus cluster",
  legend = "RNA Group",
  height = 10,
  risk.table = TRUE,
  conf.int = FALSE,
  color = c("black", "red", "blue", "green3"),
  filename = "survival_BRCA_expression_subtypes.png"
)
# 4
# Heatmap of the filtered expression matrix (transposed), annotated with the
# cluster label, pathologic stage and HPV status of each sample, and written
# to "case2_Heatmap.png".
TCGAvisualize_Heatmap(
  t(datFilt),
  # Output file and canvas size.
  filename = "case2_Heatmap.png",
  width = 1000,
  height = 1000,
  title = "Heatmap from concensus cluster",
  # Sample annotation: columns taken from colData, ordered by cluster label.
  col.metadata = colData(BRCA.exp)[, c(
    "barcode",
    "groupsHC",
    "paper_pathologic_stage",
    "paper_HPV_Status"
  )],
  sortCol = "groupsHC",
  col.colors = list(
    groupsHC = c(
      EC1 = "black",
      EC2 = "red",
      EC3 = "blue",
      EC4 = "green3"
    )
  ),
  # Colour scale: 11 green-black-red levels mapped onto -5..5 in steps of 1.
  type = "expression", # sets default color
  scale = "row", # z-scores per row, centring expression levels around 0
  color.levels = colorRampPalette(c("green", "black", "red"))(n = 11),
  extremes = seq(from = -5, to = 5, by = 1),
  # Cluster the rows only; column order comes from sortCol.
  cluster_rows = TRUE,
  cluster_columns = FALSE
)
# 5
# Retrieve the BRCA mutation annotation (MAF) produced by the "muse" pipeline.
BRCAmut <- GDCquery_Maf(tumor = "BRCA", pipelines = "muse")

# Selecting gene
mRNAsel <- "ATRX"
# %in% never returns NA, so records with a missing gene symbol are dropped
# instead of turning into all-NA rows (as `==` would produce).
BRCAselected <- BRCAmut[BRCAmut$Hugo_Symbol %in% mRNAsel, ]

# Keep the first record per tumor sample barcode, then shorten the barcode to
# its first 12 characters so it can be joined on the "patient" column.
dataMut <- BRCAselected[!duplicated(BRCAselected$Tumor_Sample_Barcode), ]
dataMut$Tumor_Sample_Barcode <- substr(dataMut$Tumor_Sample_Barcode, 1, 12)

# Adding the Expression Cluster classification found before.
# The colData() accessor is used instead of reaching into the @colData slot.
dataMut <- merge(
  dataMut,
  colData(BRCA.exp),
  by.x = "Tumor_Sample_Barcode",
  by.y = "patient"
)

# Same `!= 0` condition as before, but which() discards NA comparisons so a
# missing Variant_Classification cannot yield an NA row index.
dataMut <- dataMut[which(dataMut$Variant_Classification != 0), ]
请关注“恒诺新知”微信公众号,感谢“R语言”、“数据那些事儿”、“老俊俊的生信笔记”、“冷🈚️思”、“珞珈R”、“生信星球”的支持!