(已测试)考虑生存时间的ROC曲线-timeROC

1.准备输入数据
通过下面的链接即可获得输入数据。
http://weinformatics.51vip.biz:50496/risk.Rdata
options(stringsAsFactors = F)
load("risk.Rdata")
exprSet = expr[,group_list=="tumor"]
dim(exprSet) ## remove the nomral
> [1] 673 515
head(phe)
> ID event race age gender stage days age_group
> 52.70.0 TCGA-05-4244 0 <NA> 70 male iv 0 older
> 52.70.0.2 TCGA-05-4249 0 <NA> 67 male ib 1158 older
> 52.70.0.3 TCGA-05-4250 1 <NA> 79 female iiia 121 older
> 58.73.0 TCGA-05-4382 0 <NA> 68 male ib 607 older
> 58.73.0.1 TCGA-05-4389 0 <NA> 70 male ia 1369 older
> 58.73.0.2 TCGA-05-4395 1 <NA> 76 male iiib 0 older
> time
> 52.70.0 0.000000
> 52.70.0.2 38.600000
> 52.70.0.3 4.033333
> 58.73.0 20.233333
> 58.73.0.1 45.633333
> 58.73.0.2 0.000000
exprSet[1:4,1:4]
> TCGA-05-4244-01A-01T-1108-13 TCGA-05-4249-01A-01T-1108-13
> hsa-let-7a-1 3985 8916
> hsa-let-7a-2 7947 17800
> hsa-let-7a-3 4128 9079
> hsa-let-7b 9756 32960
> TCGA-05-4250-01A-01T-1108-13 TCGA-05-4382-01A-01T-1207-13
> hsa-let-7a-1 3472 6332
> hsa-let-7a-2 6822 12885
> hsa-let-7a-3 3459 6558
> hsa-let-7b 14223 16781
head(colnames(exprSet))
> [1] "TCGA-05-4244-01A-01T-1108-13" "TCGA-05-4249-01A-01T-1108-13"
> [3] "TCGA-05-4250-01A-01T-1108-13" "TCGA-05-4382-01A-01T-1207-13"
> [5] "TCGA-05-4389-01A-01T-1207-13" "TCGA-05-4395-01A-01T-1207-13"
head(phe$ID)
> [1] "TCGA-05-4244" "TCGA-05-4249" "TCGA-05-4250" "TCGA-05-4382"
> [5] "TCGA-05-4389" "TCGA-05-4395"
## 必须保证生存资料和表达矩阵,两者一致
all(substring(colnames(exprSet),1,12)==phe$ID)
> [1] TRUE
library(survival)
library(survminer)
library(ggplotify)
library(cowplot)
library(Hmisc)
options(scipen=200)
# http://dni-institute.in/blogs/cox-regression-interpret-result-and-predict/
library(survival)
library(survminer)
library(ggplotify)
library(cowplot)
library(Hmisc)
options(scipen=200)
# http://dni-institute.in/blogs/cox-regression-interpret-result-and-predict/
library(pheatmap)
2. 挑选感兴趣的基因构建coxph模型
这里的基因是另一篇文章里的,只是举例子。
e=t(exprSet[c('hsa-mir-31','hsa-mir-196b','hsa-mir-766','hsa-mir-519a-1','hsa-mir-375','hsa-mir-187','hsa-mir-331','hsa-mir-101-1'),])
e=log2(e+1)
colnames(e)=c('miR31','miR196b','miR766','miR519a1','miR375','miR187','miR331','miR101')
dat=cbind(phe,e)
dat$gender=factor(dat$gender)
dat$stage=factor(dat$stage)
colnames(dat)
> [1] "ID" "event" "race" "age" "gender"
> [6] "stage" "days" "age_group" "time" "miR31"
> [11] "miR196b" "miR766" "miR519a1" "miR375" "miR187"
> [16] "miR331" "miR101"
s=Surv(time, event) ~ miR31+miR196b+miR766+miR519a1+miR375+miR187+miR331+miR101
model <- coxph(s, data = dat )
summary(model,data=dat)
> Call:
> coxph(formula = s, data = dat)
>
> n= 515, number of events= 125
>
> coef exp(coef) se(coef) z Pr(>|z|)
> miR31 0.05104 1.05237 0.04002 1.275 0.202
> miR196b 0.05617 1.05778 0.03621 1.551 0.121
> miR766 0.13498 1.14451 0.10259 1.316 0.188
> miR519a1 -0.02977 0.97067 0.04477 -0.665 0.506
> miR375 -0.04004 0.96075 0.04976 -0.805 0.421
> miR187 -0.04231 0.95858 0.04375 -0.967 0.334
> miR331 -0.08455 0.91892 0.12715 -0.665 0.506
> miR101 -0.12002 0.88690 0.09694 -1.238 0.216
>
> exp(coef) exp(-coef) lower .95 upper .95
> miR31 1.0524 0.9502 0.9730 1.138
> miR196b 1.0578 0.9454 0.9853 1.136
> miR766 1.1445 0.8737 0.9360 1.399
> miR519a1 0.9707 1.0302 0.8891 1.060
> miR375 0.9607 1.0409 0.8715 1.059
> miR187 0.9586 1.0432 0.8798 1.044
> miR331 0.9189 1.0882 0.7162 1.179
> miR101 0.8869 1.1275 0.7334 1.072
>
> Concordance= 0.652 (se = 0.031 )
> Likelihood ratio test= 17.99 on 8 df, p=0.02
> Wald test = 18.86 on 8 df, p=0.02
> Score (logrank) test = 19.12 on 8 df, p=0.01
options(scipen=1)
ggforest(model, data =dat,
main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1", noDigits = 4)

3.自己预测自己
查看?predict.coxph的帮助文档会发现:type参数有不同的取值,其中
the expected number of events given the covariates and follow-up time (“expected”).本来预测生存时间应该是用type=”expected”,这里用了默认值,是因为文章中对应的图预测值取值范围是-1到1,只是为了保持一致。
fp <- predict(model,dat,type="risk");boxplot(fp)
fp <- predict(model,dat,type="expected") ;boxplot(fp)
names(fp) = rownames(phe)
fp <- predict(model,dat) ;boxplot(fp)

4.三图联动
这里的难点在于三个图的横坐标顺序是一致的。本强迫症还想让三个图严格对齐,然而无功而返,由于热图拼图本来就比较难弄,导致patchwork、cowplot都搞不定,多次尝试后只能通过对grid.arrange的调整对齐一下,应该是有别的办法的,如有后续,将在简书中更新
fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
sur_dat=data.frame(patientid=1:length(fp),
time=phe[names(sort(fp)),'time'] ,
event=phe[names(sort(fp )),'event'] )
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),(ncol(dat)-7):ncol(dat)]
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = F)
#拼图实现三图联动
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7),NA),
c(rep(2,7)),
c(rep(3,7))) #布局矩阵
#> Warning in rbind(c(rep(1, 7), NA), c(rep(2, 7)), c(rep(3, 7))): number of
#> columns of result is not a multiple of vector length (arg 2)
grid.arrange(grobs = plots,
layout_matrix = lay1,
heigths = c(1, 2,3))

从上向下三个图分别表示:
1.每个病人的预测值,按照从小到大排序
2.每个病人的生存时间,颜色区分生死
3.热图,挑出的基因在每个样本中的表达量
我们会汇总所有TCGA数据库的分析结合具体的实例进行分析来做专题的课程,敬请期待!!!
timeROC的升级版全面解读
之前的timeROC曲线图在: 考虑生存时间的ROC曲线-timeROC,刚刚对它进行了一点调整和升级,一不小心探索到了很多新的拓展。
1.输入数据
library(timeROC)
library(survival)
load("new_dat.Rdata")
head(new_dat)
## time event fp
## 1 3817 0 14.63898
## 2 254 1 14.89388
## 3 345 1 13.05678
## 4 109 1 13.88350
## 5 164 1 15.83493
## 6 212 1 16.26325
需要有3列,生存时间、结局事件和预测值,下面的代码里 time ,event 和 fp 就是这三列的列名。
2.完成timeROC分析和画图
result <-with(new_dat, timeROC(T=time,
delta=event,
marker=fp,
cause=1,
times=c(365,1095,1825),
iid = TRUE))
#identical(c(result$TP[,1],result$TP[,2],result$TP[,3]),as.numeric(result$TP))
dat = data.frame(fpr = as.numeric(result$FP),
tpr = as.numeric(result$TP),
time = rep(as.factor(c(365,1095,1825)),each = nrow(result$TP)))
library(ggplot2)
ggplot() +
geom_line(data = dat,aes(x = fpr, y = tpr,color = time),size = 1) +
scale_color_manual(name = NULL,values = c("#92C5DE", "#F4A582", "#66C2A5"),
labels = paste0("AUC of ",c(1,3,5),"-y survival: ",
format(round(result$AUC,2),nsmall = 2)))+
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
theme(panel.grid = element_blank(),
legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"),
legend.position = c(0.765,0.125))+
scale_x_continuous(expand = c(0.005,0.005))+
scale_y_continuous(expand = c(0.005,0.005))+
labs(x = "1 - Specificity",
y = "Sensitivity")+
coord_fixed()
做的调整有: 1.修改颜色、去掉格子、横纵坐标的expand 2.更改图例位置、文字,加框 3.修改横纵坐标名字 4.简化了生成作图数据的代码 5.改正了四舍五入末尾0被省略掉的问题,原来的0.90会被改成0.9,现在永远保留两位。
3.关于横纵坐标
特异度与灵敏度,是评价模型的两个重要指标按照疾病诊断,有病是阳性,没病是阴性来说:FPR是误诊率,把没病的人说成有病的概率,是1-特异度,1-没病的人诊断没病的概率 TPR是灵敏度,把有病的人诊断出有病的概率。我们希望灵敏度高,误诊率低,然而他俩负相关,就像鱼与熊掌不可兼得。好在不是必须二选一,而是可以选个平衡点哦。FPR是1 – Specificity,TPR是Sensitivity。我看到了一个非常清楚的的解读:https://www.jianshu.com/p/7919ef304b19
4.根据ROC曲线确定最佳截点
搜了好一圈没有看到timeROC包怎样去找最佳截点,倒是清一色都是survivalROC包的结果。似乎并不像timeROC一样支持同时计算好几个时间节点,一次只能计算一个时间。我想确定一下两个R包算出的结果是否一致,所以把一年的ROC曲线花在了一起,可见二者几乎完全重叠。
因此我用survivalROC的结果来找截点,是完全可以的。
library(survivalROC)
load("new_dat.Rdata")
head(new_dat)
## time event fp
## 1 3817 0 14.63898
## 2 254 1 14.89388
## 3 345 1 13.05678
## 4 109 1 13.88350
## 5 164 1 15.83493
## 6 212 1 16.26325
result <-with(new_dat,
survivalROC(Stime=time,
status=event,
marker=fp,
predict.time=365,
method="KM"))
##
## 12 records with missing values dropped.
result$cut.values[which.max(result$TP-result$FP)]
## [1] 11.53272
不得不说一下上面这句代码,约登指数=灵敏度+特异度−1,即=TP−FP,约登指数最大时的截断值即为最佳截断值。
5.获取最佳截断值另一种的方法
我已经用过很多次了,survminer包里的surv_cutpoint函数,选出让高低两组间差异最显著的截断值。Determine the optimal cutpoint for one or multiple continuous variables at once, using the maximally selected rank statistics from the ‘maxstat’ R package. This is an outcome-oriented methods providing a value of a cutpoint that correspond to the most significant relation with outcome (here, survival).用它来计算的结果,没有考虑到时间因素,因此与上面得到的结果不相同。
library(survminer)
res.cut <- surv_cutpoint(new_dat, time = "time", event = "event",
variables = "fp")
res.cut
## cutpoint statistic
## fp 11.18487 10.27772
我想了一下,是因为timeROC的计算中,生存时间超过1年,最终结局为死亡的病人,在1年时生存状态为活着。当我把这部分病人的生存状态改为0(活着),这两个方法计算出来的结果就相同咯
new_dat2 = new_dat
new_dat2$event[new_dat2$time>365 & new_dat2$event==1] = 0
res.cut <- surv_cutpoint(new_dat2, time = "time", event = "event",
variables = "fp")
res.cut
## cutpoint statistic
## fp 11.53272 7.327764
用这个截断值做分组,看看KM曲线
new_dat$risk = ifelse(new_dat$fp<res.cut$cutpoint$cutpoint,"low","high")
fit <- survfit(Surv(time, event)~risk, data=new_dat)
ggsurvplot(fit, data=new_dat, pval=TRUE,palette = "jco",
risk.table = TRUE, conf.int = TRUE)
哈哈,非常可以。
请关注“恒诺新知”微信公众号,感谢“R语言“,”数据那些事儿“,”老俊俊的生信笔记“,”冷🈚️思“,“珞珈R”,“生信星球”的支持!