使用 python 进行 Ribo-seq 质控分析 一
测试开头








测试结尾
随缘

1前言
小时候放牛,躺在山里的草地上仰望天空 是自由 现在偶尔躺在城市的人造草坪上仰望天空 是喘息 双眼一闭 一二十年时光倒流 炎热的烈日,吃草的黄牛,山间里的鸟鸣 彷佛依然还在 但我已不再少年

2引言
前面关于 Ribo-seq 的质控都是在 R 里面读入 bam 文件,这样 R 的效率太慢,如果有多个数据,消耗的时间将会更慢。
这里尝试使用 python 进行分析输出结果文件,然后读入 R 里面进行可视化,将会大大提高效率。
3分析
这里使用的是 bowtie 比对到 转录组序列上 的默认输出文件,不是 sam/bam
,作为 python 分析的输入文件:
SRR1596101.1 HWI-ST640:681:C289NACXX:1:1101:1230:2221/1 + Rpn2_ENSMUSG00000027642_ENSMUST00000116380_264_2157_2756 813 ATTGAGGACCTTGTTGCTCGGCTGGATGA ?@@B?DDBHHFHHGGGHGDEHGBFI?<E3 0
SRR1596101.14 HWI-ST640:681:C289NACXX:1:1101:1938:2176/1 + Parp1_ENSMUSG00000026496_ENSMUST00000027777_105_3147_3873 553 AGCTGGNTATGATTGACCGCTGGTACCA ?<?DBD#222<<C<<@E>GFEEFFFIEE 0 6:G>N
SRR1596101.63 HWI-ST640:681:C289NACXX:1:1101:4867:2161/1 + Psmc1_ENSMUSG00000021178_ENSMUST00000021595_55_1375_1591 316 TTAGAANAAAAGCAAGAGGAGGAAAGATC @@?DDD#2=:ACBFB@EEHCFGDFCCCGG 0 6:G>N
SRR1596101.20 HWI-ST640:681:C289NACXX:1:1101:2399:2248/1 + Polr1b_ENSMUSG00000027395_ENSMUST00000103205_92_3497_3998 3143 AGGGACAAAGTCACCAACCAGCCCCTCGG CCCFFFFFHHHHHJJJJJJJJJJJJJJJJ 0
SRR1596101.77 HWI-ST640:681:C289NACXX:1:1101:5685:2171/1 + Cobl_ENSMUSG00000020173_ENSMUST00000046755_235_4246_5615 3532 CTGCTCNTGGCAGAGGCCCGTGACTCTGGG CCCFFF#2AFHHHJJJIJIJHIGIIIJIIJ 0 6:T>N
定义分析函数:
# Ribo-seq mapping file QC function
def summaryFramelength(inputFile,outputFile):
# from collections import Counter
frame_dict = {}
# open file
with open(inputFile,'r') as input:
for line in input:
fileds = line.split()
start_codon_pos = int(fileds[3].split('_')[3])
# Bowtie uses zero-based offset
stop_codon_pos = int(fileds[3].split('_')[4]) + 1
# Bowtie uses zero-based offset
align_pos = int(fileds[4])
length = len(fileds[5])
# relative distance
rel2st = align_pos - start_codon_pos
rel2sp = align_pos - stop_codon_pos
# assign frame
yushu = abs(align_pos - start_codon_pos)%3
# key
key = '|'.join([str(length),str(yushu),str(rel2st),str(rel2sp)])
# init dict and count
frame_dict.setdefault(key,0)
frame_dict[key] += 1
# output
output = open(outputFile,'w')
# output
for key,val in frame_dict.items():
elem = key.split('|')
output.write('t'.join([elem[0],elem[1],elem[2],elem[3],str(val)]) + 'n')
output.close()
分析数据:
summaryFramelength('ribo-EB_rmtRNA.map','ribo-EB.qc.txt')
summaryFramelength('ribo-ESC_rmtRNA.map','ribo-ESC.qc.txt')
4导入 R 可视化
读取文件:
library(tidyverse)
library(data.table)
library(ggsci)
library(ggplot2)
library(scales)
file <- list.files('./',pattern = '.qc.txt')
# bacth read
map_df(file,function(x){
tmp = fread(paste('./',x,sep = ''))
colnames(tmp) <- c('readlength','frame','rel2st','rel2sp','numbers')
tmp$sample <- sapply(strsplit(x,split = '\.'),'[',1)
# filter length 25-35 nt
tmp <- tmp %>% filter(readlength >= 25 & readlength <= 35)
return(tmp)
}) -> frame_len
# check
head(frame_len,3)
# readlength frame rel2st rel2sp numbers sample
# 1: 30 0 4500 -1381 4 ribo-EB
# 2: 27 1 1558 -318 1 ribo-EB
# 3: 28 0 252 -2155 8 ribo-EB
read length
###########################################
# read length
readlen <- frame_len %>%
dplyr::group_by(sample,readlength) %>%
dplyr::summarise(count = n())
# plot
ggplot(readlen,aes(x = factor(readlength),y = count)) +
geom_col() +
theme_bw(base_size = 16) +
xlab('') +
theme(aspect.ratio = 0.6) +
facet_wrap(~sample,scales = 'free',ncol = 4)

All reads frame
###########################################
# summarise all frame type
all_frame <- frame_len %>% group_by(sample,frame) %>%
dplyr::summarise(count = sum(numbers)) %>%
dplyr::mutate(percent = count/sum(count))
# plot
ggplot(all_frame,aes(x = factor(frame),y = percent)) +
geom_col(aes(fill = factor(frame)),width = 0.6) +
theme_classic(base_size = 16) +
scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
name = '') +
theme(strip.background = element_rect(colour = NA,fill = 'grey')) +
facet_wrap(~sample,scales = 'free',ncol = 2) +
xlab('')

Different read length frame
###########################################
# summarise all frame type
all_frame_bylen <- frame_len %>%
dplyr::group_by(sample,readlength,frame) %>%
dplyr::summarise(count = sum(numbers))
# plot
ggplot(all_frame_bylen,aes(x = factor(readlength),y = count)) +
geom_col(aes(fill = factor(frame)),
position = position_dodge2()) +
theme_classic(base_size = 16) +
scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
name = '') +
theme(strip.background = element_rect(colour = NA,fill = 'grey'),
aspect.ratio = 0.6) +
facet_wrap(~sample,scales = 'free',ncol = 2) +
xlab('')

distance to start codon
# distance to start codon
tost <- frame_len %>%
dplyr::group_by(sample,readlength,rel2st,frame) %>%
dplyr::summarise(count = sum(numbers)) %>%
dplyr::filter(rel2st >= -40 & rel2st <= 20)
# plot
ggplot(tost %>% filter(readlength %in% c(28,29,30)),
aes(x = rel2st,y = count)) +
geom_col(aes(fill = factor(frame)),
position = position_dodge2()) +
theme_classic(base_size = 16) +
scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
name = '') +
scale_x_continuous(breaks = c(-40,-12,0,20)) +
xlab('Distance from start codon (nt)') +
theme(strip.background = element_rect(colour = NA,fill = 'grey'),
aspect.ratio = 0.6) +
facet_grid(sample~readlength,scales = 'free')

distance to stop codon
# distance to stop codon
tosp <- frame_len %>%
dplyr::group_by(sample,readlength,rel2sp,frame) %>%
dplyr::summarise(count = sum(numbers)) %>%
dplyr::filter(rel2sp >= -40 & rel2sp <= 20)
# plot
ggplot(tosp %>% filter(readlength %in% c(28,29,30)),
aes(x = rel2sp,y = count)) +
geom_col(aes(fill = factor(frame)),
position = position_dodge2()) +
theme_classic(base_size = 16) +
scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
name = '') +
scale_x_continuous(breaks = c(-40,-12,0,20)) +
xlab('Distance from stop codon (nt)') +
theme(strip.background = element_rect(colour = NA,fill = 'grey'),
aspect.ratio = 0.6) +
facet_grid(sample~readlength,scales = 'free')

5结尾
分析速度还是比较快的。

欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
哦,数据代码已上传至QQ群,欢迎加入下载。
群二维码:
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀跟着 Cell 学 Ribo–seq 分析 三 (Metagene Plot)
◀RiboPlotR 优雅的可视化你的 Ribo–seq 数据
◀南京的春
◀..
请关注“恒诺新知”微信公众号,感谢“R语言“,”数据那些事儿“,”老俊俊的生信笔记“,”冷🈚️思“,“珞珈R”,“生信星球”的支持!