使用 python 进行 Ribo-seq 质控分析 二
测试开头



测试结尾
接纳自己,放过自己。
1引言
这部分是绘制 metagene plot,参考前面那篇 cell 文章的方法,并对代码进行改写和优化
,使之适用于自己的数据。
2 distance from start codon
定义函数:
def metagenePfofileToST(inputFile, outputFile, cdslength=400, expression=50,
                        upstream=50, downstream=1500):
    """Write a metagene profile of ribosome density around the start codon.

    Each input line is whitespace-separated:
        "<gene>_<x>_<y>_<cdsStart>_<cdsEnd> <position> <density>"
    where cdsStart/cdsEnd appear to be 1-based inclusive transcript
    coordinates (assumed from the `end - start + 1` length formula).

    Pass 1 sums the density inside the CDS for every gene whose CDS is
    longer than ``cdslength``; genes whose summed CDS density exceeds
    ``expression`` are kept together with their mean per-base density.

    Pass 2 accumulates, for every position from ``-upstream`` to
    ``+downstream`` nt relative to the start codon, the density divided by
    that gene's mean per-base density, and finally averages over the number
    of genes contributing at each position.

    Output: one "<relative position>\\t<mean normalized density>\\n" line
    per position in the window.
    """
    # --- pass 1: total CDS density per gene --------------------------------
    gene_total = {}
    with open(inputFile, 'r') as infile:
        for line in infile:
            fields = line.split()
            parts = fields[0].split('_')
            gene_name = parts[0]
            pos = int(fields[1])
            start = int(parts[3])
            end = int(parts[4])
            cds_len = end - start + 1
            density = float(fields[2])
            # only genes whose CDS exceeds the length cutoff are considered
            if cds_len > cdslength:
                key = ':'.join([gene_name, str(cds_len)])
                gene_total.setdefault(key, 0)
                # only count density that falls inside the CDS
                if start <= pos <= end:
                    gene_total[key] += density
    # keep genes above the expression cutoff; store [total, mean per base]
    filtedGeneDict = {}
    for key, val in gene_total.items():
        if val > expression:
            filtedGeneDict[key] = [val, val / int(key.split(':')[1])]
    # --- pass 2: accumulate normalized density around the start codon ------
    positions = range(-upstream, downstream + 1)
    density_sum = dict.fromkeys(positions, 0)
    gene_count = dict.fromkeys(positions, 0)  # genes contributing per position
    with open(inputFile, 'r') as infile:
        for line in infile:
            fields = line.split()
            parts = fields[0].split('_')
            pos = int(fields[1])
            start = int(parts[3])
            end = int(parts[4])
            cds_len = end - start + 1
            density = float(fields[2])
            key = ':'.join([parts[0], str(cds_len)])
            if key in filtedGeneDict:
                reldist = pos - start
                if -upstream <= reldist <= downstream:
                    # normalize by the gene's mean per-base CDS density
                    density_sum[reldist] += density / filtedGeneDict[key][1]
                    gene_count[reldist] += 1
    # --- output: mean over genes contributing at each position -------------
    with open(outputFile, 'w') as out:
        for rel in sorted(density_sum):
            n = gene_count[rel]
            mean = density_sum[rel] / n if n else 0
            # BUG FIX vs. original: separators were the literal characters
            # 't' and 'n' (backslashes lost) instead of tab and newline.
            out.write('\t'.join([str(rel), str(mean)]) + '\n')
分析:
# Generate start-codon metagene profiles for the EB and ESC Ribo-seq samples
# (stricter cutoffs than the defaults: CDS > 600 nt, summed CDS density > 100).
metagenePfofileToST('ribo-EB.density.txt','ribo-EB.metegene2StartCodon.txt',cdslength=600,expression=100)
metagenePfofileToST('ribo-ESC.density.txt','ribo-ESC.metegene2StartCodon.txt',cdslength=600,expression=100)
3 distance from stop codon
定义函数:
def metagenePfofileToSP(inputFile, outputFile, cdslength=400, expression=50,
                        upstream=1500, downstream=50):
    """Write a metagene profile of ribosome density around the stop codon.

    Each input line is whitespace-separated:
        "<gene>_<x>_<y>_<cdsStart>_<cdsEnd> <position> <density>"
    where cdsStart/cdsEnd appear to be 1-based inclusive transcript
    coordinates (assumed from the `end - start + 1` length formula).

    Pass 1 sums the density inside the CDS for every gene whose CDS is
    longer than ``cdslength``; genes whose summed CDS density exceeds
    ``expression`` are kept together with their mean per-base density.

    Pass 2 accumulates normalized density at positions from ``-upstream``
    to ``+downstream`` nt relative to the first nucleotide after the CDS
    end (so position -1 is the last CDS base), then averages over the
    number of genes contributing at each position.

    Output: one "<relative position>\\t<mean normalized density>\\n" line
    per position in the window.
    """
    # --- pass 1: total CDS density per gene --------------------------------
    gene_total = {}
    with open(inputFile, 'r') as infile:
        for line in infile:
            fields = line.split()
            parts = fields[0].split('_')
            gene_name = parts[0]
            pos = int(fields[1])
            start = int(parts[3])
            end = int(parts[4])
            cds_len = end - start + 1
            density = float(fields[2])
            # only genes whose CDS exceeds the length cutoff are considered
            if cds_len > cdslength:
                key = ':'.join([gene_name, str(cds_len)])
                gene_total.setdefault(key, 0)
                # only count density that falls inside the CDS
                if start <= pos <= end:
                    gene_total[key] += density
    # keep genes above the expression cutoff; store [total, mean per base]
    filtedGeneDict = {}
    for key, val in gene_total.items():
        if val > expression:
            filtedGeneDict[key] = [val, val / int(key.split(':')[1])]
    # --- pass 2: accumulate normalized density around the stop codon -------
    positions = range(-upstream, downstream + 1)
    density_sum = dict.fromkeys(positions, 0)
    gene_count = dict.fromkeys(positions, 0)  # genes contributing per position
    with open(inputFile, 'r') as infile:
        for line in infile:
            fields = line.split()
            parts = fields[0].split('_')
            pos = int(fields[1])
            start = int(parts[3])
            end = int(parts[4])
            cds_len = end - start + 1
            density = float(fields[2])
            key = ':'.join([parts[0], str(cds_len)])
            if key in filtedGeneDict:
                # distance from the first nucleotide after the CDS end
                reldisp = pos - (end + 1)
                if -upstream <= reldisp <= downstream:
                    # normalize by the gene's mean per-base CDS density
                    density_sum[reldisp] += density / filtedGeneDict[key][1]
                    gene_count[reldisp] += 1
    # --- output: mean over genes contributing at each position -------------
    with open(outputFile, 'w') as out:
        for rel in sorted(density_sum):
            n = gene_count[rel]
            mean = density_sum[rel] / n if n else 0
            # BUG FIX vs. original: separators were the literal characters
            # 't' and 'n' (backslashes lost) instead of tab and newline.
            out.write('\t'.join([str(rel), str(mean)]) + '\n')
分析:
# Generate stop-codon metagene profiles for the EB and ESC Ribo-seq samples
# (stricter cutoffs than the defaults: CDS > 600 nt, summed CDS density > 100).
metagenePfofileToSP('ribo-EB.density.txt','ribo-EB.metegene2StopCodon.txt',cdslength=600,expression=100)
metagenePfofileToSP('ribo-ESC.density.txt','ribo-ESC.metegene2StopCodon.txt',cdslength=600,expression=100)
4绘图
批量导入数据:
library(ggplot2)
library(tidyverse)
library(ggsci)
library(Rmisc)
library(data.table)

###################################################################### start codon
# Load every *metegene2StartCodon.txt profile in the working directory into one
# long table; the sample name is the file-name prefix before the first dot.
name <- list.files(path = './', pattern = 'metegene2StartCodon.txt')
map_df(1:length(name), function(x){
  tmp = read.table(paste('./', name[x], sep = ''))
  colnames(tmp) <- c('pos', 'density')
  # BUG FIX: split is a regex, so a literal dot must be escaped as '\\.'
  # ('\.' is an invalid escape sequence in an R string).
  tmp$sample <- sapply(strsplit(name[x], split = '\\.'), '[', 1)
  return(tmp)
}) %>% data.table() -> df_st
# check
head(df_st, 3)
# pos density sample
# 1: -50 1.551956 ribo-EB
# 2: -49 1.589671 ribo-EB
# 3: -48 1.581825 ribo-EB
绘图:
# Metagene plot around the start codon: one line per sample, with a dashed
# reference line at the genome-wide mean density of 1.
ggplot(df_st, aes(x = pos, y = density)) +
  geom_line(aes(color = sample)) +
  geom_hline(yintercept = 1, linetype = 'dashed', size = 1, color = 'grey50') +
  theme_classic(base_size = 16) +
  # scale_color_d3(name = '') +
  scale_color_futurama(name = '') +
  theme(legend.position = c(0.88, 0.88),
        legend.background = element_blank(),
        aspect.ratio = 0.6,
        strip.background = element_rect(color = NA, fill = 'grey')) +
  # facet_grid(exp~type) +
  labs(x = 'Distance from start codon (nt)',
       y = 'Mean read density [AU]')

对于 from to stop codon
也是类似:
###################################################################### stop codon
# Load every *metegene2StopCodon.txt profile in the working directory into one
# long table; the sample name is the file-name prefix before the first dot.
name <- list.files(path = './', pattern = 'metegene2StopCodon.txt')
map_df(1:length(name), function(x){
  tmp = read.table(paste('./', name[x], sep = ''))
  colnames(tmp) <- c('pos', 'density')
  # BUG FIX: split is a regex, so a literal dot must be escaped as '\\.'
  # ('\.' is an invalid escape sequence in an R string).
  tmp$sample <- sapply(strsplit(name[x], split = '\\.'), '[', 1)
  return(tmp)
}) %>% data.table() -> df_sp
# check
head(df_sp, 3)
# pos density sample
# 1: -1500 1.512251 ribo-EB
# 2: -1499 1.608943 ribo-EB
# 3: -1498 1.499345 ribo-EB
# Metagene plot around the stop codon: one line per sample, with a dashed
# reference line at the genome-wide mean density of 1.
ggplot(df_sp, aes(x = pos, y = density)) +
  geom_line(aes(color = sample)) +
  geom_hline(yintercept = 1, linetype = 'dashed', size = 1, color = 'grey50') +
  theme_classic(base_size = 16) +
  # scale_color_d3(name = '') +
  scale_color_futurama(name = '') +
  theme(legend.position = c(0.88, 0.88),
        legend.background = element_blank(),
        aspect.ratio = 0.6,
        strip.background = element_rect(color = NA, fill = 'grey')) +
  # facet_grid(exp~type) +
  labs(x = 'Distance from stop codon (nt)',
       y = 'Mean read density [AU]')

不知道为啥有两个特别高的峰,感觉还挺奇怪。
5结尾
质控部分差不多就到这里结束了。

欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
哦,数据代码已上传至QQ群,欢迎加入下载。
群二维码:
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀跟着 Cell 学 Ribo–seq 分析 三 (Metagene Plot)
◀RiboPlotR 优雅的可视化你的 Ribo–seq 数据
◀...
请关注“恒诺新知”微信公众号,感谢“R语言“,”数据那些事儿“,”老俊俊的生信笔记“,”冷🈚️思“,“珞珈R”,“生信星球”的支持!