跟着 Cell Reports 学做图-CLIP-seq 数据可视化
测试开头

点击上方,关注老俊俊!







测试结尾

1引言
承接上一篇推文,使用写的函数来复现一篇文章的结果。
文章标题:
HITS-CLIP and Integrative Modeling Define the Rbfox Splicing-Regulatory Network Linked to Brain Development and Autism
一篇关于 HITS-CLIP-seq 的文章,揭示了 Rbfox2 这个 RBP(RNA Binding Protein
) 调控剪接的机制。
原理:

通过 紫外交联 RBP 于 RNA 序列的结合,然后通过 蛋白酶的消化 获取结合的 RNA 序列,这样处理过的序列在后面反转 cDNA 序列时会引起序列的 突变 或者 截断 。 后面数据分析则是 寻找这些突变或者截断的位点 来准确识别 RBP 所结合的基序及位置,从而达到单碱基分辨率的效果。
复现图如下:

图注:

2计算相对距离
计算 RBFOX2 的 motif 的 depletion
/truncation
相对距离,详见上篇推文:
import re
import pandas as pd
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from pyfaidx import Fasta
定义函数:
def GetPeaksFa(peak,genomefie,outfasta,type='bed',a=0,b=0,minlen=5):
# load geneome
genome = Fasta(genomefie)
outfa = open(outfasta,'w')
# process peaks file
with open(peak,'r') as bed:
for line in bed:
fields = line.replace('n','').split()
chr = fields[0]
strand = fields[5]
start = int(fields[1])
end = int(fields[2])
if type == 'bed':
end = end
elif type == 'narrowpeak':
end = end + 1
else:
print('please mark your peaks file type(bed/narrowpeak)')
break
# extend upstream and downstram peaks
if strand == '+':
seq = genome[chr][(start - a):(end + b)].seq
elif strand == '-':
seq = genome[chr][(start + a):(end -b)].complement.reverse.seq
else:
seq = genome[chr][(start - a):(end + b)].seq
# mimimum seq length
if len(seq) >= minlen:
# fa name
name = fields[3] + '|' + '|'.join([chr,str(start),str(end)])
# output seq
outfa.write('>' + name + '|' + strand + 'n')
outfa.write(seq + 'n')
outfa.close()
###############################################################################################
def CalculatePeaksRelativeDist(peaksfa,motif='[G|A][G|A]AC[A|C|T]|[T|G|A]GT[C|T][C|T]',mid=3):
# save in list
relpos = []
# open fa
with open(peaksfa,'r') as seq:
for line in seq:
line = line.replace('n','')
if line.startswith('>'):
next
peakmid = round((len(line) + 1)/2,ndigits=0)
# search all motif
pattern = re.compile(motif,flags=re.IGNORECASE)
pos_res = pattern.finditer(line)
# shift to A site position
allpos = [pos.start() + mid for pos in pos_res]
# calculate relative to peak center bases
relposition = [pos - peakmid for pos in allpos]
# relposition = [peakmid - pos for pos in allpos]
# save in list
relpos.extend(relposition)
return relpos
提取序列
# 提取序列
GetPeaksFa(peak='RBFOX2-CIMS.bed',
genomefie='mm10.fa',type='bed',
a=50,b=50,
outfasta='RBFOX2-CIMS-peaks_sequnce.fa')
GetPeaksFa(peak='RBFOX2-CITS.bed',
genomefie='mm10.fa',type='bed',
a=50,b=50,
outfasta='RBFOX2-CITS-peaks_sequnce.fa')
计算距离
# 计算距离
CIMS = CalculatePeaksRelativeDist(peaksfa='RBFOX2-CIMS-peaks_sequnce.fa',
motif='TGCATG',mid=1)
CITS = CalculatePeaksRelativeDist(peaksfa='RBFOX2-CITS-peaks_sequnce.fa',
motif='TGCATG',mid=1)
保存数据
这里保存数据可以导入 R 里面进行可视化。
先定义一个保存函数:
# save data function
def saveData(listres,outputname):
tmp_freq = Counter(listres).most_common()
# sort ascending
tmp_freq.sort(key= lambda item:item[0])
# to data frame and save
tmp_freq_df = pd.DataFrame(tmp_freq,columns=['reldist','frequnecy'])
tmp_freq_df.to_csv(outputname,header=True,index=False)
保存:
saveData(listres=CIMS,outputname='CIMS_freq_df.csv')
saveData(listres=CITS,outputname='CITS_freq_df.csv')
3python 可视化
接下来对分析的数据进行绘图:
plt.figure(figsize=[16,5])
sns.set(style='ticks')
plt.subplot(1,2,1)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=CIMS,shade=False,color='#398AB9',alpha=1,bw_adjust=0.09,linewidth=4)
# labels
plt.xlabel("Distance to deletion site",fontsize=20)
plt.ylabel("Density of UGCAUG",fontsize=20)
# x lines
plt.axvline(x=0,ls="--",lw=2,c='black')
plt.axvline(x=-5,ls="--",lw=2,c='grey')
plt.xlim(-50,50)
plt.xticks(ticks=[-50,-30,-10,0,10,30,50],labels=[-50,-30,-10,0,10,30,50])
plt.legend(['UGCAUG'],frameon=False,fontsize = 16,loc='upper left')
plt.subplot(1,2,2)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=CIMS,shade=False,color='#398AB9',alpha=1,bw_adjust=0.09,linewidth=4)
# labels
plt.xlabel("Distance to deletion site",fontsize=20)
plt.ylabel("Density of UGCAUG",fontsize=20)
# x lines
plt.axvline(x=0,ls="--",lw=2,c='black')
plt.axvline(x=-5,ls="--",lw=2,c='grey')
plt.xlim(-6,6)
# plt.xticks(ticks=[-6,-5,0,5,6],labels=[-6,-5,0,5,6])
plt.legend(['UGCAUG'],frameon=False,fontsize = 16,loc='upper right')
# save
plt.savefig('Distance to deletion site.pdf',format='pdf')

左边是主图,右边是放大的小图。
绘制 truncation 的图:
plt.figure(figsize=[16,5])
sns.set(style='ticks')
plt.subplot(1,2,1)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=CITS,shade=False,color='#398AB9',alpha=1,bw_adjust=0.09,linewidth=4)
# labels
plt.xlabel("Distance to truncation site",fontsize=20)
plt.ylabel("Density of UGCAUG",fontsize=20)
# x lines
plt.axvline(x=0,ls="--",lw=2,c='black')
plt.axvline(x=-5,ls="--",lw=2,c='grey')
plt.xlim(-50,50)
plt.xticks(ticks=[-50,-30,-10,0,10,30,50],labels=[-50,-30,-10,0,10,30,50])
plt.legend(['UGCAUG'],frameon=False,fontsize = 16,loc='upper left')
plt.subplot(1,2,2)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=CITS,shade=False,color='#398AB9',alpha=1,bw_adjust=0.09,linewidth=4)
# labels
plt.xlabel("Distance to truncation site",fontsize=20)
plt.ylabel("Density of UGCAUG",fontsize=20)
# x lines
plt.axvline(x=0,ls="--",lw=2,c='black')
plt.axvline(x=-5,ls="--",lw=2,c='grey')
plt.xlim(-6,6)
# plt.xticks(ticks=[-6,-5,0,5,6],labels=[-6,-5,0,5,6])
plt.legend(['UGCAUG'],frameon=False,fontsize = 16,loc='upper right')
# save
plt.savefig('Distance to truncation site.pdf',format='pdf')

4计算离 VGCAUG 的距离画图
V 为 A/C/G
三个碱基:
CIMSv = CalculatePeaksRelativeDist(peaksfa='RBFOX2-CIMS-peaks_sequnce.fa',
motif='[A|C|G]GCATG',mid=1)
CITSv = CalculatePeaksRelativeDist(peaksfa='RBFOX2-CITS-peaks_sequnce.fa',
motif='[A|C|G]GCATG',mid=1)
saveData(listres=CIMSv,outputname='CIMSv_freq_df.csv')
saveData(listres=CITSv,outputname='CITSv_freq_df.csv')
画图:
plt.figure(figsize=[15,5])
sns.set(style='ticks')
plt.subplot(1,2,1)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=CIMSv,shade=False,color='#398AB9',alpha=1,bw_adjust=0.09,linewidth=4)
# labels
plt.xlabel("Distance to deletion site",fontsize=20)
plt.ylabel("Density of VGCAUG",fontsize=20)
# x lines
plt.axvline(x=0,ls="--",lw=2,c='black')
plt.axvline(x=-5,ls="--",lw=2,c='grey')
plt.xlim(-50,50)
plt.xticks(ticks=[-50,-30,-10,0,10,30,50],labels=[-50,-30,-10,0,10,30,50])
plt.legend(['VGCAUG'],frameon=False,fontsize = 16,loc='upper left')
plt.subplot(1,2,2)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=CIMSv,shade=False,color='#398AB9',alpha=1,bw_adjust=0.09,linewidth=4)
# labels
plt.xlabel("Distance to deletion site",fontsize=20)
plt.ylabel("Density of VGCAUG",fontsize=20)
# x lines
plt.axvline(x=0,ls="--",lw=2,c='black')
plt.axvline(x=-5,ls="--",lw=2,c='grey')
plt.xlim(-6,6)
# plt.xticks(ticks=[-6,-5,0,5,6],labels=[-6,-5,0,5,6])
plt.legend(['VGCAUG'],frameon=False,fontsize = 16,loc='upper right')

后面代码类似,这里只放图:

5导入 R 进行可视化
请关注“恒诺新知”微信公众号,感谢“R语言“,”数据那些事儿“,”老俊俊的生信笔记“,”冷🈚️思“,“珞珈R”,“生信星球”的支持!