m6A metagene distribution: correcting the coordinates

1 Introduction
For part one of this series, see the earlier post "m6A metagene distribution 计算详解":
I strongly recommend reading that post before coming back to this one.
Recall:
In the previous post's conversion from genomic coordinates to transcript coordinates, minus-strand features had their per-base positions numbered in the reverse direction; but because we computed the relative position as 1 minus the ratio at the end, the result came out essentially the same. The code in question:
# calculate relative distance
df_anno['rel_dist'] = 0
for mid in df_anno['id']:
    _, _, ftype, length, pos = mid.split(sep='|')   # 'ftype' avoids shadowing the builtin type()
    strand_type = df_anno.loc[df_anno['id'] == mid, 'strand'].iloc[0]   # .iloc[0] instead of the deprecated Series.bool()
    if strand_type == '+':
        if ftype == '5UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length)
        elif ftype == 'CDS':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 1
        elif ftype == '3UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 2
    elif strand_type == '-':
        if ftype == '5UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = 1 - int(pos)/int(length)
        elif ftype == 'CDS':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = (1 - int(pos)/int(length)) + 1
        elif ftype == '3UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = (1 - int(pos)/int(length)) + 2
# load feature length info
featureinfo = pd.read_csv('longest_transcripts_info.csv')
featureinfo = featureinfo[['transid','translength','utr5length','cdslength','utr3length']]
# merge feature length info
mergeinfo = df_anno.merge(featureinfo, on='transid')
mergeinfo
Explanation:

If you want the genuinely correct positions shown in the third plot, that is also possible: you only need to sort the per-base coordinates first.
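As a quick numeric sanity check of the "results are the same" claim (a toy sketch with a made-up length): if a minus-strand feature of length L has its bases numbered 1..L in ascending genomic order, the true transcript-order position of base p is L - p + 1, so the proper ratio (L - p + 1)/L and the shortcut 1 - p/L differ by exactly one base width, 1/L, which is invisible at plotting resolution.
# toy check: minus-strand feature, made-up length L
L = 1485
for p in (1, 500, 1485):               # base index in ascending genomic order
    true_rel = (L - p + 1) / L         # ratio after properly reversing the order
    shortcut = 1 - p / L               # what the previous post computed
    print(p, round(true_rel, 4), round(shortcut, 4))   # differ by exactly 1/L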
2 Coordinate conversion
Read the tmp.txt file, which records the single-base genomic position of every feature:
# read big file faster
reader = pd.read_csv('tmp.txt', sep='t',names=['chr','start','end','strand','id'],iterator=True)
loop = True
chunkSize = 1000000
chunks = []
while loop:
try:
chunk = reader.get_chunk(chunkSize)
chunks.append(chunk)
except StopIteration:
loop = False
print("Iteration is stopped.")
tmpinfo = pd.concat(chunks, ignore_index=True)
tmpinfo
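As an aside, the same chunked read can be written more compactly with the chunksize argument of pd.read_csv, which returns an iterable of DataFrame chunks; a sketch equivalent to the loop above:
import pandas as pd

# chunksize makes read_csv return an iterator of DataFrame chunks
chunks = pd.read_csv('tmp.txt', sep='\t',
                     names=['chr', 'start', 'end', 'strand', 'id'],
                     chunksize=1000000)
tmpinfo = pd.concat(chunks, ignore_index=True)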

Split by strand and sort:
# order "-" starnd sigle postion decreasing
tmpinfo_1_strand = tmpinfo[tmpinfo['strand'] == '+']
tmpinfo_1_strand = tmpinfo_1_strand.sort_values(by = ['id','chr','start','end'],ascending = True)
# descrease coord by - strand gene
tmpinfo_2_strand = tmpinfo[tmpinfo['strand'] == '-']
tmpinfo_2_strand = tmpinfo_2_strand.sort_values(by = ['id','chr','start','end'],ascending = False)
# merge
tmp_fianl = pd.concat([tmpinfo_1_strand,tmpinfo_2_strand],axis=0)
# save
tmp_fianl.to_csv('singlebase-region-info.txt',index=False,header=False,sep='t')
tmp_fianl

You can see that the minus strand is now in descending order; the quick check below confirms it programmatically.
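A sketch of that check, assuming the tmp_final DataFrame from above: within each minus-strand id, the genomic starts should be monotonically decreasing.
# sanity check: within each minus-strand id, genomic starts should now decrease
minus = tmp_final[tmp_final['strand'] == '-']
all_desc = (minus.groupby('id', sort=False)['start']
                 .apply(lambda s: s.is_monotonic_decreasing)
                 .all())
print(all_desc)  # expect True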
Next, append each feature's length and transcript position to its id:
df = pd.read_csv('longest_transcripts_info.csv')
df['transid'] = [line.split(sep='|')[2] for line in df['ID']]
# save each feature's lengths in a dict
tansid_lengthinfo = {}
for tid in df['transid']:
    tmpfile = df[df['transid'] == tid]
    utr5 = int(tmpfile['utr5length'].iloc[0])   # .iloc[0] instead of the deprecated int(Series)
    cds = int(tmpfile['cdslength'].iloc[0])
    utr3 = int(tmpfile['utr3length'].iloc[0])
    tansid_lengthinfo[tid] = [utr5, cds, utr3]
# append feature length and transcript position to each id
tmpfinal = open('tmpfinal.txt', 'w')
myid = " "
with open('singlebase-region-info.txt', 'r') as FormatedData:
    for line in FormatedData:
        fid = line.split()
        rid = fid[4]   # 'rid' avoids shadowing the builtin id()
        # split the id: gene|transcript|feature
        _, tranid, feature = rid.split(sep='|')
        # look up the feature length
        if feature == '5UTR':
            feature_len = tansid_lengthinfo[tranid][0]
        elif feature == 'CDS':
            feature_len = tansid_lengthinfo[tranid][1]
        elif feature == '3UTR':
            feature_len = tansid_lengthinfo[tranid][2]
        # restart the 1-based position counter whenever a new id begins
        if rid != myid:
            i = 0
            myid = rid
        i += 1
        tmpfinal.write(line.replace('\n','') + "|" + str(feature_len) + "|" + str(i) + "\n")
tmpfinal.close()
The resulting file looks like this:
chr19 58864802 58864803 - A1BG|NM_130786.4|CDS|1485|1
chr19 58864801 58864802 - A1BG|NM_130786.4|CDS|1485|2
chr19 58864800 58864801 - A1BG|NM_130786.4|CDS|1485|3
chr19 58864799 58864800 - A1BG|NM_130786.4|CDS|1485|4
chr19 58864798 58864799 - A1BG|NM_130786.4|CDS|1485|5
chr19 58864797 58864798 - A1BG|NM_130786.4|CDS|1485|6
chr19 58864796 58864797 - A1BG|NM_130786.4|CDS|1485|7
chr19 58864795 58864796 - A1BG|NM_130786.4|CDS|1485|8
chr19 58864794 58864795 - A1BG|NM_130786.4|CDS|1485|9
chr19 58864793 58864794 - A1BG|NM_130786.4|CDS|1485|10
You can see that, reading down the rows, the genomic coordinate decreases while the transcript position (the last field) increases, which is exactly what we expect for a minus-strand gene.
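As an aside, the per-id position counter above can be reproduced with a single vectorized pandas call (a sketch over the sorted tmp_final DataFrame from section 2; cumcount() numbers the rows of each id group in their current order):
# vectorized equivalent of the i counter: 1-based row number within each id,
# following tmp_final's already-sorted row order
tmp_final['pos'] = tmp_final.groupby('id', sort=False).cumcount() + 1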
3 Calculating relative distances
Next, annotate the peaks and calculate their relative distances:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
The calculation, packed into a function:
# pack all the code into a function
def CalculateRelativeDist(peaksfile, annotatedinfo, featurelengthinfo):
    print('Calculation is starting ......')
    ############################################ 1. read peaks
    bed = pd.read_table(peaksfile, sep='\t',
                        names=['chr','start','end','id','score','strand'],
                        usecols=range(0,6))
    # if the peaks are not already single-base, take each peak's midpoint
    if list(bed['end'] - bed['start']).count(1) != bed.shape[0]:
        bed['start'] = ((bed['start'] + bed['end'])/2).astype(int)
        bed['end'] = bed['start'] + 1
    # save into a dict (note: if two peaks on different chromosomes share a start, only the last survives)
    peaksinfo = {start: chr for chr, start in zip(bed['chr'], bed['start'].astype(int))}
    ############################################ 2. intersect peaks
    annoedbed = []
    with open(annotatedinfo, 'r') as annoinfo:
        for line in annoinfo:
            sepinfo = line.replace('\n','').split()
            chr = str(sepinfo[0])
            start = int(sepinfo[1])
            if start in peaksinfo and chr == peaksinfo[start]:
                annoedbed.append(sepinfo)
    # add transid
    df_anno = pd.DataFrame(annoedbed, columns=['chr','start','end','strand','id'])
    df_anno['transid'] = [line.split(sep='|')[1] for line in df_anno['id']]
    ############################################ 3. calculate relative distance
    df_anno['rel_dist'] = 0
    for mid in df_anno['id']:
        _, _, ftype, length, pos = mid.split(sep='|')
        if ftype == '5UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length)
        elif ftype == 'CDS':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 1
        elif ftype == '3UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 2
    # load feature length info
    featureinfo = pd.read_csv(featurelengthinfo)
    featureinfo = featureinfo[['transid','translength','utr5length','cdslength','utr3length']]
    # merge feature length info
    mergeinfo = df_anno.merge(featureinfo, on='transid')
    # remove duplicated transcripts
    uniq_df = mergeinfo[['transid','utr5length','cdslength','utr3length']]
    uniq_df = uniq_df.drop_duplicates(keep='first')
    # calculate region scale factors
    utr5_SF = uniq_df['utr5length'].median(skipna=True) / uniq_df['cdslength'].median(skipna=True)
    utr3_SF = uniq_df['utr3length'].median(skipna=True) / uniq_df['cdslength'].median(skipna=True)
    # select each feature's rel_dist
    utr5_m6a_dist = mergeinfo.loc[mergeinfo['rel_dist'] < 1, 'rel_dist']
    cds_m6a_dist = mergeinfo.loc[(mergeinfo['rel_dist'] >= 1) & (mergeinfo['rel_dist'] < 2), 'rel_dist'].values.tolist()
    utr3_m6a_dist = mergeinfo.loc[mergeinfo['rel_dist'] >= 2, 'rel_dist']
    # rescale the UTR regions
    # scale utr5
    scaler5 = MinMaxScaler(feature_range=(1 - utr5_SF, 1))
    utr5_m6a_dist_scaled = scaler5.fit_transform(np.array(utr5_m6a_dist).reshape(-1, 1))
    utr5_m6a_dist_scaled = [y for x in utr5_m6a_dist_scaled.tolist() for y in x]
    # scale utr3
    scaler3 = MinMaxScaler(feature_range=(2, 2 + utr3_SF))
    utr3_m6a_dist_scaled = scaler3.fit_transform(np.array(utr3_m6a_dist).reshape(-1, 1))
    utr3_m6a_dist_scaled = [y for x in utr3_m6a_dist_scaled.tolist() for y in x]
    # combine the three series into one list
    scaled_rel_dist = []
    scaled_rel_dist.extend(utr5_m6a_dist_scaled)
    scaled_rel_dist.extend(cds_m6a_dist)
    scaled_rel_dist.extend(utr3_m6a_dist_scaled)
    # add as a single column (values are grouped by feature, so individual rows may not
    # line up with their own peak; the pooled column is only used for density plots)
    mergeinfo['scaled_rel_dist'] = scaled_rel_dist
    print('Calculation is done !')
    return mergeinfo
Note that here there is no longer any need to subtract from 1:
# calculate relative distance
df_anno['rel_dist'] = 0
for mid in df_anno['id']:
    _, _, ftype, length, pos = mid.split(sep='|')
    if ftype == '5UTR':
        df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length)
    elif ftype == 'CDS':
        df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 1
    elif ftype == '3UTR':
        df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 2
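As an aside, this loop does a full-column comparison on every iteration and gets slow on large peak sets; a vectorized sketch of the same computation, assuming every id follows the gene|transcript|feature|length|pos format:
# vectorized relative distance: split each id once, map feature -> offset
parts = df_anno['id'].str.split('|', expand=True)
offset = parts[2].map({'5UTR': 0, 'CDS': 1, '3UTR': 2})
df_anno['rel_dist'] = parts[4].astype(int) / parts[3].astype(int) + offset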
Load the peaks; here one m6A peaks file and one m1A peaks file are used:
m1A = CalculateRelativeDist(peaksfile='peak/m1A-peaks.txt',
                            annotatedinfo='tmpfinal.txt',
                            featurelengthinfo='longest_transcripts_info.csv')
control = CalculateRelativeDist(peaksfile='peak/GSE102493_Ctrl_m6A.narrowPeak',
                                annotatedinfo='tmpfinal.txt',
                                featurelengthinfo='longest_transcripts_info.csv')
Calculation is starting ......
Calculation is done !
Calculation is starting ......
Calculation is done !
Rename the columns:
# rename columns
m1A = m1A.rename(columns={'rel_dist':'m1A_rel_dist','scaled_rel_dist':'m1A_scaled_rel_dist'})
control = control.rename(columns={'rel_dist':'control_rel_dist','scaled_rel_dist':'control_scaled_rel_dist'})
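Before plotting, a quick toy illustration (made-up numbers) of the UTR rescaling step inside CalculateRelativeDist: MinMaxScaler maps the observed [min, max] of its input linearly onto feature_range, so 5'UTR relative positions originally in [0, 1) are squeezed into an interval of width utr5_SF ending at 1, flush against the CDS.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

utr5_SF = 0.2                      # pretend the median 5'UTR is 20% of the median CDS
x = np.array([0.0, 0.5, 1.0]).reshape(-1, 1)
scaled = MinMaxScaler(feature_range=(1 - utr5_SF, 1)).fit_transform(x)
print(scaled.ravel())              # [0.8 0.9 1. ]: the 5'UTR now spans 0.2 units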
4 Visualization
plt.figure(figsize=[20,5])
sns.set(style='ticks')
###############################
plt.subplot(1,2,1)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=control['control_rel_dist'],fill=True,color='#1572A1',alpha=0.4,legend=True,bw_method=0.01) # control; fill= replaces the deprecated shade=
sns.kdeplot(data=m1A['m1A_rel_dist'],fill=True,color='#DA1212',alpha=0.4,legend=True,bw_method=0.01) # m1A
# x lines
plt.axvline(x=1,ls="--",lw=2,c='grey')
plt.axvline(x=2,ls="--",lw=2,c='grey')
plt.xlim(0,3)
# labels
plt.xlabel("m1/6A relative position",fontsize=18)
plt.xticks(ticks=[0.5,1.5,2.5],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m1/6A peak density",fontsize=18)
plt.title("Equal feature length",fontsize=18)
plt.subplot(1,2,2)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=control['control_scaled_rel_dist'],fill=False,color='#1572A1',alpha=1,legend=True,bw_method=0.01) # control
sns.kdeplot(data=m1A['m1A_scaled_rel_dist'],fill=False,color='#DA1212',alpha=1,legend=True,bw_method=0.01) # m1A
# x lines
plt.axvline(x=1,ls="--",lw=2,c='grey')
plt.axvline(x=2,ls="--",lw=2,c='grey')
# labels
plt.xlabel("m1/6A relative position",fontsize=18)
plt.xticks(ticks=[0.9,1.5,2.5],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m1/6A peak density",fontsize=18)
plt.title("Scaled feature length",fontsize=18)
# save
plt.savefig('m1A_m6A_density.pdf',format='pdf')

The figure below is from the original m1A paper:

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4842015/
Histogram versions:
plt.figure(figsize=[30,15])
plt.subplot(2,2,1)
# label size
plt.tick_params(labelsize=18)
plt.hist(x=m1A['m1A_rel_dist'],bins=100)
# x lines
plt.axvline(x=1,ls="--",lw=2,c='black')
plt.axvline(x=2,ls="--",lw=2,c='black')
plt.xlim(0,3)
# labels
plt.xlabel("m1A relative position",fontsize=18)
plt.xticks(ticks=[0.5,1.5,2.5],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m1A peaks",fontsize=18)
plt.title("Equal feature length",fontsize=18)
plt.subplot(2,2,2)
# label size
plt.tick_params(labelsize=18)
plt.hist(x=m1A['m1A_scaled_rel_dist'],bins=100)
# x lines
plt.axvline(x=1,ls="--",lw=2,c='black')
plt.axvline(x=2,ls="--",lw=2,c='black')
# labels
plt.xlabel("m1A relative position",fontsize=18)
plt.xticks(ticks=[0.9,1.5,2.5],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m1A peaks",fontsize=18)
plt.title("Scaled feature length",fontsize=18)
plt.subplot(2,2,3)
# label size
plt.tick_params(labelsize=18)
plt.hist(x=control['control_rel_dist'],bins=100)
# x lines
plt.axvline(x=1,ls="--",lw=2,c='black')
plt.axvline(x=2,ls="--",lw=2,c='black')
plt.xlim(0,3)
# labels
plt.xlabel("m6A relative position",fontsize=18)
plt.xticks(ticks=[0.5,1.5,2.5],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m6A peaks",fontsize=18)
plt.title("Equal feature length",fontsize=18)
plt.subplot(2,2,4)
# label size
plt.tick_params(labelsize=18)
plt.hist(x=control['control_scaled_rel_dist'],bins=100)
# x lines
plt.axvline(x=1,ls="--",lw=2,c='black')
plt.axvline(x=2,ls="--",lw=2,c='black')
# labels
plt.xlabel("m6A relative position",fontsize=18)
plt.xticks(ticks=[0.9,1.5,2.5],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m6A peaks",fontsize=18)
plt.title("Scaled feature length",fontsize=18)
# save
# plt.savefig('hist_m1A.pdf',format='pdf')

5 Binning the 5'UTR-CDS-3'UTR
Of the two visualizations above, one gives the three regions equal length, and the other normalizes their lengths by the regions' relative median sizes.
A third option is to assign the three regions whatever custom proportions you like.
Here we divide the transcript into 100 bins, with the 5'UTR, CDS and 3'UTR taking 15, 45 and 40 of them respectively (the arithmetic behind the constants is sketched below).
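A minimal sketch of where the 2/3 and 8/9 in the modified function come from: with the CDS kept on [1, 2] and carrying 45 bins, one bin equals 1/45 of a CDS unit, so the widths of the two UTRs, and the tick midpoints used later when plotting, follow directly.
bins_utr5, bins_cds, bins_utr3 = 15, 45, 40    # 100 bins in total
unit = 1 / bins_cds                            # one bin in CDS units (CDS spans [1, 2])
utr5_range = (1 - bins_utr5 * unit, 1)         # (2/3, 1): width 1/3
utr3_range = (2, 2 + bins_utr3 * unit)         # (2, 2 + 8/9): width 8/9
# tick midpoints used in the plotting code: 5/6 for the 5'UTR, 22/9 for the 3'UTR
print(sum(utr5_range) / 2, sum(utr3_range) / 2)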
The modified function:
# pack all the code into a function
def CalculateRelativeDist(peaksfile, annotatedinfo, featurelengthinfo):
    print('Calculation is starting ......')
    ############################################ 1. read peaks
    bed = pd.read_table(peaksfile, sep='\t',
                        names=['chr','start','end','id','score','strand'],
                        usecols=range(0,6))
    # if the peaks are not already single-base, take each peak's midpoint
    if list(bed['end'] - bed['start']).count(1) != bed.shape[0]:
        bed['start'] = ((bed['start'] + bed['end'])/2).astype(int)
        bed['end'] = bed['start'] + 1
    # save into a dict
    peaksinfo = {start: chr for chr, start in zip(bed['chr'], bed['start'].astype(int))}
    ############################################ 2. intersect peaks
    annoedbed = []
    with open(annotatedinfo, 'r') as annoinfo:
        for line in annoinfo:
            sepinfo = line.replace('\n','').split()
            chr = str(sepinfo[0])
            start = int(sepinfo[1])
            if start in peaksinfo and chr == peaksinfo[start]:
                annoedbed.append(sepinfo)
    # add transid
    df_anno = pd.DataFrame(annoedbed, columns=['chr','start','end','strand','id'])
    df_anno['transid'] = [line.split(sep='|')[1] for line in df_anno['id']]
    ############################################ 3. calculate relative distance
    df_anno['rel_dist'] = 0
    for mid in df_anno['id']:
        _, _, ftype, length, pos = mid.split(sep='|')
        if ftype == '5UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length)
        elif ftype == 'CDS':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 1
        elif ftype == '3UTR':
            df_anno.loc[df_anno['id'] == mid, 'rel_dist'] = int(pos)/int(length) + 2
    # load feature length info
    featureinfo = pd.read_csv(featurelengthinfo)
    featureinfo = featureinfo[['transid','translength','utr5length','cdslength','utr3length']]
    # merge feature length info
    mergeinfo = df_anno.merge(featureinfo, on='transid')
    # (the median-based scale factors of the previous version are replaced by fixed ranges)
    # select each feature's rel_dist
    utr5_m6a_dist = mergeinfo.loc[mergeinfo['rel_dist'] < 1, 'rel_dist']
    cds_m6a_dist = mergeinfo.loc[(mergeinfo['rel_dist'] >= 1) & (mergeinfo['rel_dist'] < 2), 'rel_dist'].values.tolist()
    utr3_m6a_dist = mergeinfo.loc[mergeinfo['rel_dist'] >= 2, 'rel_dist']
    # rescale the UTR regions to fixed widths (5'UTR:CDS:3'UTR = 15:45:40 bins)
    # scale utr5 onto (2/3, 1)
    scaler5 = MinMaxScaler(feature_range=(2/3, 1))
    utr5_m6a_dist_scaled = scaler5.fit_transform(np.array(utr5_m6a_dist).reshape(-1, 1))
    utr5_m6a_dist_scaled = [y for x in utr5_m6a_dist_scaled.tolist() for y in x]
    # scale utr3 onto (2, 2 + 8/9)
    scaler3 = MinMaxScaler(feature_range=(2, 2 + (8/9)))
    utr3_m6a_dist_scaled = scaler3.fit_transform(np.array(utr3_m6a_dist).reshape(-1, 1))
    utr3_m6a_dist_scaled = [y for x in utr3_m6a_dist_scaled.tolist() for y in x]
    # combine the three series into one list
    scaled_rel_dist = []
    scaled_rel_dist.extend(utr5_m6a_dist_scaled)
    scaled_rel_dist.extend(cds_m6a_dist)
    scaled_rel_dist.extend(utr3_m6a_dist_scaled)
    # add as a single column
    mergeinfo['scaled_rel_dist'] = scaled_rel_dist
    print('Calculation is done !')
    return mergeinfo
Read and plot in the same way:
m1A = CalculateRelativeDist(peaksfile='peak/m1A-peaks.txt',
                            annotatedinfo='tmpfinal.txt',
                            featurelengthinfo='longest_transcripts_info.csv')
control = CalculateRelativeDist(peaksfile='peak/GSE102493_Ctrl_m6A.narrowPeak',
                                annotatedinfo='tmpfinal.txt',
                                featurelengthinfo='longest_transcripts_info.csv')
# rename columns
m1A = m1A.rename(columns={'rel_dist':'m1A_rel_dist','scaled_rel_dist':'m1A_scaled_rel_dist'})
control = control.rename(columns={'rel_dist':'control_rel_dist','scaled_rel_dist':'control_scaled_rel_dist'})
plt.figure(figsize=[20,5])
sns.set(style='ticks')
###############################
plt.subplot(1,2,1)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=control['control_rel_dist'],fill=True,color='#1572A1',alpha=0.4,legend=True,bw_method=0.01) # control
sns.kdeplot(data=m1A['m1A_rel_dist'],fill=True,color='#DA1212',alpha=0.4,legend=True,bw_method=0.01) # m1A
# x lines
plt.axvline(x=1,ls="--",lw=2,c='grey')
plt.axvline(x=2,ls="--",lw=2,c='grey')
plt.xlim(0,3)
# labels
plt.xlabel("m1/6A relative position",fontsize=18)
plt.xticks(ticks=[0.5,1.5,2.5],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m1/6A peak density",fontsize=18)
plt.title("Equal feature length",fontsize=18)
plt.subplot(1,2,2)
# label size
plt.tick_params(labelsize=18)
# density plot
sns.kdeplot(data=control['control_scaled_rel_dist'],fill=False,color='#1572A1',alpha=1,legend=True,bw_method=0.01) # control
sns.kdeplot(data=m1A['m1A_scaled_rel_dist'],fill=False,color='#DA1212',alpha=1,legend=True,bw_method=0.01) # m1A
# x lines
plt.axvline(x=1,ls="--",lw=2,c='grey')
plt.axvline(x=2,ls="--",lw=2,c='grey')
plt.xlim(2/3,(2+8/9))
# labels
plt.xlabel("m1/6A relative position",fontsize=18)
plt.xticks(ticks=[5/6,1.5,22/9],labels=["5'UTR",'CDS',"3'UTR",])
plt.ylabel("m1/6A peak density",fontsize=18)
plt.title("Scaled feature length",fontsize=18)
# save
plt.savefig('m1A_m6A_density_100bin.pdf',format='pdf')

This gives proportions similar to the following:

6 Closing remarks
This code takes around ten minutes to run.
All the data are in the post linked at the beginning; the m1A peaks file has also been uploaded to the QQ group folder.

So, did you learn something today? That's all for this post; stay tuned for the next one!