Fisher Exact test for m6A peak and RBP binding sites / microRNA.
Data on cloud drive
import pandas as pd, numpy as np
from pybedtools import BedTool
from collections import Counter
import matplotlib.pyplot as plt
from scipy import stats
from fisher import pvalue
import pickle
import os
path = os.path.expanduser("~/Documents/m6A/")
input_m6A_peak = f"{path}/Data/metApeakFisher/unmerged_peaks_metApeak.bed"
input_m6A_peak_merged = f"{path}/Data/metApeakFisher/joint_merged_peaks_counts.xlsx"
input_m6A_nonpeak = f"{path}/Data/metApeakFisher/unmerged_nonpeaks_metApeak.bed"
input_RBP = f"{path}/Data/all.RBP.intersect.hg19.bed"
input_intersect_peak = f"{path}/Data/metApeakFisher/peak.intersect.RBP.bed"
input_intersect_nonpeak = f"{path}/Data/metApeakFisher/nonpeak.intersect.RBP.bed"
input_intersect_peak_merged = f"{path}/Data/metApeakFisher/peak.merged.intersect.RBP.bed"
output_m6A_peak_merged = f"{path}/Data/metApeakFisher/joint_merged_peaks_counts.bed"
output_RBP_fisher = f"{path}/output/output_RBP_fisher.pkl"
# microRNA
input_miRNA = f"{path}/Data/miRNA_target_in_YRI_LCLs.bed"
output_miRNA = f"{path}/Data/miRNA.bed"
input_intersect_peak_miRNA = f"{path}/Data/metApeakFisher/peak.intersect.miRNA.bed"
input_intersect_peak_merged_miRNA = f"{path}/Data/metApeakFisher/peak.merged.intersect.miRNA.bed"
input_intersect_nonpeak_miRNA = f"{path}/Data/metApeakFisher/nonpeak.intersect.miRNA.bed"
output_miRNA_fisher = f"{path}/output/output_miRNA_fisher.pkl"
def save_data(data, filename):
pickle.dump(data, open(filename, "wb"))
p = 0.05
Prepare and export bed file; load m6A peak/nonpeak and RBP binding datasets; check data quality before using.
In order to use bedtools
to obtain intersection of RBP and m6A peak, first we need to convert the original merged m6A peak, which is an excel, to bed file, then export.
Only need to do it once. bed file must be tab separated, false index, column "chr" rename to "# chr".
m6A_peaks_merged = pd.read_excel(input_m6A_peak_merged, usecols = [0,1,2,3,4,5])
m6A_peaks_merged = m6A_peaks_merged.rename(columns = {"chr": "# chr"})
m6A_peaks_merged.to_csv(output_m6A_peak_merged, index = False, sep = "\t")
m6A_peaks = pd.read_table(input_m6A_peak, header = 0)
m6A_peaks = m6A_peaks.sort_values(by = ["# chr", "chromStart"])
m6A_peaks = m6A_peaks.rename(columns = {"# chr": "chr"})
# m6A_peaks = m6A_peaks[m6A_peaks["score"] < 0.05]
m6A_nonpeaks = pd.read_table(input_m6A_nonpeak, header = 0)
m6A_nonpeaks = m6A_nonpeaks.sort_values(by = ["# chr", "chromStart"])
m6A_nonpeaks = m6A_nonpeaks.rename(columns = {"# chr": "chr"})
# m6A_nonpeaks = m6A_nonpeaks.iloc[:10000, :]
print (m6A_peaks.shape, m6A_peaks_merged.shape, m6A_nonpeaks.shape)
RBP_binding_sites = pd.read_table(input_RBP, sep="\t", header = None,
names = ["chr", "start", "end", "RBP", "peak_width", "strand"])
RBP_binding_sites = RBP_binding_sites[RBP_binding_sites["peak_width"] <= 200]
RBP_name = set(RBP_binding_sites["RBP"])
RBP_name = list(RBP_name)
RBP_name.sort()
print (set(RBP_binding_sites["RBP"]))
Binding widths concentrate around 30 bps.
peak_width = RBP_binding_sites["peak_width"].tolist()
plt.hist(peak_width, bins = 20) # arguments are passed to np.histogram
plt.title("Histogram of RBP binding sites peak width")
plt.show()
For merged/unmerged m6A peak and unmerged nonpeak, obtain their intersections with RBP binding sites respectively.
Use bedtools
in bash under the depository ~/Documents/m6A/Data/metApeakFisher
.
First Sort each m6A peak/nonpeak bed file.
sort -k1,1 -k2,2n unmerged_peaks_metApeak.bed > peaks.bed
sort -k1,1 -k2,2n unmerged_nonpeaks_metApeak.bed > nonpeaks.bed
Then use bedtools
to obtain intersection of m6A and RBP.
bedtools intersect -a ../all.RBP.intersect.hg19.bed -b peaks.bed -s > peak.intersect.RBP.bed
bedtools intersect -a ../all.RBP.intersect.hg19.bed -b nonpeaks.bed -s > nonpeak.intersect.RBP.bed
Since we do not have merged m6A nonpeak data, we use unmerged nonpeak instead (already sort and get intersection above).
sort -k1,1 -k2,2n joint_merged_peaks_counts.bed > peaks.merged.bed
bedtools intersect -a ../all.RBP.intersect.hg19.bed -b peaks.merged.bed -s > peak.merged.intersect.RBP.bed
Perform Fisher's exact test for each RBP binding site.
For each RBP, we need to obtain the four following numbers before Fisher's test:
def read_intersect(fn, head = None, cols = ["chr", "chromStart", "chromEnd", "name", "inter_width", "strand"]):
return pd.read_table(fn, header = head, names = cols)
def get_fisher(peak_fn, nonpeak_fn, peak_merge = False, cols = ['name', 'n_inter_peak', 'n_inter_nonpeak',
'n_nointer_peak', 'n_nointer_nonpeak',
'fisher_p', 'odds_ratio']):
peaks_intersect = read_intersect(peak_fn)
peaks_intersect["peak"] = 1
nonpeaks_intersect = read_intersect(nonpeak_fn)
nonpeaks_intersect["peak"] = 0
intersect = peaks_intersect.append(nonpeaks_intersect)
fisher_stats = pd.DataFrame(columns = cols)
idx = 0
sum_inter_peak = peaks_intersect.shape[0]
sum_inter_nonpeak = nonpeaks_intersect.shape[0]
for RBP in list(set(intersect["name"])):
fisher_stats.loc[idx, 'name'] = RBP
n_inter_peak = intersect[(intersect["name"] == RBP) & (intersect["peak"] == 1)].shape[0]
n_inter_nonpeak = intersect[(intersect["name"] == RBP) & (intersect["peak"] == 0)].shape[0]
n_nointer_peak = sum_inter_peak - n_inter_peak
n_nointer_nonpeak = sum_inter_nonpeak - n_inter_nonpeak
fisher_stats.loc[idx, 'n_inter_peak'] = n_inter_peak
fisher_stats.loc[idx, 'n_inter_nonpeak'] = n_inter_nonpeak
fisher_stats.loc[idx, 'n_nointer_peak'] = n_nointer_peak
fisher_stats.loc[idx, 'n_nointer_nonpeak'] = n_nointer_nonpeak
fisher_stats.loc[idx, 'fisher_p'] = stats.fisher_exact([[n_inter_peak, n_inter_nonpeak],
[n_nointer_peak, n_nointer_nonpeak]])[1]
fisher_stats.loc[idx, 'odds_ratio'] = stats.fisher_exact([[n_inter_peak, n_inter_nonpeak],
[n_nointer_peak, n_nointer_nonpeak]])[0]
idx += 1
fisher_stats = fisher_stats.sort_values(by = ["odds_ratio"], ascending = False)
fisher_stats = fisher_stats.set_index([[i for i in range(fisher_stats.shape[0])]])
return fisher_stats
fn1 = input_intersect_peak_merged
fn2 = input_intersect_nonpeak
fn3 = input_intersect_peak
fisher_peak_merge = get_fisher(peak_fn = fn1, nonpeak_fn = fn2)
fisher_peak_unmerge = get_fisher(peak_fn = fn3, nonpeak_fn = fn2)
save_data(fisher_peak_merge, output_RBP_fisher)
Check the test statistics for potential m6A binding proteins (RBPs contain keywords "YTHDF", "HNRNP", "FMRP", "IGF2BP" or "G3BP").
There are no RBPs that contains keywords "YTHDF" or "G3BP". Only "HNRNP", "FMR1" and "IGF2BP" are found, including 'IGF2BP3', 'HNRNPUL1', 'HNRNPA1', 'HNRNPM', 'IGF2BP1', 'FMR1', 'HNRNPK', 'HNRNPU', 'IGF2BP2', 'HNRNPC'.
RBP_potential = ["YTHDF", "HNRNP", "FMR1", "IGF2BP", "G3BP"]
RBP_names = fisher_peak_merge["name"].tolist()
RBPs_potential = list()
for RBP in RBP_names:
tmp = [RBP for item in RBP_potential if RBP.find(item) == 0]
RBPs_potential.extend(tmp)
print (RBPs_potential)
fisher_peak_merge
res = pd.read_pickle(output_RBP_fisher)
output_RBP_fisher_excel = f"{path}/output/output_RBP_fisher.xlsx"
writer = pd.ExcelWriter(output_RBP_fisher_excel)
res.to_excel(writer)
writer.save()
fisher_peak_merge[fisher_peak_merge["name"].isin(RBPs_potential)]
fisher_peak_unmerge[fisher_peak_unmerge["name"].isin(RBPs_potential)]
Perform Fisher's exact test for each microRNA.
miRNA = pd.read_table(input_miRNA, header = 0, usecols = [0,1,2,3,5,10,11,12])
miRNA["name_gene"] = miRNA.apply(lambda row: row["name"].split(":")[0], axis = 1)
There are 55 microRNAs included in dataset.
Binding sites in one block are not always single. For example, "6,1" and "2,5" also exists in "blockSize".
print (Counter(miRNA["miRNA"]), "\n", len(set(miRNA["miRNA"])))
print (Counter(miRNA["blockSize"]))
microRNA blockCount can be 1 or 2. If blockCount is 2, blockSize and blockStarts contain two numbers, binding sites in one block are not continuous.
For rows whose blockCount is 2, split the row into two rows according to blockSize and blockStarts.
# separate the blocks with multiple blockSize
cols_miRNA = ["chr", "start", "end", "miRNA", "blockSize", "strand"]
single_block_miRNA = miRNA[miRNA["blockSize"].isin(["7", "8"])][cols_miRNA]
multi_block_miRNA = miRNA[~miRNA["blockSize"].isin(["7", "8"])]
tmp_col = ['chr', 'start', 'end', 'name', 'strand', 'miRNA', 'name_gene']
tmp_miRNA = pd.DataFrame(columns = tmp_col + ["blockSize", "interval"])
i = 0
for index, row in multi_block_miRNA.iterrows():
if len(row["blockSize"].split(",")) != len(row["blockStarts"].split(",")):
print ("The length of 'blockSize' and 'blockStarts' are different")
break
else:
for num in range(len(row["blockSize"].split(","))):
tmp1 = [int(row["blockSize"].split(",")[num]), int(row["blockStarts"].split(",")[num])]
tmp = row[tmp_col].tolist()
tmp.extend(tmp1)
tmp_miRNA.loc[i] = tmp
i += 1
tmp_miRNA["start"] = tmp_miRNA.apply(lambda row: row["start"] + row["interval"], axis = 1)
tmp_miRNA["end"] = tmp_miRNA.apply(lambda row: row["start"] + row["blockSize"], axis = 1)
An example of splitting a row with two blockSize/blockStarts into two rows.
tmp_miRNA[tmp_miRNA["name_gene"] == "NRD1"]
single_block_miRNA = single_block_miRNA.append(tmp_miRNA[cols_miRNA])
Export bed file of unique block start and end in each row
cols_sortby = ["miRNA", "chr", "start"]
single_block_miRNA.sort_values(by = cols_sortby).to_csv(output_miRNA, index = False, header = False, sep = "\t")
Use bedtools
to obtain intersect of peak/nonpeak with miRNA under the folder
~/Documents/m6A/Data/metApeakFisher
bedtools intersect -a ../miRNA.bed -b peaks.bed -s > peak.intersect.miRNA.bed
bedtools intersect -a ../miRNA.bed -b peaks.merged.bed -s > peak.merged.intersect.miRNA.bed
bedtools intersect -a ../miRNA.bed -b nonpeaks.bed -s > nonpeak.intersect.miRNA.bed
Load data of intersections of m6A peak and miRNA.
fn4 = input_intersect_peak_miRNA
fn5 = input_intersect_nonpeak_miRNA
fn6 = input_intersect_peak_merged_miRNA
fisher_peak_merge_miRNA = get_fisher(peak_fn = fn4, nonpeak_fn = fn5)
fisher_peak_unmerge_miRNA = get_fisher(peak_fn = fn6, nonpeak_fn = fn5)
save_data(fisher_peak_merge_miRNA.sort_values(by =["odds_ratio"], ascending = False), output_miRNA_fisher)
fisher_peak_merge_miRNA.sort_values(by =["odds_ratio"], ascending = False)
Select those miRNA with p-value less than 0.05.
fisher_peak_merge_miRNA[fisher_peak_merge_miRNA["fisher_p"] < p].sort_values(by = ["odds_ratio"], ascending = False)
All miRNA and those whose p-value less than 0.05.
fisher_peak_unmerge_miRNA.sort_values(by =["odds_ratio"], ascending = False)
fisher_peak_unmerge_miRNA[fisher_peak_unmerge_miRNA["fisher_p"] < p].sort_values(by = ["odds_ratio"], ascending = False)
Exported from analysis/20180212_Enrichment_annalysis_m6A_peak_with_RBP_or_miRNA.ipynb
committed by Min Qiao on Wed Mar 28 20:03:22 2018 revision 2, d71b8f4