dbSNP
¶Obtain mutation position according to its chromosome and position on genotype, and get responding snpID
on dbSNP
.
import pandas as pd
import os
import matplotlib.pyplot as plt
path = os.path.expanduser("~/Documents/m6A/")
exon_m6AQTL = f"{path}/Data/exon_m6AQTLs.txt"
output_exon_m6AQTL = f"{path}/Data/exon_m6AQTLs.bed"
intron_m6AQTL = f"{path}/Data/intron_m6AQTLs.txt"
output_intron_m6AQTL = f"{path}/Data/intron_m6AQTLs.bed"
output_dbSNP = f"{path}/Data/dbSNP/dbSNP.vcf.gz"
Import dbSNP and export the first five columns, only need to do it once.
# input_dbSNP = f"{path}/Data/dbSNP/common_all_20170710.vcf.gz"
# dbSNP = pd.read_table(input_dbSNP, header = 56, compression = "gzip", usecols = [0,1,2,3,4], low_memory = False)
# dbSNP.to_csv(output_dbSNP, compression = "gzip", sep = "\t", index = False)
Handle exon and intron m6AQTL data
def load_m6AQTL(data_path, n = 200, n_width = 1000):
data = pd.read_table(data_path, header = 0)
data = data.sort_values(by = ["chr", "pos"])
data = data.set_index([[i for i in range(data.shape[0])]])
data["strand"] = data.apply(lambda row: row["peakID"].split("_")[-1], axis = 1)
data["gene_symbol"] = data.apply(lambda row: row["peakID"].split("_")[0], axis = 1)
data["start"] = data.apply(lambda row: int(row["peakID"].split("_")[1]), axis = 1)
data["end"] = data.apply(lambda row: int(row["peakID"].split("_")[2]), axis = 1)
data["width"] = data.apply(lambda row: row["end"] - row["start"] + 1, axis = 1)
data["pos1"] = data.apply(lambda row: row["pos"] - n, axis = 1)
data["pos2"] = data.apply(lambda row: row["pos"] + n, axis = 1)
# data["gap"] = data.apply(lambda row: row["start"] - row["pos"], axis = 1)
# data["in"] = data.apply(lambda row: 1 if row["pos"] >= row["start"] and row["pos"] <= row["end"] else 0,axis=1)
# only keep those width <= 1000
data = data[data["width"] <= n_width]
return data
intron_m6AQTL = load_m6AQTL(intron_m6AQTL, )
exon_m6AQTL = load_m6AQTL(exon_m6AQTL)
print (intron_m6AQTL.shape, exon_m6AQTL.shape)
exon_m6AQTL[["chr", "pos", "peakID", "snpID", "start", "end", "width", "pos1", "pos2", "gene_symbol", "strand"]].head(5)
intron_m6AQTL[["chr", "pos", "peakID", "snpID", "start", "end", "width", "pos1", "pos2", "gene_symbol", "strand"]].head(5)
# n1 = 1000
# # gap = intron_m6AQTL["gap"].tolist()
# intron_gap = intron_m6AQTL[(intron_m6AQTL["gap"] <= n1) & (intron_m6AQTL["gap"] >= -n1)]["gap"].tolist()
# plt.hist(intron_gap, bins = 30)
# plt.title("Histogram of gap in introns (the distance between SNP pos and start of peak)")
# plt.show()
# exon_gap = exon_m6AQTL[(exon_m6AQTL["gap"] <= n1) & (exon_m6AQTL["gap"] >= -n1)]["gap"].tolist()
# plt.hist(exon_gap, bins = 30)
# plt.title("Histogram of gap in exons (the distance between SNP pos and start of peak)")
# plt.show()
dbSNP = pd.read_table(output_dbSNP, compression = "gzip", header = 0, sep = "\t", low_memory = False)
dbSNP.head(5)
def get_m6AQTL(data, output_path):
res = pd.merge(data, dbSNP, how = "inner", left_on = ["pos", "snpID"], right_on = ["POS", "ID"])
res = res[((res["start"] >= res["pos1"]) & (res["start"] <= res["pos2"]))
| ((res["end"] >= res["pos1"]) & (res["end"] <= res["pos2"]))
| ((res["start"] <= res["pos1"]) & (res["end"] >= res["pos2"]))]
res = res.set_index([[i for i in range(res.shape[0])]])
res[["chr", "pos1", "pos2", "gene_symbol", "width", "strand"]].to_csv(output_path,
index = False, header = False, sep = "\t")
cols = ["#CHROM", "POS", "ID", "REF", "ALT", "strand", "start", "end", "width", "gene_symbol", "pos1", "pos2",
"beta", "FDR"]
return res[cols]
intron_m6AQTL = get_m6AQTL(intron_m6AQTL, output_intron_m6AQTL)
exon_m6AQTL = get_m6AQTL(exon_m6AQTL, output_exon_m6AQTL)
print (intron_m6AQTL.shape, exon_m6AQTL.shape)
exon_m6AQTL
exon_m6AQTL[["ID"]].to_csv(f"{path}/Data/exon_tmp.txt", index = False, header = True, sep = "\t")
intron_m6AQTL
list(set(exon_m6AQTL["gene_symbol"]))
from collections import Counter
Counter(exon_m6AQTL["width"])
Trial (does not work as website)
RNAsnp -f ~/Downloads/RNAsnp_datasets/dataset1/sequences.fasta -s ~/Downloads/RNAsnp_datasets/dataset1/snps.txt
RNAsnp -f ~/Downloads/RNAsnp_datasets/dataset2/sequences.txt -s ~/Downloads/RNAsnp_datasets/dataset2/snps.txt -m 2
Use "bedtools" in bash under the depository
~/Documents/m6A/Data/metApeakFisher
bedtools intersect -a ../intron_m6AQTLs.bed -b peaks.merged.bed -s > peak.merged.intron.m6AQTL.bed
bedtools intersect -a ../exon_m6AQTLs.bed -b peaks.merged.bed -s > peak.merged.exon.m6AQTL.bed
n2 = 500
width = intron_m6AQTL[intron_m6AQTL["width"] <= n2]["width"].tolist()
plt.hist(width, bins = 25)
plt.title("Histogram of width")
plt.show()
@NTCNCCACCC:K00180:212:H7VCTBBXX:3:1101:21673:1033 1:N:0:NAATTCGT+AGGCTNTA @NCGNTCAAGA:K00180:212:H7VCTBBXX:3:1101:21755:1033 1:N:0:NAATTCGT+AGGCTNTA @NCTNCCCGAG:K00180:212:H7VCTBBXX:3:1101:21795:1033 1:N:0:NAATTCGT+AGGCTNTA @NTCNTCCAAC:K00180:212:H7VCTBBXX:3:1101:22465:1033 1:N:0:NAATTCGT+AGGCTNTA
@
Exported from analysis/20180222_m6A_riboSNitch.ipynb
committed by Min Qiao on Wed May 2 19:10:46 2018 revision 13, bd67d58