M6A

Determine whether a SNP changes the RNA secondary structure (RSS) around a nearby m6A motif (GGACU)

Step 1: Obtain the mutation ID, ref and alt alleles from dbSNP

Identify each mutation by its chromosome and position in the genotype data, and get the corresponding snpID from dbSNP.

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
# Project root. Kept WITHOUT a trailing slash so that the f-strings below do
# not produce doubled separators such as ".../m6A//Data/...".
path = os.path.expanduser("~/Documents/m6A")

# m6AQTL input tables and the BED files derived from them.
# NOTE: later cells rebind `exon_m6AQTL` / `intron_m6AQTL` from these path
# strings to DataFrames, so re-run this cell before re-running the load cell.
exon_m6AQTL = f"{path}/Data/exon_m6AQTLs.txt"
output_exon_m6AQTL = f"{path}/Data/exon_m6AQTLs.bed"
intron_m6AQTL = f"{path}/Data/intron_m6AQTLs.txt"
output_intron_m6AQTL = f"{path}/Data/intron_m6AQTLs.bed"

# Reduced (first five columns) dbSNP table, gzip-compressed and tab-separated.
output_dbSNP = f"{path}/Data/dbSNP/dbSNP.vcf.gz"

Import dbSNP and export its first five columns; this only needs to be done once.

In [2]:
# One-off preprocessing, kept commented out because it only needs to run once:
# read the first five columns of the downloaded dbSNP VCF and cache them as a
# smaller gzip-compressed, tab-separated file at `output_dbSNP`.
# NOTE(review): `header = 56` presumably points at the "#CHROM ..." column line
# that follows the "##" meta lines of this particular release — confirm before
# reusing with a different dbSNP file.
# input_dbSNP = f"{path}/Data/dbSNP/common_all_20170710.vcf.gz"
# dbSNP = pd.read_table(input_dbSNP, header = 56, compression = "gzip", usecols = [0,1,2,3,4], low_memory = False)
# dbSNP.to_csv(output_dbSNP, compression = "gzip", sep = "\t", index = False)

Handle exon and intron m6AQTL data

  1. sort by chromosome and pos
  2. add a window of ±200 around each pos (pos1 = pos - 200, pos2 = pos + 200)
In [3]:
def load_m6AQTL(data_path, n = 200, n_width = 1000):
    """Load an m6AQTL table and derive peak / SNP-window columns.

    The table is read as tab-separated text with a header row, sorted by
    ``chr`` and ``pos`` and re-indexed 0..N-1.  The ``peakID`` column is
    expected to look like ``<gene_symbol>_<start>_<end>_<strand>`` and is split
    from the RIGHT, so a gene symbol that itself contains ``_`` is parsed
    correctly instead of breaking the integer conversion of ``start``/``end``.

    Parameters
    ----------
    data_path : str or file-like
        Anything accepted by ``pandas.read_table``; must provide the columns
        ``chr``, ``pos`` and ``peakID``.
    n : int, default 200
        Half-width of the window placed around each SNP position
        (``pos1 = pos - n``, ``pos2 = pos + n``).
    n_width : int, default 1000
        Peaks wider than this (``end - start + 1``) are dropped.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``strand``, ``gene_symbol``, ``start``, ``end``,
        ``width``, ``pos1`` and ``pos2``, restricted to ``width <= n_width``.
        The index keeps the post-sort row numbers, so it may contain gaps
        where wide peaks were removed.

    Raises
    ------
    ValueError or TypeError
        If the ``start``/``end`` fields of a ``peakID`` cannot be converted to
        integers (for example when the ID has fewer than four fields).
    """
    data = pd.read_table(data_path, header = 0)
    data = data.sort_values(by = ["chr", "pos"]).reset_index(drop = True)

    # Split from the right: the last three fields are start, end and strand,
    # everything before them is the gene symbol.  `reindex` guarantees that
    # all four columns exist even for an empty table.
    fields = data["peakID"].str.rsplit("_", n = 3, expand = True).reindex(columns = range(4))
    data["strand"] = fields[3]
    data["gene_symbol"] = fields[0]
    data["start"] = fields[1].astype("int64")
    data["end"] = fields[2].astype("int64")
    data["width"] = data["end"] - data["start"] + 1

    # Window of +/- n around the SNP position.
    data["pos1"] = data["pos"] - n
    data["pos2"] = data["pos"] + n

    # Optional diagnostic columns, currently unused:
    #     data["gap"] = data["start"] - data["pos"]
    #     data["in"] = data["pos"].between(data["start"], data["end"]).astype(int)

    # only keep those width <= n_width
    return data[data["width"] <= n_width]
In [4]:
# Parse both m6AQTL tables with the default window and width settings.
# NOTE: from here on both names hold DataFrames instead of file paths.
exon_m6AQTL = load_m6AQTL(exon_m6AQTL)
intron_m6AQTL = load_m6AQTL(intron_m6AQTL)
In [5]:
print (intron_m6AQTL.shape, exon_m6AQTL.shape)
(9312, 16) (2254, 16)
In [6]:
exon_m6AQTL[["chr", "pos", "peakID", "snpID", "start", "end", "width", "pos1", "pos2", "gene_symbol", "strand"]].head(5)
Out[6]:
chr pos peakID snpID start end width pos1 pos2 gene_symbol strand
1 chr1 943468 ISG15_949466_949516_+ rs3121567 949466 949516 51 943268 943668 ISG15 +
2 chr1 947538 ISG15_949466_949516_+ rs2465125 949466 949516 51 947338 947738 ISG15 +
3 chr1 948421 ISG15_949466_949516_+ rs113047134 949466 949516 51 948221 948621 ISG15 +
4 chr1 948846 ISG15_949466_949516_+ rs3841266 949466 949516 51 948646 949046 ISG15 +
5 chr1 948921 ISG15_949466_949516_+ rs15842 949466 949516 51 948721 949121 ISG15 +
In [7]:
intron_m6AQTL[["chr", "pos", "peakID", "snpID", "start", "end", "width", "pos1", "pos2", "gene_symbol", "strand"]].head(5)
Out[7]:
chr pos peakID snpID start end width pos1 pos2 gene_symbol strand
0 chr1 899937 AGRN_981788_981988_+ rs143296006 981788 981988 201 899737 900137 AGRN +
1 chr1 899938 AGRN_981788_981988_+ rs147467971 981788 981988 201 899738 900138 AGRN +
2 chr1 899942 AGRN_981788_981988_+ rs71509448 981788 981988 201 899742 900142 AGRN +
3 chr1 943468 ISG15_949466_949516_+ rs3121567 949466 949516 51 943268 943668 ISG15 +
4 chr1 945612 ISG15_949466_949516_+ rs3121565 949466 949516 51 945412 945812 ISG15 +
In [8]:
# n1 = 1000
# # gap = intron_m6AQTL["gap"].tolist()
# intron_gap = intron_m6AQTL[(intron_m6AQTL["gap"] <= n1) & (intron_m6AQTL["gap"] >= -n1)]["gap"].tolist()
# plt.hist(intron_gap, bins = 30)
# plt.title("Histogram of gap in introns (the distance between SNP pos and start of peak)")
# plt.show()
In [9]:
# exon_gap = exon_m6AQTL[(exon_m6AQTL["gap"] <= n1) & (exon_m6AQTL["gap"] >= -n1)]["gap"].tolist()
# plt.hist(exon_gap, bins = 30)
# plt.title("Histogram of gap in exons (the distance between SNP pos and start of peak)")
# plt.show()
In [10]:
# Load the cached five-column dbSNP table (tab-separated, gzip-compressed)
# and preview its first rows.
dbSNP = pd.read_table(
    output_dbSNP,
    sep = "\t",
    header = 0,
    compression = "gzip",
    low_memory = False,
)
dbSNP.head()
Out[10]:
#CHROM POS ID REF ALT
0 1 10177 rs367896724 A AC
1 1 10352 rs555500075 T TA
2 1 10352 rs145072688 T TA
3 1 10616 rs376342519 CCGCCGTTGCAAAGGCGCGCCG C
4 1 10642 rs558604819 G A
In [11]:
def get_m6AQTL(data, output_path, dbsnp = None):
    """Annotate m6AQTL SNPs with dbSNP alleles and keep those near their peak.

    Rows of ``data`` are inner-joined to the dbSNP table on
    (``pos``, ``snpID``) == (``POS``, ``ID``).  A row is kept when the peak
    ``[start, end]`` and the SNP window ``[pos1, pos2]`` touch in one of three
    ways: the peak start lies inside the window, the peak end lies inside the
    window, or the peak spans the whole window.  The surviving windows are
    written to ``output_path`` as a headerless, tab-separated file with the
    columns chr, pos1, pos2, gene_symbol, width, strand.

    Parameters
    ----------
    data : pandas.DataFrame
        Table providing ``chr``, ``pos``, ``snpID``, ``start``, ``end``,
        ``width``, ``pos1``, ``pos2``, ``gene_symbol``, ``strand``, ``beta``
        and ``FDR``.
    output_path : str
        Destination of the BED-like window file.
    dbsnp : pandas.DataFrame, optional
        Table with the columns ``#CHROM``, ``POS``, ``ID``, ``REF`` and
        ``ALT``.  Defaults to the notebook-level ``dbSNP`` frame, which keeps
        existing two-argument calls working while making the dependency
        explicit.

    Returns
    -------
    pandas.DataFrame
        The kept rows, indexed 0..N-1, with dbSNP and peak columns.
    """
    if dbsnp is None:
        # Backward-compatible default: the table loaded at notebook level.
        dbsnp = dbSNP

    # NOTE(review): the join does not compare chromosomes; it relies on the
    # (position, rsID) pair being sufficient — confirm for other data sets.
    res = pd.merge(data, dbsnp, how = "inner", left_on = ["pos", "snpID"], right_on = ["POS", "ID"])

    start_in_window = res["start"].between(res["pos1"], res["pos2"])
    end_in_window = res["end"].between(res["pos1"], res["pos2"])
    peak_spans_window = (res["start"] <= res["pos1"]) & (res["end"] >= res["pos2"])
    res = res[start_in_window | end_in_window | peak_spans_window].reset_index(drop = True)

    bed_cols = ["chr", "pos1", "pos2", "gene_symbol", "width", "strand"]
    res[bed_cols].to_csv(output_path, index = False, header = False, sep = "\t")

    cols = ["#CHROM", "POS", "ID", "REF", "ALT", "strand", "start", "end", "width", "gene_symbol", "pos1", "pos2",
            "beta", "FDR"]
    return res[cols]
In [12]:
# Join each table to dbSNP, keep SNPs whose window touches their peak, and
# write the corresponding BED files as a side effect.
exon_m6AQTL = get_m6AQTL(exon_m6AQTL, output_exon_m6AQTL)
intron_m6AQTL = get_m6AQTL(intron_m6AQTL, output_intron_m6AQTL)
In [13]:
print (intron_m6AQTL.shape, exon_m6AQTL.shape)
(188, 14) (378, 14)
In [14]:
exon_m6AQTL
Out[14]:
#CHROM POS ID REF ALT strand start end width gene_symbol pos1 pos2 beta FDR
0 1 15756642 rs116128102 G A + 15756567 15756765 199 EFHD2 15756442 15756842 -0.456230 3.193890e-02
1 1 28212975 rs6564 C T + 28212824 28213122 299 THEMIS2 28212775 28213175 -0.163325 5.168226e-02
2 1 28213157 rs6565 T C + 28212824 28213122 299 THEMIS2 28212957 28213357 -0.163325 5.168226e-02
3 1 46810670 rs17361763 T C + 46810516 46810766 251 NSUN4 46810470 46810870 0.819328 2.263735e-07
4 1 46810842 rs6684274 C T + 46810516 46810766 251 NSUN4 46810642 46811042 0.819328 2.263735e-07
5 1 46830257 rs6683192 C T + 46830199 46830349 151 NSUN4 46830057 46830457 -0.696185 1.186266e-03
6 1 46830430 rs10252 G A + 46830199 46830349 151 NSUN4 46830230 46830630 -0.368553 8.971765e-02
7 1 46830447 rs12062 A C + 46830199 46830349 151 NSUN4 46830247 46830647 -0.696185 1.186266e-03
8 1 55316322 rs7374 A G - 55316022 55316670 649 DHCR24 55316122 55316522 0.200277 7.071380e-04
9 1 145441620 rs7211 T A,C + 145441816 145441866 51 TXNIP 145441420 145441820 -0.350243 7.529982e-02
10 1 145441840 rs3832003 TA T + 145441816 145441866 51 TXNIP 145441640 145442040 -0.349128 9.328973e-02
11 1 155289545 rs11264361 T A,G + 155289703 155290382 680 FDPS 155289345 155289745 0.327422 7.226476e-06
12 1 167674287 rs532648085 CAGCTAATTT C + 167674265 167674464 200 RCSD1 167674087 167674487 0.893350 2.076683e-03
13 1 167674303 rs4656551 T C,G + 167674265 167674464 200 RCSD1 167674103 167674503 0.964278 1.331959e-04
14 1 167674306 rs552388500 CTAAT C + 167674265 167674464 200 RCSD1 167674106 167674506 0.964278 1.331959e-04
15 1 167674311 rs146056978 T C + 167674265 167674464 200 RCSD1 167674111 167674511 0.964278 1.331959e-04
16 1 167674312 rs145024614 T G + 167674265 167674464 200 RCSD1 167674112 167674512 0.964278 1.331959e-04
17 1 167674653 rs2297777 C G + 167674265 167674464 200 RCSD1 167674453 167674853 0.908348 1.071866e-02
18 1 202861463 rs201067798 TCTG T - 202861544 202861693 150 KLHL12 202861263 202861663 0.591088 6.755276e-03
19 1 202861566 rs1042990 A G - 202861544 202861693 150 KLHL12 202861366 202861766 0.591088 6.755276e-03
20 1 209952865 rs623360 C G + 209952704 209952952 249 TRAF3IP3 209952665 209953065 -0.326555 3.201475e-02
21 1 214820299 rs3748698 A G + 214820311 214820711 401 CENPF 214820099 214820499 -0.334450 9.370009e-04
22 1 214820494 rs3790648 G A + 214820311 214820711 401 CENPF 214820294 214820694 -0.334450 9.370009e-04
23 1 214820524 rs3790649 T C + 214820311 214820711 401 CENPF 214820324 214820724 -0.334450 9.370009e-04
24 10 3178865 rs543 G A + 3178015 3178923 909 PFKP 3178665 3179065 -0.268602 9.222012e-02
25 10 5781628 rs2254067 T G + 5781827 5782326 500 FAM208B 5781428 5781828 0.352813 2.296571e-02
26 10 5781969 rs2797486 A T + 5781827 5782326 500 FAM208B 5781769 5782169 0.352813 2.296571e-02
27 10 72059506 rs10762360 G C - 72059495 72059694 200 LRRC20 72059306 72059706 0.285578 2.778509e-02
28 10 91099593 rs34407818 A G + 91099654 91100202 549 IFIT3 91099393 91099793 0.250755 5.216162e-02
29 10 91100068 rs10887948 C A + 91099654 91100202 549 IFIT3 91099868 91100268 0.225057 1.910779e-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
348 7 64437832 rs7810102 A T - 64437039 64437734 696 ZNF117 64437632 64438032 0.619720 2.110247e-05
349 7 76255836 rs73705109 T C + 76255434 76256081 648 LOC100133091 76255636 76256036 0.715343 5.477747e-02
350 7 76256578 rs562560419 C T + 76256429 76256479 51 LOC100133091 76256378 76256778 0.613969 4.562503e-02
351 7 76256674 rs376774216 C A + 76256429 76256479 51 LOC100133091 76256474 76256874 0.650592 4.800594e-02
352 7 92760738 rs10282508 T G - 92760584 92760833 250 SAMD9L 92760538 92760938 0.334275 3.837576e-03
353 7 128589427 rs10954213 G A + 128588632 128589426 795 IRF5 128589227 128589627 -0.182481 2.283409e-02
354 7 128589488 rs11770589 G A + 128588632 128589426 795 IRF5 128589288 128589688 -0.164631 7.839162e-02
355 7 133749164 rs6953296 G A + 133749090 133749240 151 EXOC4 133748964 133749364 -0.454762 4.230318e-03
356 7 134850588 rs111327468 T C + 134849817 134850650 834 TMEM140 134850388 134850788 -0.426385 2.250657e-03
357 7 150070630 rs1051760 A G + 150070714 150070961 248 REPIN1 150070430 150070830 0.316757 3.361276e-02
358 7 150070663 rs1051764 T C + 150070714 150070961 248 REPIN1 150070463 150070863 0.349593 6.318251e-05
359 7 150070814 rs1051823 G A + 150070714 150070961 248 REPIN1 150070614 150071014 0.349593 6.318251e-05
360 7 150094853 rs7780011 G A + 150094755 150094805 51 ZNF775 150094653 150095053 0.402474 1.096456e-02
361 7 150095271 rs7393 T C + 150095101 150095596 496 ZNF775 150095071 150095471 0.205026 7.170437e-02
362 8 22477014 rs7825134 C G + 22476836 22477415 580 CCAR2 22476814 22477214 0.309681 3.667858e-02
363 8 22477131 rs10551411 ACAT A + 22476836 22477415 580 CCAR2 22476931 22477331 0.284725 2.203442e-02
364 8 22477242 rs7846186 G A,C + 22476836 22477415 580 CCAR2 22477042 22477442 0.315464 1.975586e-02
365 8 23290304 rs60497420 C T - 23290337 23290437 101 ENTPD4 23290104 23290504 0.469506 4.727837e-02
366 8 23429536 rs8346 G A + 23429449 23429647 199 SLC25A37 23429336 23429736 0.457361 6.271886e-05
367 8 70745171 rs3750268 C A - 70744722 70745465 744 SLCO5A1 70744971 70745371 0.270035 2.075779e-02
368 8 144103435 rs4464948 C T + 144103263 144103607 345 LY6E 144103235 144103635 0.288879 6.539704e-08
369 8 144687721 rs896961 C T - 144687545 144688270 726 PYCRL 144687521 144687921 0.370943 6.936429e-03
370 9 35088822 rs79276933 G C - 35088759 35089156 398 PIGO 35088622 35089022 0.378849 7.147639e-02
371 9 91992573 rs9445 A C,T - 91992209 91992809 601 SEMA4D 91992373 91992773 -0.154351 7.142517e-02
372 9 134406650 rs7041225 C T - 134405962 134406490 529 UCK1 134406450 134406850 0.349496 3.781191e-02
373 9 136271571 rs7033317 T C - 136271755 136272054 300 REXO4 136271371 136271771 0.472724 5.219326e-07
374 9 136271744 rs2285481 A G - 136271755 136272054 300 REXO4 136271544 136271944 0.490878 4.315411e-08
375 9 136271792 rs2285482 G T - 136271755 136272054 300 REXO4 136271592 136271992 0.490878 4.315411e-08
376 9 136271838 rs2285483 G A - 136271755 136272054 300 REXO4 136271638 136272038 0.490878 4.315411e-08
377 9 136271841 rs2285484 C A,G - 136271755 136272054 300 REXO4 136271641 136272041 0.490878 4.315411e-08

378 rows × 14 columns

In [15]:
exon_m6AQTL[["ID"]].to_csv(f"{path}/Data/exon_tmp.txt", index = False, header = True, sep = "\t")
In [15]:
intron_m6AQTL
Out[15]:
#CHROM POS ID REF ALT strand start end width gene_symbol pos1 pos2 beta FDR
0 1 46810670 rs17361763 T C + 46810516 46810766 251 NSUN4 46810470 46810870 0.819328 2.263735e-07
1 1 46810842 rs6684274 C T + 46810516 46810766 251 NSUN4 46810642 46811042 0.819328 2.263735e-07
2 1 155226233 rs115599747 T C - 155225843 155226040 198 SCAMP3 155226033 155226433 0.459673 3.113221e-02
3 1 155289545 rs11264361 T A,G + 155289703 155290382 680 FDPS 155289345 155289745 0.327422 7.226476e-06
4 1 209952865 rs623360 C G + 209952704 209952952 249 TRAF3IP3 209952665 209953065 -0.326555 3.201475e-02
5 10 3178865 rs543 G A + 3178015 3178923 909 PFKP 3178665 3179065 -0.268602 9.222012e-02
6 10 5781628 rs2254067 T G + 5781827 5782326 500 FAM208B 5781428 5781828 0.352813 2.296571e-02
7 10 5781969 rs2797486 A T + 5781827 5782326 500 FAM208B 5781769 5782169 0.352813 2.296571e-02
8 10 91099593 rs34407818 A G + 91099654 91100202 549 IFIT3 91099393 91099793 0.250755 5.216162e-02
9 10 91100068 rs10887948 C A + 91099654 91100202 549 IFIT3 91099868 91100268 0.225057 1.910779e-03
10 10 91100258 rs1141862 T A + 91099654 91100202 549 IFIT3 91100058 91100458 0.205349 4.617628e-02
11 10 100189138 rs1061135 A G - 100189014 100189360 347 HPS1 100188938 100189338 0.483274 1.374589e-04
12 10 100189173 rs74154473 G A - 100189014 100189360 347 HPS1 100188973 100189373 -0.529313 9.803664e-02
13 10 114187827 rs11195956 T A + 114187814 114188063 250 ACSL5 114187627 114188027 0.511994 3.481202e-03
14 10 114188086 rs3184349 T C + 114187814 114188063 250 ACSL5 114187886 114188286 0.511994 3.481202e-03
15 10 129907741 rs2065718 G A - 129907474 129907574 101 MKI67 129907541 129907941 -0.195491 6.465461e-03
16 11 838419 rs1130678 A G,T + 838411 838610 200 CD151 838219 838619 0.330977 6.649862e-02
17 11 838424 rs1130680 C T + 838411 838610 200 CD151 838224 838624 0.332152 6.220541e-02
18 11 838542 rs1130698 C T + 838411 838610 200 CD151 838342 838742 0.340578 5.695003e-02
19 11 838634 rs8672 C G + 838411 838610 200 CD151 838434 838834 0.340091 8.224138e-02
20 11 6504061 rs9802 C T + 6504042 6504441 400 TIMM10B 6503861 6504261 -0.432604 1.423346e-03
21 11 6504196 rs10136 T G + 6504042 6504441 400 TIMM10B 6503996 6504396 -0.432604 1.423346e-03
22 11 6504444 rs73400862 T C + 6504042 6504441 400 TIMM10B 6504244 6504644 -0.434632 3.234969e-03
23 11 6504450 rs74055813 G C + 6504042 6504441 400 TIMM10B 6504250 6504650 -0.434632 3.234969e-03
24 11 6504587 rs73400864 C T + 6504042 6504441 400 TIMM10B 6504387 6504787 -0.434632 3.234969e-03
25 11 70230030 rs57810426 G A + 70229887 70229986 100 PPFIA1 70229830 70230230 -0.399371 7.311346e-03
26 11 71145687 rs7690 T A,C,G - 71145580 71145975 396 DHCR7 71145487 71145887 0.166936 5.264681e-02
27 12 56325006 rs772701 C G + 56324970 56325851 882 DGKA 56324806 56325206 -0.457031 5.405544e-03
28 12 56325208 rs1681088 A G + 56324970 56325851 882 DGKA 56325008 56325408 -0.443492 1.107442e-02
29 12 56325211 rs1701702 A G + 56324970 56325851 882 DGKA 56325011 56325411 -0.457031 5.405544e-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
158 6 32412004 rs539718048 T C + 32411578 32412457 880 HLA-DRA 32411804 32412204 -0.228416 9.678498e-02
159 6 32412005 rs115317719 G T + 32411578 32412457 880 HLA-DRA 32411805 32412205 -0.228416 9.678498e-02
160 6 32412279 rs9281809 G GAACTAACT + 32411578 32412457 880 HLA-DRA 32412079 32412479 0.220793 3.496738e-07
161 6 32412398 rs4935354 C T + 32411578 32412457 880 HLA-DRA 32412198 32412598 0.223552 8.709481e-08
162 6 32546953 rs9269697 G A - 32546814 32546864 51 HLA-DRB1 32546753 32547153 0.907284 1.232565e-03
163 6 32547053 rs9269701 A G - 32546814 32546864 51 HLA-DRB1 32546853 32547253 0.946333 1.232565e-03
164 6 33036617 rs34370305 C T - 33036550 33036863 314 HLA-DPA1 33036417 33036817 -0.298090 4.949827e-02
165 6 33036630 rs73739662 G T - 33036550 33036863 314 HLA-DPA1 33036430 33036830 -0.298090 4.949827e-02
166 6 33036631 rs73739663 A C - 33036550 33036863 314 HLA-DPA1 33036431 33036831 -0.298090 4.949827e-02
167 6 33036708 rs77350016 C T - 33036550 33036863 314 HLA-DPA1 33036508 33036908 -0.298090 4.949827e-02
168 6 33036709 rs77981473 A G - 33036550 33036863 314 HLA-DPA1 33036509 33036909 -0.298090 4.949827e-02
169 6 57034781 rs16888358 C T + 57034674 57034774 101 ZNF451 57034581 57034981 -0.632285 6.058763e-02
170 7 1005018 rs10282027 G A - 1004960 1005309 350 COX19 1004818 1005218 0.815706 4.470871e-04
171 7 64437820 rs7790578 G C - 64437039 64437734 696 ZNF117 64437620 64438020 0.619720 2.110247e-05
172 7 64437832 rs7810102 A T - 64437039 64437734 696 ZNF117 64437632 64438032 0.619720 2.110247e-05
173 7 64439938 rs1852016 C T - 64439471 64439769 299 ZNF117 64439738 64440138 0.558413 3.193890e-02
174 7 76255836 rs73705109 T C + 76255434 76256081 648 LOC100133091 76255636 76256036 0.715343 5.477747e-02
175 7 134850588 rs111327468 T C + 134849817 134850650 834 TMEM140 134850388 134850788 -0.426385 2.250657e-03
176 7 150070630 rs1051760 A G + 150070714 150070961 248 REPIN1 150070430 150070830 0.316757 3.361276e-02
177 7 150070663 rs1051764 T C + 150070714 150070961 248 REPIN1 150070463 150070863 0.349593 6.318251e-05
178 7 150070814 rs1051823 G A + 150070714 150070961 248 REPIN1 150070614 150071014 0.349593 6.318251e-05
179 7 150071156 rs2021871 T C + 150070714 150070961 248 REPIN1 150070956 150071356 0.349593 6.318251e-05
180 7 150094853 rs7780011 G A + 150094755 150094805 51 ZNF775 150094653 150095053 0.402474 1.096456e-02
181 7 150095271 rs7393 T C + 150095101 150095596 496 ZNF775 150095071 150095471 0.205026 7.170437e-02
182 8 22477014 rs7825134 C G + 22476836 22477415 580 CCAR2 22476814 22477214 0.309681 3.667858e-02
183 8 22477131 rs10551411 ACAT A + 22476836 22477415 580 CCAR2 22476931 22477331 0.284725 2.203442e-02
184 8 23290304 rs60497420 C T - 23290337 23290437 101 ENTPD4 23290104 23290504 0.469506 4.727837e-02
185 8 144103435 rs4464948 C T + 144103263 144103607 345 LY6E 144103235 144103635 0.288879 6.539704e-08
186 9 91992573 rs9445 A C,T - 91992209 91992809 601 SEMA4D 91992373 91992773 -0.154351 7.142517e-02
187 9 134406131 rs3904960 G T - 134405962 134406490 529 UCK1 134405931 134406331 0.353740 8.690303e-02

188 rows × 14 columns

In [16]:
list(set(exon_m6AQTL["gene_symbol"]))
Out[16]:
['DCTD',
 'NSUN4',
 'ZSCAN16',
 'ZNF775',
 'TIMELESS',
 'FNIP2',
 'RAB15',
 'MED24',
 'REPS1',
 'MS4A1',
 'SSH1',
 'PPFIA1',
 'TRIP10',
 'TOMM22',
 'CSNK1G2',
 'ENTPD4',
 'HARBI1',
 'ZC3H7B',
 'HLA-C',
 'PIP4K2B',
 'DGKD',
 'MTG2',
 'C7orf50',
 'TRIM44',
 'RNF31',
 'ANKFY1',
 'ACLY',
 'RBM47',
 'EXOC4',
 'PFKP',
 'ACSL5',
 'DHCR24',
 'BCL9L',
 'USP40',
 'EMC3-AS1',
 'HLA-DOA',
 'SCD5',
 'PIGO',
 'SLC19A1',
 'ELAC2',
 'LY6E',
 'CENPF',
 'PPIL3',
 'DAP',
 'LMAN2',
 'ZZEF1',
 'IFIT3',
 'SAMD9L',
 'PGAP2',
 'C2CD2L',
 'SOGA1',
 'KCNJ12',
 'LOC642852',
 'TBX21',
 'MED29',
 'FAM168B',
 'MCFD2',
 'THEMIS2',
 'HSD17B4',
 'SNAI3-AS1',
 'FDPS',
 'LOC100133091',
 'TMEM55B',
 'RAI1',
 'GSTT1',
 'EZH1',
 'NARF',
 'MYO15B',
 'MYO19',
 'PHLDB3',
 'REPIN1',
 'EXOSC9',
 'CD81',
 'WBP1L',
 'EFHD2',
 'LMNB2',
 'SLC25A23',
 'SFXN5',
 'PRKXP1',
 'HLA-A',
 'PYCRL',
 'FAM188B',
 'TIMM10B',
 'HPS5',
 'ATP6V0D1',
 'CLASP1',
 'REXO4',
 'RTP4',
 'SDR39U1',
 'TANC1',
 'KLHL36',
 'RNASEK',
 'HLA-DRB1',
 'AKIRIN2',
 'ZNF451',
 'TGOLN2',
 'MX1',
 'IP6K2',
 'HLA-DPA1',
 'UCK1',
 'HPS1',
 'MRPS24',
 'RDH11',
 'MTA1',
 'ENOSF1',
 'HGS',
 'RRP1B',
 'PCNT',
 'TAPBP',
 'SNRNP25',
 'SLCO5A1',
 'FAM208B',
 'DHCR7',
 'UBR1',
 'RCSD1',
 'MADD',
 'LYSMD4',
 'DCAF4',
 'ZNF213',
 'TMEM140',
 'MEF2A',
 'CCAR2',
 'RNASEK-C17orf49',
 'KLHL29',
 'GOLGB1',
 'IRF5',
 'MTMR4',
 'TMEM128',
 'ZNF117',
 'AP5Z1',
 'KHNYN',
 'PTPN23',
 'MKI67',
 'UBE2O',
 'PTBP1',
 'ERAP2',
 'SPIB',
 'TRAF3IP3',
 'LRRC20',
 'WDR11',
 'ZNF544',
 'SCIMP',
 'TLR10',
 'OAS3',
 'REC8',
 'DNAJB1',
 'NOD1',
 'CD151',
 'ZNF701',
 'HLA-DRA',
 'POLRMT',
 'ZBTB38',
 'GOSR2',
 'NUP62',
 'SLC25A37',
 'FAM111A',
 'KANK2',
 'LETMD1',
 'MDGA1',
 'PARM1',
 'TXNIP',
 'DIP2A',
 'KLHL12',
 'EIF2B2',
 'USP19',
 'ZNF266',
 'TUG1',
 'TGFBRAP1',
 'SNRNP200',
 'SEMA4D',
 'TNIP2',
 'HCLS1',
 'EBI3',
 'DGKA',
 'ZNF268',
 'TCL1A',
 'C2orf44',
 'APOPT1',
 'DDTL',
 'C15orf39',
 'CD200',
 'C22orf34',
 'MED16',
 'TNRC18',
 'EXOG',
 'ZNF558']
In [17]:
from collections import Counter
Counter(exon_m6AQTL["width"])
Out[17]:
Counter({51: 32,
         100: 9,
         101: 24,
         140: 1,
         147: 3,
         149: 4,
         150: 9,
         151: 20,
         171: 1,
         198: 1,
         199: 4,
         200: 37,
         201: 5,
         248: 3,
         249: 10,
         250: 13,
         251: 5,
         263: 1,
         291: 1,
         296: 1,
         297: 3,
         298: 1,
         299: 5,
         300: 13,
         301: 1,
         314: 11,
         345: 1,
         347: 8,
         348: 7,
         350: 1,
         351: 1,
         377: 1,
         379: 10,
         387: 1,
         396: 1,
         397: 1,
         398: 2,
         399: 1,
         400: 7,
         401: 3,
         406: 2,
         445: 1,
         447: 1,
         448: 1,
         449: 6,
         450: 3,
         486: 1,
         490: 1,
         495: 1,
         496: 2,
         497: 7,
         499: 7,
         500: 6,
         529: 1,
         531: 1,
         533: 3,
         545: 2,
         549: 5,
         580: 3,
         598: 5,
         599: 3,
         601: 1,
         648: 2,
         649: 1,
         680: 1,
         689: 1,
         695: 3,
         696: 4,
         697: 1,
         700: 3,
         719: 1,
         726: 1,
         738: 4,
         743: 1,
         744: 2,
         745: 2,
         747: 2,
         748: 1,
         749: 1,
         795: 2,
         800: 2,
         834: 1,
         843: 3,
         880: 6,
         882: 1,
         896: 1,
         909: 1,
         911: 4})

Step 2: Use RNAsnp to obtain sequence

RNAsnp

Trial (the command-line tool does not behave the same way as the website)

RNAsnp -f ~/Downloads/RNAsnp_datasets/dataset1/sequences.fasta -s ~/Downloads/RNAsnp_datasets/dataset1/snps.txt
RNAsnp -f ~/Downloads/RNAsnp_datasets/dataset2/sequences.txt -s ~/Downloads/RNAsnp_datasets/dataset2/snps.txt -m 2

Intersection of m6A peaks and m6AQTL windows

Run "bedtools" in bash from the directory ~/Documents/m6A/Data/metApeakFisher

bedtools intersect -a ../intron_m6AQTLs.bed -b peaks.merged.bed -s > peak.merged.intron.m6AQTL.bed
bedtools intersect -a ../exon_m6AQTLs.bed -b peaks.merged.bed -s > peak.merged.exon.m6AQTL.bed
In [ ]:
# Histogram of intron peak widths, restricted to widths of at most n2.
n2 = 500
width = intron_m6AQTL.loc[intron_m6AQTL["width"] <= n2, "width"].tolist()
plt.hist(width, bins = 25)
plt.title("Histogram of width")
plt.show()

Step 3:

@NTCNCCACCC:K00180:212:H7VCTBBXX:3:1101:21673:1033 1:N:0:NAATTCGT+AGGCTNTA @NCGNTCAAGA:K00180:212:H7VCTBBXX:3:1101:21755:1033 1:N:0:NAATTCGT+AGGCTNTA @NCTNCCCGAG:K00180:212:H7VCTBBXX:3:1101:21795:1033 1:N:0:NAATTCGT+AGGCTNTA @NTCNTCCAAC:K00180:212:H7VCTBBXX:3:1101:22465:1033 1:N:0:NAATTCGT+AGGCTNTA

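`@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>`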


© 2018 Min Qiao at He Lab, University of Chicago

Exported from analysis/20180222_m6A_riboSNitch.ipynb committed by Min Qiao on Wed May 2 19:10:46 2018 revision 13, bd67d58