import os
import subprocess
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.simplefilter('ignore')
## Overlap our alternative-promoter candidate TSS with known reference TSS sets.
## Every external tool is run with check=True so that a failing step stops the
## run instead of silently leaving an empty or stale BED file behind for the
## pd.read_csv calls that consume these outputs.
TSS_DIR = "../pacbio_cage_proactiv_TSS_files"
ANNOT_DIR = "../annotations"
SORTED_CANDIDATES = f"{TSS_DIR}/compiled_pivot_all.sorted.bed"


def _run_to_file(argv, out_path):
    """Run the command *argv* (no shell) and write its stdout to *out_path*.

    Args:
        argv: Command and arguments as a list of strings.
        out_path: File that receives the command's standard output
            (created or truncated, like a shell ``>`` redirect).

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        FileNotFoundError: If the executable cannot be found.
    """
    with open(out_path, "wb") as out_handle:
        subprocess.run(argv, stdout=out_handle, check=True)


def _closest_reference(reference_bed, out_path):
    """Run closestBed for the sorted candidates (-a) against *reference_bed* (-b).

    ``-s`` restricts matches to the same strand and ``-d`` appends the distance
    to the closest feature as the last output column.
    """
    _run_to_file(
        ["closestBed", "-s", "-d", "-b", reference_bed, "-a", SORTED_CANDIDATES],
        out_path,
    )


## Sort the candidate TSS regions.
_run_to_file(
    ["bedtools", "sort", "-i", f"{TSS_DIR}/compiled_pivot_all.bed"],
    SORTED_CANDIDATES,
)

## Merge same-strand TSS that lie within 1 bp of each other, keeping the
## distinct values of columns 4-9.
# NOTE(review): this merged file is not used by any later step visible here;
# the closestBed calls below all use the unmerged sorted file -- confirm intended.
_run_to_file(
    [
        "mergeBed", "-s",
        "-c", "4,5,6,7,8,9",
        "-o", "distinct,distinct,distinct,distinct,distinct,distinct",
        "-d", "1",
        "-i", SORTED_CANDIDATES,
    ],
    f"{TSS_DIR}/compiled_pivot_wstrand_bed_simple.merge.sorted.bed",
)

# Closest EPD (Hs_EPDnew_006, hg38) promoter for each candidate.
_closest_reference(
    f"{ANNOT_DIR}/Hs_EPDnew_006_hg38_tab.bed",
    f"{TSS_DIR}/compiled_pivot_wstrand_EPD.bed",
)

# Lift the reference TSS from Weissman from hg19 to hg38; records that cannot
# be lifted are written to ./unlifted.bed.
subprocess.run(
    [
        "liftOver",
        f"{ANNOT_DIR}/TSS_human.sorted.bed",
        f"{ANNOT_DIR}/hg19ToHg38.over.chain.gz",
        f"{ANNOT_DIR}/TSS_human_hg38.sorted.bed",
        "./unlifted.bed",
    ],
    check=True,
)

# Closest lifted-over reference TSS (Weissman) for each candidate.
_closest_reference(
    f"{ANNOT_DIR}/TSS_human_hg38.sorted.bed",
    f"{TSS_DIR}/compiled_pivot_wstrand_TSS.bed",
)

# Closest MCF7 ChromHMM promoter annotation for each candidate.
_closest_reference(
    f"{ANNOT_DIR}/MCF7_PROM_CHROHMM.bed",
    f"{TSS_DIR}/compiled_pivot_wstrand_CHROMHMM.bed",
)

# Closest refTSS v3.1 (hg38) entry for each candidate.
_closest_reference(
    f"{ANNOT_DIR}/refTSS_v3.1_human_coordinate.hg38.sorted.bed",
    f"{TSS_DIR}/compiled_pivot_wstrand_refTSS.bed",
)
0
# Columns describing one candidate TSS record. They are the leading columns of
# every closestBed output loaded below and the key the tables are joined on.
id_col = [
    'chr', 'TSSstart', 'TSSend', "TSS_ID", 'Gene_symbol',
    'strand', 'Ensembl_ID', 'MAJOR_MINOR', 'Source',
]
subid_col = ["TSS_ID", 'Ensembl_ID']

# Candidate TSS paired with the closest EPD promoter; last column is the distance.
EPDTSS = pd.read_csv(
    "../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_EPD.bed",
    sep="\t",
    header=None,
    names=id_col + [
        'EPDchr', 'EPDTSSstart', 'EPDTSSend', 'TSSname', 'colour',
        'EPDstrand', 'broadTSSstart', 'broadTSSend', 'EPD Distance',
    ],
    index_col=None,
)

# The ChromHMM overlap table is currently not loaded:
# CHROMHMMTSS=pd.read_csv("../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_CHROMHMM.bed", sep="\t",header=None, names=id_col)

# Candidate TSS paired with the closest refTSS entry; last column is the distance.
refTSSTSS = pd.read_csv(
    "../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_refTSS.bed",
    sep="\t",
    header=None,
    names=id_col + [
        'refTSSchr', 'refTSSstart', 'refTSSend', 'refTSSid', 'num',
        'refTSSstrand', 'refTSSstartbroad', 'refTSSendbroad', 'colour',
        'refTSS distance',
    ],
)

# Candidate TSS paired with the closest lifted-over reference TSS (the
# CRISPRi set); last column is the distance.
CRISPRiTSS = pd.read_csv(
    "../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_TSS.bed",
    sep="\t",
    header=None,
    names=id_col + [
        'CRISPchr', 'CRISPstart', 'CRISPend', 'CRISPid', '0',
        'CRISPstrand', 'CRISPstartbroad', 'CRISPendbroad', 'colour',
        'CRISRPi distance',
    ],
)
So, I have taken the overlaps between the candidate TSS and the reference TSS sets (hCRISPRi (FANTOM only), the Eukaryotic Promoter Database and refTSS). Interestingly, all of them use FANTOM, but they are not perfectly correlated with each other. This is due to the fact that the majority of candidate promoters are found at the same position as the ones I have found within the MCF7 cell line. However, the correlation is caused by the …
# Join the three closest-reference tables on the shared candidate-TSS columns
# (default inner join), so each row carries the EPD, refTSS and CRISPRi
# columns for one candidate.
epd_with_reftss = pd.merge(EPDTSS, refTSSTSS, on=id_col)
alldistances = pd.merge(epd_with_reftss, CRISPRiTSS, on=id_col)

# Keep only the three distance columns for the pairwise comparison.
alldistances_sub = alldistances[
    [
        'CRISRPi distance',
        'refTSS distance',
        'EPD Distance',
    ]
]
alldistances_sub
def corrdot(*args, **kwargs):
    """Draw the Pearson correlation of two variables as a sized, coloured dot.

    Intended as a ``PairGrid`` mapping function: it draws on the current axes
    (``plt.gca()``), hides the axis, places one dot at the centre of the axes
    and prints the correlation coefficient on top of it.

    Args:
        *args: ``args[0]`` and ``args[1]`` are the two variables to compare.
            ``args[0]`` must provide ``.corr(other, method)`` (e.g. a pandas
            Series).
        **kwargs: Accepted so the plotting grid can pass extra keyword
            arguments; they are ignored.
    """
    corr_r = args[0].corr(args[1], 'pearson')
    # Two decimals without the leading zero, e.g. 0.53 -> ".53", -0.53 -> "-.53".
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()
    # Dot area and label size both grow with the absolute correlation.
    marker_size = abs(corr_r) * 10000
    # The dot colour encodes the signed correlation on a fixed [-1, 1] scale.
    ax.scatter([.5], [.5], marker_size, [corr_r], alpha=0.6, cmap="coolwarm",
               vmin=-1, vmax=1, transform=ax.transAxes)
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(corr_text, [.5, .5,], xycoords="axes fraction",
                ha='center', va='center', fontsize=font_size)


# Pairwise comparison of the three distance columns:
#   lower triangle - scatter with a LOWESS fit,
#   diagonal       - distribution of each distance,
#   upper triangle - Pearson correlation drawn by corrdot.
sns.set(style='white', font_scale=1.0)
g = sns.PairGrid(alldistances_sub, aspect=1.4, diag_sharey=False)
g.map_lower(sns.regplot, lowess=True, ci=False, line_kws={'color': 'black'})
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here so the figure is unchanged -- consider histplot/kdeplot when upgrading.
g.map_diag(sns.distplot, kde_kws={'color': 'black'})
g.map_upper(corrdot)