import os
import subprocess

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')


## For the alternative promoter candidates: overlap the known TSS references
## with our candidate TSS regions using external command-line tools.
_TSS_DIR = "../pacbio_cage_proactiv_TSS_files"
_ANNOTATION_DIR = "../annotations"
_SORTED_CANDIDATES = f"{_TSS_DIR}/compiled_pivot_all.sorted.bed"


def _run_tool(command, stdout_path=None):
    """Run an external command and fail loudly if it does not succeed.

    Args:
        command: Program name followed by its arguments, as a list of strings.
        stdout_path: Optional file that receives the command's standard
            output (the equivalent of a shell ``> file`` redirection).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. Previously the exit status was ignored, so a failed tool
            left an empty or stale output file for the later parsing steps.
        FileNotFoundError: If the program is not installed / not on PATH.
    """
    if stdout_path is None:
        subprocess.run(command, check=True)
        return
    with open(stdout_path, "w") as out_handle:
        subprocess.run(command, stdout=out_handle, check=True)


def _closest_to_reference(reference_bed, output_name):
    """Write, for every sorted candidate TSS, its closest reference feature.

    Runs ``closestBed -s -d -b <reference_bed> -a <sorted candidates>`` and
    stores the result as ``<_TSS_DIR>/<output_name>``.
    """
    _run_tool(
        ["closestBed", "-s", "-d", "-b", reference_bed, "-a", _SORTED_CANDIDATES],
        stdout_path=f"{_TSS_DIR}/{output_name}",
    )


## Sort the output TSS regions
_run_tool(
    ["bedtools", "sort", "-i", f"{_TSS_DIR}/compiled_pivot_all.bed"],
    stdout_path=_SORTED_CANDIDATES,
)

## Merge close TSS together if within 1 bp (same strand only, via -s)
_run_tool(
    [
        "mergeBed", "-s",
        "-c", "4,5,6,7,8,9",
        "-o", "distinct,distinct,distinct,distinct,distinct,distinct",
        "-d", "1",
        "-i", _SORTED_CANDIDATES,
    ],
    stdout_path=f"{_TSS_DIR}/compiled_pivot_wstrand_bed_simple.merge.sorted.bed",
)

# Closest EPDnew (hg38) entry for each candidate TSS
_closest_to_reference(
    f"{_ANNOTATION_DIR}/Hs_EPDnew_006_hg38_tab.bed",
    "compiled_pivot_wstrand_EPD.bed",
)

# Lift the reference TSS annotation over with the hg19ToHg38 chain file;
# records that cannot be lifted are written to ./unlifted.bed
_run_tool([
    "liftOver",
    f"{_ANNOTATION_DIR}/TSS_human.sorted.bed",
    f"{_ANNOTATION_DIR}/hg19ToHg38.over.chain.gz",
    f"{_ANNOTATION_DIR}/TSS_human_hg38.sorted.bed",
    "./unlifted.bed",
])

# Closest lifted-over reference TSS (the Weissman reference, per the original
# notebook comment) for each candidate TSS
_closest_to_reference(
    f"{_ANNOTATION_DIR}/TSS_human_hg38.sorted.bed",
    "compiled_pivot_wstrand_TSS.bed",
)

# Closest entry of the MCF7 promoter ChromHMM annotation for each candidate TSS
_closest_to_reference(
    f"{_ANNOTATION_DIR}/MCF7_PROM_CHROHMM.bed",
    "compiled_pivot_wstrand_CHROMHMM.bed",
)

# Closest refTSS v3.1 (hg38) entry for each candidate TSS
_closest_to_reference(
    f"{_ANNOTATION_DIR}/refTSS_v3.1_human_coordinate.hg38.sorted.bed",
    "compiled_pivot_wstrand_refTSS.bed",
)


# Columns describing a candidate TSS. They lead every closestBed output table
# read below and double as the join key when the tables are combined.
id_col = [
    'chr', 'TSSstart', 'TSSend', "TSS_ID", 'Gene_symbol',
    'strand', 'Ensembl_ID', 'MAJOR_MINOR', 'Source',
]
subid_col = ["TSS_ID", 'Ensembl_ID']

# Candidate TSS vs. EPD: candidate columns, then the EPD record, then distance.
EPDTSS = pd.read_csv(
    "../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_EPD.bed",
    sep="\t",
    header=None,
    names=id_col + [
        'EPDchr', 'EPDTSSstart', 'EPDTSSend', 'TSSname', 'colour',
        'EPDstrand', 'broadTSSstart', 'broadTSSend', 'EPD Distance',
    ],
    index_col=None,
)

# CHROMHMMTSS=pd.read_csv("../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_CHROMHMM.bed", sep="\t",header=None, names=['chr', 'TSSstart', 'TSSend', "TSS_ID", 'Gene_symbol','strand','Ensembl_ID','MAJOR_MINOR','Source'])

# Candidate TSS vs. refTSS.
refTSSTSS = pd.read_csv(
    "../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_refTSS.bed",
    sep="\t",
    header=None,
    names=id_col + [
        'refTSSchr', 'refTSSstart', 'refTSSend', 'refTSSid', 'num',
        'refTSSstrand', 'refTSSstartbroad', 'refTSSendbroad', 'colour',
        'refTSS distance',
    ],
)

# Candidate TSS vs. the lifted-over reference TSS table (CRISPRi columns).
CRISPRiTSS = pd.read_csv(
    "../pacbio_cage_proactiv_TSS_files/compiled_pivot_wstrand_TSS.bed",
    sep="\t",
    header=None,
    names=id_col + [
        'CRISPchr', 'CRISPstart', 'CRISPend', 'CRISPid', '0',
        'CRISPstrand', 'CRISPstartbroad', 'CRISPendbroad', 'colour',
        'CRISRPi distance',
    ],
)

# Join the three tables on the shared candidate-TSS columns, then keep only the
# three distance columns for the comparison plot.
alldistances = pd.merge(
    pd.merge(EPDTSS, refTSSTSS, on=id_col),
    CRISPRiTSS,
    on=id_col,
)
distance_columns = ['CRISRPi distance', 'refTSS distance', 'EPD Distance']
alldistances_sub = alldistances[distance_columns]
alldistances_sub

def corrdot(*args, **kwargs):
    """Draw a Pearson-correlation "dot" on the current matplotlib axes.

    Intended as a mapping function for the upper triangle of a PairGrid: the
    axes are hidden and replaced by a single circle whose size and colour
    encode the correlation, with the coefficient printed on top.

    Args:
        *args: The first two positional arguments are the x and y data; each
            must provide a ``.corr(other, method)`` method (presumably pandas
            Series, as passed by seaborn — confirm if used elsewhere).
        **kwargs: Accepted and ignored, so that callers may pass extra
            plotting keywords without error.
    """
    corr_r = args[0].corr(args[1], 'pearson')
    # Drop the leading zero: "0.53" -> ".53", "-0.53" -> "-.53".
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()

    if np.isnan(corr_r):
        # An undefined correlation (e.g. a constant column) would otherwise
        # produce NaN marker and font sizes; show the label at the base size.
        ax.annotate(corr_text, [.5, .5,], xycoords="axes fraction",
                    ha='center', va='center', fontsize=5)
        return

    # Circle area and label size both grow with |r|.
    marker_size = abs(corr_r) * 10000
    ax.scatter([.5], [.5], marker_size, [corr_r], alpha=0.6, cmap="coolwarm",
               vmin=-1, vmax=1, transform=ax.transAxes)
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(corr_text, [.5, .5,],  xycoords="axes fraction",
                ha='center', va='center', fontsize=font_size)


# Pairwise comparison of the three distance columns. corrdot is defined above
# so that this section also runs when the file is executed top to bottom.
sns.set(style='white', font_scale=1.0)
g = sns.PairGrid(alldistances_sub, aspect=1.4, diag_sharey=False)
# Lower triangle: scatter plot with a LOWESS trend line.
g.map_lower(sns.regplot, lowess=True, ci=False, line_kws={'color': 'black'})
# Diagonal: per-column distribution.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; left
# unchanged here to keep the figure identical — confirm before upgrading.
g.map_diag(sns.distplot, kde_kws={'color': 'black'})
# Upper triangle: correlation dots.
g.map_upper(corrdot)

# Notebook cell output: <seaborn.axisgrid.PairGrid at 0x7ffdc0eb5520>

# Check the Distances Between Called TSS and Known TSS References