Differential Expression analysis between P1 & P2

Run through the 10x Cellranger pipeline and velocyto for single cell RNAseq quatification and using (2) guides quantifiction. all found in the cellranger files folder bash
Guide Calling for dual guide. Use repogle method to take molecule.h5 generated by cellranger and py to run through repogle version of guide calling or use cellranger_guidecalling.ipynb for Direct Capture Perturb-Seq dual guide. Formed guide-specific lists of cells.
Pseudobulk analysis. A. Seperation of guide-specific fastq files. bash B. Whippet pseudobulk for transcript specific analysis, post UMI deduplication. bash C. Transcript quality control. R D. Whippet result visualisation.
Normalisation of adata object and E-distance of KD
Check gene and neighboring gene expression
Create individual umaps per gene of interest A. UMAPs B. Rand Index score
Cell phase assignment model from FUCCI-matched single cell paper (GSE146773)
Differential Expression analysis. A. Find the shared P1 and P2 genes. B. Check the shared P1 and P2 across different protospacers with the same A/B and C/D.
CNV Score & Numbat to quantify and Velocity quantification with loom file
ESR1-specific analysis from proliferation analysis to rt-qpcr
Spectra analysis and visualisation for pathway enrichment

It performs the Systematic Differential Expression (DE) Analysis requested by reviewers to quantify and compare the transcriptional effects of P1 and P2 promoter knockdowns. The notebook identifies which genes change significantly in expression following the knockdown of a specific promoter compared to non-targeting controls (NTC).

Methodology: It utilizes the scanpy.tl.rank_genes_groups function, applying a t-test or Mann-Whitney U test (Z-score) to identify differentially expressed genes (DEGs).

Filtering Criteria: DEGs are strictly filtered based on a \(log_{2}FC > 0.5\), an adjusted p-value \(< 0.05\), and a requirement that the gene is expressed in \(> 10\%\) of the cells in the perturbed population.

[ ]:

%load_ext autoreload
%matplotlib inline
%autoreload 2
#general
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import hdf5plugin
import gseapy as gp

#form a location
loc="../alt-prom-crispr-fiveprime/"

import seaborn as sns
from tqdm.notebook import tqdm
import scperturb
import sys
sys.path.append(loc+'scripts/')
from apu_analysis import *
import scperturb
import infercnvpy as cnv
from apu_analysis.cell_import import CellPopulation
from IPython.display import clear_output
pd.options.display.float_format = '{:.4f}'.format
import matplotlib.pyplot as plt

#for this python
from scipy.special import rel_entr
import sklearn.cluster as cluster
import umap
from scipy import stats
from scipy.stats import bootstrap
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests
from numpy import reshape
from numpy import array
from sklearn.decomposition import PCA
import scipy.stats as stats
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from scipy.stats import mannwhitneyu

# Taken from:
# Adamson, B.A., Norman, T.M., *et al.* "A multiplexed CRISPR screening platform enables systematic dissection of the unfolded protein response", *Cell*, 2016.
# My experiment deals with two KDs- one of the MP, one of the AP using two guides. Positve controls include GINS1 ect. This is a combnatorial KD double for the same gene. No treatments were used

# colours using garvan
color1 ='#4d00c7'
palecolor1="#b366ff"
color2= '#da3c07'
palecolor2="#ff8954"
color3='#05d3d3'
color4='#c6c7c5'
color4="#434541"
color5="#eb31e1"
color6="#3175eb"
color7="#a7eb31"
color8="#b366ff"
color9="#ff8954"
color10="#35c9d4"

def run_gsea_for_gene(adata, gene):
    print(gene)

    gene_AP = gene + "_AP"
    gene_MP = gene + "_MP"

    # ---------- MP GUIDE ----------
    # get full DE table (NO pval/logFC cutoff for GSEA)
    MP_rank = sc.get.rank_genes_groups_df(adata, group=gene_MP, pval_cutoff=None)

    # make a 2-column ranking like your STIM example:
    #   [names, logfoldchanges]
    mp_rnk = MP_rank.loc[:, ["names", "logfoldchanges"]].dropna()

    # run prerank
    pre_res_mp = gp.prerank(
        rnk=mp_rnk,                 # DataFrame with [gene, score]
        gene_sets="GO_Biological_Process_2021",      # or your favourite collection
        outdir=None,                # don't write files
        seed=123
    )

    # grab the result table
    mp_res_df = pre_res_mp.res2d.copy()
    mp_res_df["Target_Gene"] = gene
    mp_res_df["Guide"] = gene_MP
    # classify direction by NES sign
    mp_res_df["Direction"] = np.where(mp_res_df["NES"] > 0, "Up", "Down")
    gsea_results.append(mp_res_df)

    # ---------- AP GUIDE ----------
    AP_rank = sc.get.rank_genes_groups_df(adata, group=gene_AP, pval_cutoff=None)
    ap_rnk = AP_rank.loc[:, ["names", "logfoldchanges"]].dropna()

    pre_res_ap = gp.prerank(
        rnk=ap_rnk,
        gene_sets="GO_Biological_Process_2021",
        outdir=None,
        seed=123
    )

    ap_res_df = pre_res_ap.res2d.copy()
    ap_res_df["Target_Gene"] = gene
    ap_res_df["Guide"] = gene_AP
    ap_res_df["Direction"] = np.where(ap_res_df["NES"] > 0, "Up", "Down")
    #add together ap_ res_df and mp_res_df
    combined_res_df = pd.concat([mp_res_df, ap_res_df], ignore_index=True)
    return combined_res_df ,pre_res_ap, pre_res_mp

FDR = 0.05
LOG_FOLD_CHANGE = 0.5

# Create the color palette
palette = sns.color_palette([palecolor1,palecolor2,color3,color4])
palette2 = sns.color_palette([color1, color2, palecolor1, palecolor2,color5,color6 ,color7])

# # Create the color palette
# palette = sns.color_palette([color1, color2,color3])
new_palette = sns.color_palette([color1, color2,color1, color2,color1, color2,color1, color2,color1, color2,color1, color2, color3, color4])

from random import randrange
pvalue=0.05

print("Scanpy", sc.__version__)
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Scanpy 1.10.3

[36]:

%%capture
adata = ad.read_h5ad(loc+"files/adata_normalised_cellcycle.h5ad")
#set the directio
#perform log1p on the data
adata.obs["guide_assignment_simple"]=np.where(((adata.obs["guide_assignment"].str.startswith("non"))|(adata.obs["guide_assignment"].str.startswith("Non"))), "non-targeting", adata.obs["guide_assignment"])
gene_list_select=adata.obs["guide_assignment_simple"][adata.obs["guide_assignment_simple"]!="non-targeting"].drop_duplicates().tolist()
# sc.tl.rank_genes_groups(adata, 'guide_assignment_simple', groups=gene_list_select,reference="non-targeting", method='wilcoxon', use_raw=False)
sc.tl.rank_genes_groups(adata, 'guide_assignment_simple', groups=gene_list_select,reference="non-targeting", method='t-test', use_raw=False)

... storing 'guide_assignment_simple' as categorical

[37]:

#get the differentially expressed genes
# %%capture
gene_list=[]
pvalue=0.05
number=1000
sortby="pvals_adj_1"
min_fc=0.5
#get the highly_variable genes
extract_highly_variable_genes=adata.var[adata.var["highly_variable"]==True].index
gene_list_recursive=adata.obs["perturbation"][adata.obs["guide_id"]!="non-targeting_Control"].drop_duplicates().values
#remove ARNT and if gene_A CSNK1E

[38]:

#get the differentially expressed genes
# %%capture
gene_list=[]
pvalue=0.05
logFC=0.5
number=1000
sortby="rank_1"
min_fc=1
#get the highly_variable genes
extract_highly_variable_genes=adata.var[adata.var["highly_variable"]==True].index
gene_list_recursive=adata.obs["perturbation"][adata.obs["guide_id"]!="non-targeting_Control"].drop_duplicates().values
#remove ARNT and if gene_A CSNK1E
gene_list_recursive=gene_list_recursive[~gene_list_recursive.isin(["ARNT","CSNK1E"])]
for gene in adata.obs["perturbation"][(adata.obs["guide_id"]!="non-targeting_Control") & (adata.obs["successfulKD"]=="True")].drop_duplicates().values:
    # print(gene)
    #AP
    gene_AP=gene+"_AP"
    if gene_AP not in adata.obs["guide_id"][adata.obs["guide_id"]!="non-targeting_Control"].drop_duplicates().values:
        continue
    elif gene in ["ARNT"]:
        print("ARNT")
    else:
        for promoter in ["MP","AP"]:
            gene_promoter=gene+"_"+promoter+"_"+str(1)
            gene_promoter_2=gene+"_"+promoter+"_"+str(2)
            gene_promoter_random_1=gene_list_recursive[randrange(len(gene_list_recursive))]+"_"+promoter+"_"+str(1)
            gene_promoter_random_2=gene_list_recursive[randrange(len(gene_list_recursive))]+"_"+promoter+"_"+str(2)
            ##form a dataframe with custom colummns
            #subset random gene from gene list
            protospacer_1= sc.get.rank_genes_groups_df(adata, group=gene_promoter,pval_cutoff=pvalue)
            #order by the absolute value of the logfoldchange
            protospacer_1["abs_logfoldchanges"]=protospacer_1["logfoldchanges"].abs()
            protospacer_1.sort_values(by="abs_logfoldchanges",ascending=False,inplace=True)
            #add a rank column
            protospacer_1["rank"]=range(1,len(protospacer_1)+1)
            protospacer_2= sc.get.rank_genes_groups_df(adata, group=gene_promoter_2,pval_cutoff=pvalue)
            protospacer_2["abs_logfoldchanges"]=protospacer_2["logfoldchanges"].abs()
            protospacer_2.sort_values(by="abs_logfoldchanges",ascending=False,inplace=True)
            #add a rank column
            protospacer_2["rank"]=range(1,len(protospacer_2)+1)
            protospacer_1_random= sc.get.rank_genes_groups_df(adata, group=gene_promoter_random_1,pval_cutoff=pvalue)
            protospacer_1_random["abs_logfoldchanges"]=protospacer_1_random["logfoldchanges"].abs()
            protospacer_1_random.sort_values(by="abs_logfoldchanges",ascending=False,inplace=True)
            #add a rank column
            protospacer_1_random["rank"]=range(1,len(protospacer_1_random)+1)
            protospacer_2_random= sc.get.rank_genes_groups_df(adata, group=gene_promoter_random_2,pval_cutoff=pvalue)
            protospacer_2_random["abs_logfoldchanges"]=protospacer_2_random["logfoldchanges"].abs()
            protospacer_2_random.sort_values(by="abs_logfoldchanges",ascending=False,inplace=True)
            #add a rank column
            protospacer_2_random["rank"]=range(1,len(protospacer_2_random)+1)
            gene_df=protospacer_1.merge(protospacer_2, on="names", how="left", suffixes=('_1', '_2')).dropna()
            #filter for highly variable genes
            gene_df_random_1=protospacer_1.merge(protospacer_1_random, on="names", how="left", suffixes=('_1', '_2')).dropna()
            gene_df_random_2=protospacer_1.merge(protospacer_2_random, on="names", how="left", suffixes=('_1', '_2')).dropna()
            rank_columns=["rank_1","rank_2"]
            #get the correlation for  scores_1 and scores_2 for corr
            correlation_1=gene_df.sort_values(by=sortby, ascending=True)[rank_columns].corr(method='spearman',numeric_only=True)
            correlation_random_1=gene_df_random_1.sort_values(by=sortby, ascending=True)[rank_columns].corr(method='spearman',numeric_only=True)
            correlation_random_2=gene_df_random_2.sort_values(by=sortby, ascending=True)[rank_columns].corr(method='spearman',numeric_only=True)
            #create a row wtht he gene name
            gene_list_row=[gene,promoter]
            gene_list_row.extend([correlation_1.loc["rank_1","rank_2"],correlation_random_1.loc["rank_1","rank_2"],correlation_random_2.loc["rank_1","rank_2"]])
            gene_list.append(gene_list_row)


#flatten list of lists into dataframe
gene_df_final=pd.DataFrame(gene_list, columns=["gene","promoter","scores_1_targeted","scores_1_random","scores_2_random"])

[39]:

#get the differentially expressed genes
# %%capture
# pvalue=0.05
gene_list=[]
gene_list_recursive=adata.obs["perturbation"][adata.obs["guide_id"]!="non-targeting_Control"].drop_duplicates().values
#remove ARNT and if gene_A CSNK1E
gene_list_recursive=gene_list_recursive[~gene_list_recursive.isin(["ARNT","CSNK1E"])]
for gene in adata.obs["perturbation"][(adata.obs["guide_id"]!="non-targeting_Control") & (adata.obs["successfulKD"]=="True")].drop_duplicates().values:
    print(gene)
    #AP
    gene_AP=gene+"_AP"
    if gene_AP not in adata.obs["guide_id"][adata.obs["guide_id"]!="non-targeting_Control"].drop_duplicates().values:
        continue
    elif gene in ["ARNT"]:
        print("ARNT")
    else:
        for promoter in ["MP","AP"]:
            gene_promoter=gene+"_"+promoter+"_"+str(1)
            protospacer_1= sc.get.rank_genes_groups_df(adata, group=gene_promoter,pval_cutoff=pvalue)
            protospacer_1=protospacer_1['names'].values

            gene_promoter=gene+"_"+promoter+"_"+str(2)
            protospacer_2= sc.get.rank_genes_groups_df(adata, group=gene_promoter,pval_cutoff=pvalue)
            protospacer_2=protospacer_2['names'].values

            gene_promoter=gene_list_recursive[randrange(len(gene_list_recursive))]+"_"+promoter+"_"+str(1)
            protospacer_1_random= sc.get.rank_genes_groups_df(adata, group=gene_promoter,pval_cutoff=pvalue)
            protospacer_1_random=protospacer_1_random["names"].values

            gene_promoter=gene_list_recursive[randrange(len(gene_list_recursive))]+"_"+promoter+"_"+str(2)
            protospacer_2_random= sc.get.rank_genes_groups_df(adata, group=gene_promoter,pval_cutoff=pvalue)
            protospacer_2_random=protospacer_2_random["names"].values


            overlap = [x for x in protospacer_1 if x in protospacer_2]
            overlap_random_1 = [x for x in overlap if x in protospacer_1_random]
            overlap_random_2 = [x for x in overlap if x in protospacer_2_random]
            #if overlap protospacer_1 protospacer_2 overlap_random_2 overlap_random_1 = 0 set to 1
            # print(protospacer_1,protospacer_2,overlap,overlap_random_1,overlap_random_2)
            try:
                overlap_1=len(overlap)/len(protospacer_1)
                overlap_2=len(overlap)/len(protospacer_2)
                overlap_random_1=len(overlap_random_1)/len(protospacer_1_random)
                overlap_random_2=len(overlap_random_2)/len(protospacer_2_random)
            except ZeroDivisionError:
                overlap_1=0
                overlap_2=0
                overlap_random_1=0
                overlap_random_2=0

            gene_df=pd.DataFrame([[gene,promoter,overlap_1,overlap_2,overlap_random_1,overlap_random_2]])
            # gene_df_details=pd.DataFrame([[gene,promoter,protospacer_1,protospacer_2,overlap,len(protospacer_1),len(protospacer_2),len(overlap)]])

            # print(gene_df.shape)
            gene_list.append(gene_df)
            # break


# col_names=["Gene","Promoter","Protospacer_1","Protospacer_2","Overlap","Protospacer_Random_1","Protospacer_Random_2"]
#add col_names
col_names=["Gene","Promoter","Overlap_1","Overlap_2","Overlap_Random_1","Overlap_Random_2"]
all=pd.DataFrame(pd.concat(gene_list))
all.columns=col_names

AHCYL1
SP1
TGIF1
YWHAZ
P4HB
NCOR2
JARID2
ESR1
PA2G4
GPBP1
DHX30
SAFB
NFE2L2
CHD8
SIN3A
PSMC5
RC3H2
ZNF3
GTF2F1
ADAR
NR2F2
BZW1
FHL2
SET
MYBBP1A
CUX1
NBN
CALR
APP
STAG2
PKM
BRIP1
GATA3
RBM47

[40]:

#get the tests between two unpaired parametric groups
#melt for random
all_melt=all.melt(id_vars=["Gene","Promoter"])
all.index=all["Gene"]+"_"+all["Promoter"]
print(stats.ttest_ind(all["Overlap_1"],all["Overlap_2"],equal_var=False))
print(stats.ttest_ind(all["Overlap_Random_1"],all["Overlap_Random_2"],equal_var=False))
print(stats.ttest_ind(all["Overlap_1"],all["Overlap_Random_1"],equal_var=False))
print(stats.ttest_ind(all["Overlap_2"],all["Overlap_Random_2"],equal_var=False))

TtestResult(statistic=-1.200758436011414, pvalue=0.23196364293947955, df=133.97197243239725)
TtestResult(statistic=-0.8170297166398177, pvalue=0.4153781706070453, df=132.49121568852897)
TtestResult(statistic=10.428675297500293, pvalue=2.596256035375967e-18, df=114.67625808804517)
TtestResult(statistic=10.723518386158062, pvalue=2.9404501919466304e-19, df=120.79414195321442)

A central goal of this notebook is to determine the degree of overlap between the pathways regulated by different promoters of the same gene.

Limited Overlap: The analysis reveals that for most genes, there is minimal overlap between the DEGs affected by P1 versus P2 knockdown. This supports the conclusion that alternative promoters regulate largely distinct downstream pathways.

Z-Score Correlation: For the small subset of shared DEGs, the notebook calculates Pearson correlations of Z-scores. As shown in the study, these shared genes typically show high positive correlations, indicating a coherent regulatory response even when the number of shared genes is small.

[ ]:

#plot the overlap between the two guides
sns.set(style="white")
sns.set_context("talk")
fig, ax = plt.subplots(figsize=(8, 5))
#show the percentage over "Overlap_length"/"Protospacer_1_length"
sns.kdeplot(data=all_melt,x="value",hue="variable",palette=new_pallete,
)
#save plot
plt.savefig(loc+"figures/overlap_gene_signature.pdf")
gene_list_select=adata.obs["guide_id"][adata.obs["guide_id"]!="non-targeting_Control"].drop_duplicates().values
sc.tl.rank_genes_groups(adata, 'guide_id', groups=gene_list_select.to_list(),reference="non-targeting_Control", method='t-test')
genelist=adata.obs['guide_id'].drop_duplicates().tolist()

[42]:

%%capture
# comp_gene_list MP vs AP summary table for each candidate gene
# all_genes_df full DE results for all guides (MP and AP)
# all_genes_df_filt statistically significant DEGs used for downstream signature analysis

sc.tl.rank_genes_groups(adata, 'guide_id',groups=genelist, reference="non-targeting_Control", use_raw=False, pts=0.1)

#get target gene logfoldchanges
#get the differentially expressed genes
gene_list=[]
comp_gene_list=[]

for gene in adata.obs["perturbation"][(adata.obs["guide_id"]!="non-targeting_Control")& (adata.obs["successfulKD"]=="True")].drop_duplicates().values:
    # print(gene)
    #AP
    gene_AP=gene+"_AP"
    gene_MP=gene+"_MP"

    if gene_AP not in adata.obs["guide_id"][(adata.obs["guide_id"]!="non-targeting_Control")& (adata.obs["successfulKD"]=="True")].drop_duplicates().values:
        continue
    else:
        ##form a dataframe with custom colummns
        MP = sc.get.rank_genes_groups_df(adata, group=gene_MP)
        MP_filt = MP[(MP["names"]==gene)]
        AP = sc.get.rank_genes_groups_df(adata, group=gene_AP)
        AP_filt = AP[(AP["names"]==gene)]
        #do it for every gene
        MP["targeted_gene"]=gene_MP
        AP["targeted_gene"]=gene_AP
        gene_list.append(MP)
        gene_list.append(AP)
        promoter_line=[gene,MP["scores"].values[0],MP["logfoldchanges"].values[0],MP["pvals_adj"].values[0],AP["scores"].values[0],AP["logfoldchanges"].values[0],AP["pvals_adj"].values[0]]
        comp_gene_list.append(promoter_line)
#convert to dataframe
comp_gene_list=pd.DataFrame(comp_gene_list, columns=["gene","MP_zscore","MP_logfoldchange","MP_pvalue","AP_zscore","AP_logfoldchange","AP_pvalue"])

#merge all gene dfs
all_genes_df=pd.DataFrame(pd.concat(gene_list))
#change the column names
all_genes_df.columns=["gene","scores","logfoldchanges","pvals","pvals_adj","pct_nz_group","targeted_gene"]
#filter for logfoldchange > 0.5 and pvalue < 0.05
all_genes_df_filt=all_genes_df[(all_genes_df["pvals_adj"]<0.05) & (all_genes_df["logfoldchanges"].abs()>0.5)]

[43]:

%%capture
# Run DE once
# all  → Summary of MP/AP DEG counts & correlations (one row per target gene)
# directional_genes_df → Up- and down-regulated DEG signatures per target
# all_filt → Summary table filtered for successful KD

gene_list = []          # per-target summary (MP/AP counts, overlap, correlation)
comp_gene_list = []     # full DE tables for each MP/AP guide
directional_genes = []  # per-target, per-direction summary (Up / Down)

# Cache valid guides to avoid recomputing each loop
valid_guides = adata.obs.loc[
    adata.obs["guide_id"] != "non-targeting_Control", "guide_id"
].drop_duplicates().values

perturbations = adata.obs.loc[
    adata.obs["guide_id"] != "non-targeting_Control", "perturbation"
].drop_duplicates().values

for gene in perturbations:
    print(gene)

    gene_AP = gene + "_AP"
    gene_MP = gene + "_MP"

    # skip if there is no AP guide for this gene
    if gene_AP not in valid_guides:
        continue

    # ------- MP -------
    MP = sc.get.rank_genes_groups_df(adata, group=gene_MP, pval_cutoff=0.05)
    MP = MP[MP["logfoldchanges"].abs() > 0.5]
    MP_names = MP["names"].values

    MP_up = MP[MP["logfoldchanges"] > 0]
    MP_up_names = MP_up["names"].values

    MP_down = MP[MP["logfoldchanges"] < 0]
    MP_down_names = MP_down["names"].values

    # ------- AP -------
    AP = sc.get.rank_genes_groups_df(adata, group=gene_AP, pval_cutoff=0.05)
    AP = AP[AP["logfoldchanges"].abs() > 0.5]
    AP_names = AP["names"].values

    AP_up = AP[AP["logfoldchanges"] > 0]
    AP_up_names = AP_up["names"].values

    AP_down = AP[AP["logfoldchanges"] < 0]
    AP_down_names = AP_down["names"].values

    # ------- Overlaps -------
    overlap       = [x for x in MP_names      if x in AP_names]
    overlap_up    = [x for x in MP_up_names   if x in AP_up_names]
    overlap_down  = [x for x in MP_down_names if x in AP_down_names]

    # correlation of scores for overlapping genes (MP vs AP)
    correlation_value = None
    if len(overlap) > 1:
        corr_df = MP[MP["names"].isin(overlap)].merge(
            AP[AP["names"].isin(overlap)],
            on="names",
            how="inner",
            suffixes=("_MP", "_AP")
        )
        corr_mat = corr_df[["scores_MP", "scores_AP"]].corr(
            method="pearson", numeric_only=True
        )
        correlation_value = corr_mat.loc["scores_MP", "scores_AP"]
    else:
        correlation_value = float("nan")

    # ------- per-gene summary row -------
    gene_row = pd.DataFrame(
        [[
            gene,
            MP_names,
            AP_names,
            overlap,
            len(MP_names),
            len(AP_names),
            len(overlap),
            correlation_value
        ]]
    )
    gene_list.append(gene_row)

    # ------- directional (Up / Down) rows -------
    gene_direction_df = pd.DataFrame(
        [
            [
                gene,
                MP_up_names,
                AP_up_names,
                overlap_up,
                len(MP_up_names),
                len(AP_up_names),
                len(overlap_up),
                "Up",
            ],
            [
                gene,
                MP_down_names,
                AP_down_names,
                overlap_down,
                len(MP_down_names),
                len(AP_down_names),
                len(overlap_down),
                "Down",
            ],
        ]
    )
    directional_genes.append(gene_direction_df)

    # ------- full DE tables with guide labels -------
    MP["Guide"] = gene_MP
    AP["Guide"] = gene_AP
    comp_gene_list.append(MP)
    comp_gene_list.append(AP)

# ---------- build final dataframes ----------

col_names = [
    "Target_Gene",
    "MP_Genes",
    "AP_Genes",
    "Overlap_Genes",
    "MP_Gene_Num",
    "AP_Gene_Num",
    "Overlap_Gene_Num",
    "correlation_value",
]
all = pd.DataFrame(pd.concat(gene_list, ignore_index=True))
all.columns = col_names

# only directional
directional_genes_df = pd.DataFrame(
    pd.concat(directional_genes, ignore_index=True)
)
directional_genes_df.columns = [
    "Target_Gene",
    "MP_Genes",
    "AP_Genes",
    "Overlap_Genes",
    "MP_Gene_Num",
    "AP_Gene_Num",
    "Overlap_Gene_Num",
    "Direction",
]

# filter to successful KDs
successful = adata.obs.loc[
    adata.obs["successfulKD"] == "True", "gene"
].drop_duplicates().values

directional_genes_df = directional_genes_df[
    directional_genes_df["Target_Gene"].isin(successful)
]

all_filt = all[all["Target_Gene"].isin(successful)].reset_index(drop=True)

# save if you want
# all_filt.to_csv(loc+"files/singlecell_shortread_analysis/differential_exp_list.csv")
all_filt.to_csv(loc+"files/singlecell_shortread_analysis/differential_exp_list_corr.csv")
comp_gene_df = pd.concat(comp_gene_list, ignore_index=True)
comp_gene_df.to_csv(loc+"files/singlecell_shortread_analysis/differential_exp_fulltable.csv", index=False)

[44]:

directional_genes_df.to_csv(loc+"files/singlecell_shortread_analysis/differential_exp_directional_genes.csv", index=False)

[45]:

#plot all_filt
#plot the correlation_value
sns.histplot(data=all_filt, x="correlation_value", bins=10, kde=True)
plt.xlabel("Correlation between Z-Scores \n from Overlapping DEGs in P1 and P2 ")
plt.ylabel("Count of Target Genes")
#add the number of genes with a value not na and all the genes
num_genes=len(all_filt)
signif_genes=all_filt[all_filt["correlation_value"].notna()]
num_signif_genes=len(signif_genes)
plt.text(0, 15, f'Significant n={num_signif_genes} / Total n={num_genes}', fontsize=12)
# all_filt
#save
plt.savefig(loc+"figures/correlation_DEG_Zscores_MP_AP.pdf")

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_13_0.png

[46]:

#save directional_genes_df to csv
directional_genes_df.to_csv(loc+"files/singlecell_shortread_analysis/differential_exp_directional_genes.csv", index=False)

[47]:

# long format for plotting distributions
long = directional_genes_df.melt(
    id_vars=["Target_Gene", "Direction"],
    value_vars=["MP_Gene_Num", "AP_Gene_Num", "Overlap_Gene_Num"],
    var_name="Category",
    value_name="N_genes"
)

# optional: nicer labels
long["Category"] = long["Category"].map({
    "MP_Gene_Num": "P1",
    "AP_Gene_Num": "P2",
    "Overlap_Gene_Num": "Overlap"
})

# Add log10 transform column
long["log10_N_genes"] = np.log10(long["N_genes"]+1e-7)  # replaces 0 by 1
#add log appropriate long["N_genes"] by assigning it as +0.0000001
#drop duplicates
long["N_adj"] = long["N_genes"].replace(0, 0.1)

long = long.drop_duplicates()

fig, axes = plt.subplots(2, 1, figsize=(8, 10), sharex=True)
#make the background axes lines
sns.set_style("whitegrid")

# ---- UP ----
sns.boxplot(
    data=long[long["Direction"] == "Up"],
    x="Category",
    y="N_adj",
    showfliers=False,
    palette="Set2",
    ax=axes[0]
)
sns.stripplot(
    data=long[long["Direction"] == "Up"],
    x="Category",
    y="N_adj",
    color="black",
    alpha=0.5,
    size=4,
    ax=axes[0]
)
axes[0].set_title("Up-regulated DEGs (log10)")
axes[0].set_ylabel("# DE genes per target")
axes[0].set_xlabel("")
axes[0].set_yscale('log')

# ---- DOWN ----
sns.boxplot(
    data=long[long["Direction"] == "Down"],
    x="Category",
    y="N_adj",
    palette="Set2",
    showfliers=False,
    ax=axes[1]
)
sns.stripplot(
    data=long[long["Direction"] == "Down"],
    x="Category",
    y="N_adj",
    color="black",
    alpha=0.5,
    size=4,
    ax=axes[1]
)
axes[1].set_yscale('log')
axes[1].set_title("Down-regulated DEGs (log10)")
axes[1].set_ylabel("# DE genes per target")
axes[1].set_xlabel("Category")
#change 10-1 to 0 on y axis
tick_vals = [0.1, 1, 10, 100]
for ax in axes:
    ax.set_yticks(tick_vals)
    ax.set_yticklabels(["0", "1", "10", "100"])

#repeat for both axes

#save as pdf
plt.savefig(loc+"figures/DEG_counts_per_target_promoter.pdf")
plt.tight_layout()
plt.show()

/var/folders/r_/bqnt2f_d6919cz7v1yzx2ykc0000gn/T/ipykernel_22880/435962864.py:29: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(
/var/folders/r_/bqnt2f_d6919cz7v1yzx2ykc0000gn/T/ipykernel_22880/435962864.py:52: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_15_1.png

[48]:

#find the top overlapping genes
long[(long["Direction"] == "Down")&(long["Category"]=="Overlap")].sort_values(by="N_genes", ascending=False).head(6)

[48]:

	Target_Gene	Direction	Category	N_genes	log10_N_genes	N_adj
185	MYBBP1A	Down	Overlap	33	1.5185	33.0000
151	ESR1	Down	Overlap	21	1.3222	21.0000
201	GATA3	Down	Overlap	20	1.3010	20.0000
203	RBM47	Down	Overlap	18	1.2553	18.0000
169	RC3H2	Down	Overlap	18	1.2553	18.0000
149	JARID2	Down	Overlap	17	1.2304	17.0000

[49]:

#can check that there are no genes with logfoldchange between -0.5 and 0.5 ad that pvals_adj
print(all_genes_df_filt[(all_genes_df_filt["logfoldchanges"].abs()<=0.5) & (all_genes_df_filt["pvals_adj"]<0.05)].shape)

#chec number of genes with sig. DEG all_genes_df_filt ESR1_MP ESR1_AP
#plot the distribution of logfoldchanges for ESR1_MP and ESR1_AP
all_genes_df_esr1=all_genes_df_filt[all_genes_df_filt["targeted_gene"].isin(["ESR1_MP","ESR1_AP"])]
all_genes_df_esr1["Direction"]=np.where(all_genes_df_esr1["logfoldchanges"]>0, "Up","Down")
#check the number of MP and AP genes per target gene and their overlap using
number_of_genes=all_genes_df_esr1.groupby(["targeted_gene","Direction"]).size().unstack(fill_value=0)
#see the number of gene that overlap between MP and AP
overlap_genes=all_genes_df_esr1[all_genes_df_esr1["targeted_gene"]=="ESR1_MP"]["gene"].isin(all_genes_df_esr1[all_genes_df_esr1["targeted_gene"]=="ESR1_AP"]["gene"])
print(number_of_genes)
print("Number of overlapping genes between ESR1_MP and ESR1_AP:", overlap_genes.sum())

(0, 7)
Direction      Down   Up
targeted_gene
ESR1_AP         634  355
ESR1_MP         154    0
Number of overlapping genes between ESR1_MP and ESR1_AP: 21

/var/folders/r_/bqnt2f_d6919cz7v1yzx2ykc0000gn/T/ipykernel_22880/1044356788.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_genes_df_esr1["Direction"]=np.where(all_genes_df_esr1["logfoldchanges"]>0, "Up","Down")

[50]:

genes_of_interest = ["JARID2", "RC3H2", "RBM47","GATA3", "ESR1", "MYBBP1A"]

plt.subplots_adjust(hspace=0.35, wspace=0.25)

# Determine grid shape
n = len(genes_of_interest)
ncols = 2
nrows = int(np.ceil(n / ncols))



for i in range(nrows):
    fig, axes = plt.subplots(
        1, ncols,
        figsize=(8 * ncols, 3 ),
        sharey=False   # <-- keep y shared, but NOT x
    )
    axes = axes.flatten()
    #sub genes_of_interest list for current row
    row_genes_of_interest = genes_of_interest[i*ncols:(i+1)*ncols]
    for ax, tg in zip(axes, row_genes_of_interest):

        mp = (
            all_genes_df_filt.loc[
                all_genes_df_filt["targeted_gene"] == f"{tg}_MP",
                ["gene", "scores"]
            ].rename(columns={"scores": "score_MP"})
        )
        ap = (
            all_genes_df_filt.loc[
                all_genes_df_filt["targeted_gene"] == f"{tg}_AP",
                ["gene", "scores"]
            ].rename(columns={"scores": "score_AP"})
        )

        overlap = mp.merge(ap, on="gene", how="inner")

        if overlap.empty:
            ax.set_axis_off()
            ax.set_title(f"{tg}: no overlapping genes", fontsize=10)
            continue

        r = overlap["score_MP"].corr(overlap["score_AP"])

        # draw heatmap
        sns.heatmap(
            overlap.set_index("gene").T,
            cmap="vlag",
            center=0,
            cbar_kws={"label": "Z-Score"},
            ax=ax
        )

        # ---- force ALL gene labels to show ----
        genes = overlap["gene"].tolist()
        ax.set_xticks(np.arange(len(genes)) + 0.05)        # centers of cells
        ax.set_xticklabels(genes, rotation=60, ha="right", fontsize=7)
        ax.tick_params(axis='x', which='major', pad=2)
        #remove yticklabels
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)

        ax.set_title(f"{tg}\nn={len(overlap)}, r={r:.2f}", fontsize=12)

    plt.subplots_adjust(hspace=0.35, wspace=0.25)
    #save
    plt.savefig(loc+f"figures/DEG_Zscore_heatmap_MP_AP_row{i+1}.pdf")
    plt.show()

<Figure size 640x480 with 0 Axes>

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_18_1.png

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_18_2.png

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_18_3.png

[51]:

#change MP to P1 and AP to P2
comp_gene_df["Guide"] = comp_gene_df["Guide"].str.replace("_MP", "_P1")
comp_gene_df["Guide"] = comp_gene_df["Guide"].str.replace("_AP", "_P2")
pivot_df = comp_gene_df.pivot_table(
    index="names",
    columns="Guide",
    values="logfoldchanges",
    aggfunc="median"
)

pivot_df = pivot_df.fillna(0)

sns.set(style="white")
#make figure larger
plt.figure(figsize=(8, 4))
#make max and min +2 and -2
vmax = 2
vmin = -2

sns.heatmap(
    pivot_df,
    cmap="coolwarm",
    center=0,
    linewidths=0,
    xticklabels=True,
    yticklabels=False,   # set to True if you want labels (but 17k genes = unreadable)
    vmax=vmax,
    vmin=vmin
)
plt.title("LogFC Heatmap for all guides")
plt.xlabel("Guide")
plt.ylabel("Genes")
#remove vadj
plt.tight_layout()
#make x smaller size
plt.xticks(fontsize=5)
#add colorbar label titlee saying Log Fold Change
cbar = plt.gcf().axes[-1]
cbar.set_ylabel('Log Fold Change (Capped ±2)', rotation=270, labelpad=15)
plt.show()

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_19_0.png

[52]:

#make a correlation between the logfoldchanges for MP vs AP for all genes
corr=pivot_df.corr(method='pearson',numeric_only=True)
#plot the correlation as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    corr,
    cmap="viridis",
    center=0,
    linewidths=0,
    xticklabels=True,
    yticklabels=True

)
plt.title("Correlation Heatmap of LogFC between Guides")
plt.xlabel("Guide")
plt.ylabel("Guide")
plt.xticks(fontsize=4)
plt.yticks(fontsize=4)

[52]:

(array([  0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,
          9.5,  10.5,  11.5,  12.5,  13.5,  14.5,  15.5,  16.5,  17.5,
         18.5,  19.5,  20.5,  21.5,  22.5,  23.5,  24.5,  25.5,  26.5,
         27.5,  28.5,  29.5,  30.5,  31.5,  32.5,  33.5,  34.5,  35.5,
         36.5,  37.5,  38.5,  39.5,  40.5,  41.5,  42.5,  43.5,  44.5,
         45.5,  46.5,  47.5,  48.5,  49.5,  50.5,  51.5,  52.5,  53.5,
         54.5,  55.5,  56.5,  57.5,  58.5,  59.5,  60.5,  61.5,  62.5,
         63.5,  64.5,  65.5,  66.5,  67.5,  68.5,  69.5,  70.5,  71.5,
         72.5,  73.5,  74.5,  75.5,  76.5,  77.5,  78.5,  79.5,  80.5,
         81.5,  82.5,  83.5,  84.5,  85.5,  86.5,  87.5,  88.5,  89.5,
         90.5,  91.5,  92.5,  93.5,  94.5,  95.5,  96.5,  97.5,  98.5,
         99.5, 100.5, 101.5, 102.5, 103.5, 104.5, 105.5]),
 [Text(0, 0.5, 'ADAR_P1'),
  Text(0, 1.5, 'ADAR_P2'),
  Text(0, 2.5, 'AHCYL1_P1'),
  Text(0, 3.5, 'AHCYL1_P2'),
  Text(0, 4.5, 'APP_P1'),
  Text(0, 5.5, 'APP_P2'),
  Text(0, 6.5, 'ARNT_P1'),
  Text(0, 7.5, 'ARNT_P2'),
  Text(0, 8.5, 'BRIP1_P1'),
  Text(0, 9.5, 'BRIP1_P2'),
  Text(0, 10.5, 'BZW1_P1'),
  Text(0, 11.5, 'BZW1_P2'),
  Text(0, 12.5, 'CALR_P1'),
  Text(0, 13.5, 'CALR_P2'),
  Text(0, 14.5, 'CCND1_P1'),
  Text(0, 15.5, 'CCND1_P2'),
  Text(0, 16.5, 'CHD4_P1'),
  Text(0, 17.5, 'CHD4_P2'),
  Text(0, 18.5, 'CHD8_P1'),
  Text(0, 19.5, 'CHD8_P2'),
  Text(0, 20.5, 'CTNNB1_P1'),
  Text(0, 21.5, 'CTNNB1_P2'),
  Text(0, 22.5, 'CUX1_P1'),
  Text(0, 23.5, 'CUX1_P2'),
  Text(0, 24.5, 'DHX30_P1'),
  Text(0, 25.5, 'DHX30_P2'),
  Text(0, 26.5, 'EEF2_P1'),
  Text(0, 27.5, 'EEF2_P2'),
  Text(0, 28.5, 'ESR1_P1'),
  Text(0, 29.5, 'ESR1_P2'),
  Text(0, 30.5, 'FHL2_P1'),
  Text(0, 31.5, 'FHL2_P2'),
  Text(0, 32.5, 'GATA3_P1'),
  Text(0, 33.5, 'GATA3_P2'),
  Text(0, 34.5, 'GPBP1_P1'),
  Text(0, 35.5, 'GPBP1_P2'),
  Text(0, 36.5, 'GTF2F1_P1'),
  Text(0, 37.5, 'GTF2F1_P2'),
  Text(0, 38.5, 'HNRNPU_P1'),
  Text(0, 39.5, 'HNRNPU_P2'),
  Text(0, 40.5, 'JARID2_P1'),
  Text(0, 41.5, 'JARID2_P2'),
  Text(0, 42.5, 'KDM2A_P1'),
  Text(0, 43.5, 'KDM2A_P2'),
  Text(0, 44.5, 'MYBBP1A_P1'),
  Text(0, 45.5, 'MYBBP1A_P2'),
  Text(0, 46.5, 'NAP1L1_P1'),
  Text(0, 47.5, 'NAP1L1_P2'),
  Text(0, 48.5, 'NBN_P1'),
  Text(0, 49.5, 'NBN_P2'),
  Text(0, 50.5, 'NCOR2_P1'),
  Text(0, 51.5, 'NCOR2_P2'),
  Text(0, 52.5, 'NFE2L2_P1'),
  Text(0, 53.5, 'NFE2L2_P2'),
  Text(0, 54.5, 'NME2_P1'),
  Text(0, 55.5, 'NME2_P2'),
  Text(0, 56.5, 'NR2F2_P1'),
  Text(0, 57.5, 'NR2F2_P2'),
  Text(0, 58.5, 'P4HB_P1'),
  Text(0, 59.5, 'P4HB_P2'),
  Text(0, 60.5, 'PA2G4_P1'),
  Text(0, 61.5, 'PA2G4_P2'),
  Text(0, 62.5, 'PKM_P1'),
  Text(0, 63.5, 'PKM_P2'),
  Text(0, 64.5, 'PRRC2C_P1'),
  Text(0, 65.5, 'PRRC2C_P2'),
  Text(0, 66.5, 'PSMC5_P1'),
  Text(0, 67.5, 'PSMC5_P2'),
  Text(0, 68.5, 'RBM47_P1'),
  Text(0, 69.5, 'RBM47_P2'),
  Text(0, 70.5, 'RC3H2_P1'),
  Text(0, 71.5, 'RC3H2_P2'),
  Text(0, 72.5, 'SAFB_P1'),
  Text(0, 73.5, 'SAFB_P2'),
  Text(0, 74.5, 'SET_P1'),
  Text(0, 75.5, 'SET_P2'),
  Text(0, 76.5, 'SIN3A_P1'),
  Text(0, 77.5, 'SIN3A_P2'),
  Text(0, 78.5, 'SLTM_P1'),
  Text(0, 79.5, 'SLTM_P2'),
  Text(0, 80.5, 'SMARCA4_P1'),
  Text(0, 81.5, 'SMARCA4_P2'),
  Text(0, 82.5, 'SP1_P1'),
  Text(0, 83.5, 'SP1_P2'),
  Text(0, 84.5, 'SRSF5_P1'),
  Text(0, 85.5, 'SRSF5_P2'),
  Text(0, 86.5, 'STAG2_P1'),
  Text(0, 87.5, 'STAG2_P2'),
  Text(0, 88.5, 'STAT3_P1'),
  Text(0, 89.5, 'STAT3_P2'),
  Text(0, 90.5, 'STAU2_P1'),
  Text(0, 91.5, 'STAU2_P2'),
  Text(0, 92.5, 'TGIF1_P1'),
  Text(0, 93.5, 'TGIF1_P2'),
  Text(0, 94.5, 'TPD52L1_P1'),
  Text(0, 95.5, 'TPD52L1_P2'),
  Text(0, 96.5, 'UPF3B_P1'),
  Text(0, 97.5, 'UPF3B_P2'),
  Text(0, 98.5, 'XRCC5_P1'),
  Text(0, 99.5, 'XRCC5_P2'),
  Text(0, 100.5, 'YWHAZ_P1'),
  Text(0, 101.5, 'YWHAZ_P2'),
  Text(0, 102.5, 'ZNF217_P1'),
  Text(0, 103.5, 'ZNF217_P2'),
  Text(0, 104.5, 'ZNF3_P1'),
  Text(0, 105.5, 'ZNF3_P2')])

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_20_1.png

Pathway and GO Enrichment Analysis To move from individual gene lists to biological functions, the notebook conducts pathway enrichment analysis.

GSEA and pyGSEA: It performs pre-ranked Gene Set Enrichment Analysis (GSEA) using the GO:BP (Gene Ontology: Biological Process) database.

NES Differences: The code quantifies differences between P1 and P2 at the pathway level by calculating the difference in Normalized Enrichment Scores (NES)

[ ]:

# list to collect per-guide GSEA results
gsea_results = []

# cache valid guides to avoid recomputing
valid_guides = adata.obs.loc[
    adata.obs["guide_id"] != "non-targeting_Control", "guide_id"
].drop_duplicates().values

# list of perturbed genes
perturbations = adata.obs.loc[
    adata.obs["guide_id"] != "non-targeting_Control", "perturbation"
].drop_duplicates().values

for gene in perturbations:
    print(gene)

    gene_AP = gene + "_AP"
    gene_MP = gene + "_MP"

    # skip if no AP guide for this gene
    if gene_AP not in valid_guides:
        continue

    # ---------- MP GUIDE ----------
    # get full DE table (NO pval/logFC cutoff for GSEA)
    MP_rank = sc.get.rank_genes_groups_df(adata, group=gene_MP, pval_cutoff=None)

    # make a 2-column ranking like your STIM example:
    #   [names, logfoldchanges]
    mp_rnk = MP_rank.loc[:, ["names", "logfoldchanges"]].dropna()

    # run prerank
    pre_res_mp = gp.prerank(
        rnk=mp_rnk,                 # DataFrame with [gene, score]
        gene_sets="GO_Biological_Process_2021",      # or your favourite collection
        outdir=None,                # don't write files
        seed=123
    )

    # grab the result table
    mp_res_df = pre_res_mp.res2d.copy()
    mp_res_df["Target_Gene"] = gene
    mp_res_df["Guide"] = gene_MP
    # classify direction by NES sign
    mp_res_df["Direction"] = np.where(mp_res_df["NES"] > 0, "Up", "Down")
    gsea_results.append(mp_res_df)

    # ---------- AP GUIDE ----------
    AP_rank = sc.get.rank_genes_groups_df(adata, group=gene_AP, pval_cutoff=None)
    ap_rnk = AP_rank.loc[:, ["names", "logfoldchanges"]].dropna()

    pre_res_ap = gp.prerank(
        rnk=ap_rnk,
        gene_sets="GO_Biological_Process_2021",
        outdir=None,
        seed=123
    )

    ap_res_df = pre_res_ap.res2d.copy()
    ap_res_df["Target_Gene"] = gene
    ap_res_df["Guide"] = gene_AP
    ap_res_df["Direction"] = np.where(ap_res_df["NES"] > 0, "Up", "Down")
    gsea_results.append(ap_res_df)

gsea_results.head()

[ ]:

if len(gsea_results) > 0:
    gsea_results_df = pd.concat(gsea_results, ignore_index=True)
else:
    gsea_results_df = pd.DataFrame()
successful_guides = adata.obs["guide_id"][adata.obs["successfulKD"]=='True'].unique()

[ ]:

#filter for only genes in the whippet siuccessful KD
gsea_results_df = gsea_results_df[gsea_results_df["Guide"].isin(successful_guides)].reset_index(drop=True)
# Clean up pathway names a bit (optional)
gsea_results_df["Pathway"] = gsea_results_df["Term"].str.replace(r" Homo sapiens.*", "", regex=True)
gsea_results_df["Condition"] = gsea_results_df["Guide"].apply(lambda x: "P1" if "_MP" in x else "P2")

# Filter to significant pathways
sig = gsea_results_df[(gsea_results_df["NOM p-val"] < 0.05) & (gsea_results_df["FDR q-val"] < 0.25)].copy()

# Keep only pathways that appear in ≥2 target genes (shared)
pathway_counts = sig.groupby("Pathway")["Target_Gene"].nunique()
shared_pathways = pathway_counts[pathway_counts >= 2].index
sig_shared = sig[sig["Pathway"].isin(shared_pathways)].copy()

# This is your “significant shared list”
sig_shared.to_csv("significant_shared_pathways.csv", index=False)
print(sig_shared.head())
#save sig
sig.to_excel(loc+"figures/significant_genes.xlsx")

          Name                                               Term      ES  \
4075   prerank  positive regulation of potassium ion transport...  0.7394
8155   prerank  positive regulation of cellular component move... -0.9718
8160   prerank  negative regulation of endopeptidase activity ... -0.9330
10185  prerank  positive regulation of biosynthetic process (G...  0.7262
16299  prerank  positive regulation of phospholipase activity ... -0.9546

          NES NOM p-val FDR q-val FWER p-val Tag % Gene %  \
4075   1.8924    0.0025    0.2356     0.2740  4/16  5.02%
8155  -1.6374    0.0000    0.0601     0.0600  3/15  0.59%
8160  -1.6019    0.0064    0.1928     0.4290  5/20  2.34%
10185  1.9967    0.0000    0.0651     0.0590  5/25  3.00%
16299 -1.6885    0.0050    0.1416     0.1250  3/15  1.78%

                                   Lead_genes Target_Gene     Guide Direction  \
4075              GABBR2;ADRA2A;KCNMB4;KCNMB3         SP1    SP1_MP        Up
8155                          TWIST1;CCR7;LYN       TGIF1  TGIF1_MP      Down
8160   SERPINB8;RECK;SERPINI1;SPINK2;SERPINE2       TGIF1  TGIF1_MP      Down
10185        MLXIPL;CREB3L1;SORBS1;SIRT4;WNT4       TGIF1  TGIF1_AP        Up
16299                        FLT1;NTRK3;AGTR1        P4HB   P4HB_MP      Down

                                                 Pathway Condition
4075   positive regulation of potassium ion transport...        P1
8155   positive regulation of cellular component move...        P1
8160   negative regulation of endopeptidase activity ...        P1
10185  positive regulation of biosynthetic process (G...        P2
16299  positive regulation of phospholipase activity ...        P1

[59]:

#use pathway_counts to count the number of pathways with the number fo genes
pathway_counts_count = pathway_counts.value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(y=pathway_counts_count.values, x=pathway_counts_count.index, palette="viridis")
plt.ylabel("Number of Gene Ontology Biological Process Pathways")
plt.xlabel("Number of Pseudobulk Perturbations per Pathway")
plt.tight_layout()
plt.show()

INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
/var/folders/r_/bqnt2f_d6919cz7v1yzx2ykc0000gn/T/ipykernel_22880/1666908462.py:5: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=pathway_counts_count.values, x=pathway_counts_count.index, palette="viridis")
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_25_1.png

[61]:

#find the average difference NES between P1 and P2 for each gene
nes_diff_list = []
for gene in sig["Target_Gene"].drop_duplicates().values:
    gene_df = sig[sig["Target_Gene"] == gene]
    # separate P1 and P2
    p1_df = gene_df[gene_df["Condition"] == "P1"]
    p2_df = gene_df[gene_df["Condition"] == "P2"]

    #just calculate thge mean NES difference
    mean_nes_p1 = p1_df["NES"].mean()
    mean_nes_p2 = p2_df["NES"].mean()
    nes_diff = mean_nes_p1 - mean_nes_p2
    nes_diff_list.append({
        "Target_Gene": gene,
        "Mean_NES_P1": mean_nes_p1,
        "Mean_NES_P2": mean_nes_p2,
        "NES_Difference": nes_diff
    })
nes_diff_df = pd.DataFrame(nes_diff_list)
print(nes_diff_df)
#plot the NES difference
plt.figure(figsize=(8, 6))
#order difference nes_diff_df
nes_diff_df = nes_diff_df.sort_values("NES_Difference", ascending=False)
#remove genes with nan values
nes_diff_df = nes_diff_df[nes_diff_df["NES_Difference"].notna()]
sns.barplot(data=nes_diff_df, x="Target_Gene", y="NES_Difference", palette="coolwarm")
plt.axhline(0, color='black', linestyle='--')
plt.xlabel("Target Gene")
plt.ylabel("Mean NES Difference (P1 - P2)")
plt.title("Average NES Difference Between P1 and P2 Across Shared Pathways")
plt.xticks(rotation=45)
plt.tight_layout()
#save as a pdf
plt.savefig(loc+"figures/mean_NES_difference_P1_P2.pdf")
plt.show()

   Target_Gene  Mean_NES_P1  Mean_NES_P2  NES_Difference
0          SP1       1.8865          NaN             NaN
1        TGIF1      -1.6060       1.9967         -3.6027
2         P4HB      -1.6885          NaN             NaN
3        NCOR2          NaN      -1.6075             NaN
4       JARID2      -1.4687      -1.6524          0.1836
5         ESR1       1.9448       0.7594          1.1854
6        PA2G4      -1.6153          NaN             NaN
7        GPBP1          NaN       1.9496             NaN
8        DHX30          NaN      -0.2122             NaN
9         SAFB          NaN      -1.4168             NaN
10        CHD8          NaN      -1.5998             NaN
11       SIN3A      -0.9079      -1.7137          0.8058
12       PSMC5      -1.7132       1.8945         -3.6077
13      GTF2F1       1.9189      -1.5914          3.5103
14        ADAR      -0.8903          NaN             NaN
15       NR2F2      -1.5661          NaN             NaN
16        BZW1       1.0284      -1.5607          2.5891
17         SET      -0.4655          NaN             NaN
18     MYBBP1A       1.8961      -1.6009          3.4971
19        CUX1       1.9624       2.0318         -0.0695
20         NBN          NaN       1.8661             NaN
21        CALR       1.5437          NaN             NaN
22       STAG2          NaN      -1.6416             NaN
23         PKM       1.2322      -1.5604          2.7926
24       BRIP1       0.7132      -1.5030          2.2162
25       GATA3          NaN      -1.4402             NaN

/var/folders/r_/bqnt2f_d6919cz7v1yzx2ykc0000gn/T/ipykernel_22880/2520638475.py:27: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=nes_diff_df, x="Target_Gene", y="NES_Difference", palette="coolwarm")

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_26_2.png

[ ]:

#run for a single gene
for gene_of_interest in nes_diff_df["Target_Gene"].values:
    gsea_esr1_df,pre_res_ap, pre_res_mp = run_gsea_for_gene(adata, gene_of_interest)
    #add together res2d from pre_res_ap and pre_res_mp
    pre_res_ap.res2d["Promoter"] = "P2"
    pre_res_mp.res2d["Promoter"] = "P1"
    combined_res2d = pd.concat([pre_res_ap.res2d, pre_res_mp.res2d], ignore_index=True)
    from gseapy.scipalette import SciPalette
    sci = SciPalette()
    NbDr = sci.create_colormap()
    #add new column name 'Adjusted P-value' from FDR q-val
    combined_res2d["Adjusted P-value"] = combined_res2d["NOM p-val"]
    combined_res2d["Direction"] = np.where(combined_res2d["NES"] > 0, "Up", "Down")
    combined_res2d["Promoter_Direction"] = combined_res2d["Promoter"] + "_" + combined_res2d["Direction"]
    combined_res2d = combined_res2d[(combined_res2d["NOM p-val"] < 0.05) & (combined_res2d["FDR q-val"] < 0.25)].copy()
    ax = gp.dotplot(combined_res2d,figsize=(3,5),
                    x='Promoter_Direction',
                    x_order = ["P1_Up","P1_Down","P2_Up","P2_Down"],
                    title=f'GSEA Dotplot for {gene_of_interest}',
                    cmap = NbDr.reversed(),
                    size=3,
                    show_ring=True)
    ax.set_xlabel("")
    #make the size larger for plt.savefig
    plt.gcf().set_size_inches(6,10)
    plt.savefig(f"gsea_dotplot_{gene_of_interest}.pdf")
    #save plot
    plt.show()

[ ]:

../../_images/scripts_3_analysis_scripts_8_perturbseq_differentialexpression_28_0.png

[23]:

all_comp=pd.DataFrame(pd.concat(comp_gene_list))
all_comp.columns=MP.columns
all_comp["Promoter"]=all_comp["Guide"].str.split("_").str.get(1)
all_comp["Target_Gene"]=all_comp["Guide"].str.split("_").str.get(0)
all_comp_mean=all_comp.groupby("Target_Gene").agg({'logfoldchanges': 'mean'})
all_comp_mean.reset_index(inplace=True)
all_comp_mean.columns=["Target_Gene","logfoldchanges_mean"]
all_comp=all_comp.merge(all_comp_mean,on="Target_Gene")
all_comp_mean=all_comp.groupby("Target_Gene").agg({'Target_Gene':"count"}).rename(columns={"Target_Gene":"Count"})
all_comp_mean.reset_index(inplace=True)
all_comp=all_comp.merge(all_comp_mean,on="Target_Gene")
all_comp=all_comp.sort_values(by="Guide")
all_comp.to_csv(loc+"files/singlecell_shortread_analysis/differential_exp_list_full.csv")

[ ]:

all["Mutual_Exclusivity"]=all["Overlap_Gene_Num"]/(all["MP_Gene_Num"]+all["AP_Gene_Num"])

n_term=pd.read_table("alt-prom-crispr-fiveprime/reference/all_pivot_simple_nterm.txt")
n_term=all.merge(n_term, right_on="Gene_symbol",left_on="Target_Gene")

n_term=n_term[["MP_Gene_Num","AP_Gene_Num","Overlap_Gene_Num","Mutual_Exclusivity","Ensembl_ID","Gene_symbol","Nterminus_Change"]]
# all_comp_pivot
all_comp_all=all_comp.merge(n_term, left_on="Target_Gene", right_on="Gene_symbol" ,how="outer")
all_comp_all=all_comp_all.sort_values("Nterminus_Change",ascending=True)