import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import mygene
from fuzzywuzzy import fuzz


##Columns required for output
sgRNA_output_columns=['gene', 'distance', 'protospacer_A', 'protospacer_B', 'target_A', 'target_B']
single_sgRNA_output_columns=['type','gene', 'protospacer_A', 'sgID']
single_sgRNA_output_columns_wloc=['type','gene', 'protospacer_A', 'sgID','chr','start','end','strand']

#Read in the GI sheet
repogle_sgRNA_GI=pd.read_excel("./sgRNA_repogle_distance/sgRNA_repogle.xlsx",sheet_name="Supplementary Table 4",skiprows=1)

#Read in hCRISPRi negative controls and the sgRNA read counts/growth phenotypes
hCRISPRi=pd.read_excel("./hCRISPRi_Weissman/TableS3_hCRISPRiv2_libraries.xlsx", sheet_name="hCRISPRi-v2")
hCRISPRi_performance=pd.read_excel("./hCRISPRi_Weissman/TableS7_sgRNAreadcounts_growthphenotypes_hCRISPRi-v2_K562.xlsx",skiprows=1)
hCRISPRi_performance=hCRISPRi_performance[['sgId','gamma, ave_Rep1_Rep2']]

#Get the top guides with the lowest gamma value
hCRISPRi=hCRISPRi.merge(hCRISPRi_performance,right_on="sgId",left_on="sgID",how="left")
hCRISPRi["absGAMMA"]=hCRISPRi['gamma, ave_Rep1_Rep2'].abs()
hCRISPRi.to_csv("./hCRISPRi_Weissman/hCRISPRi_growthphenotypes.csv")

#Get the negative control
negcntrls_hCRISPRi=hCRISPRi[hCRISPRi["gene"]=="negative_control"]
negcntrls_hCRISPRi=negcntrls_hCRISPRi.sort_values("absGAMMA", ascending=True)


#Use the sgRNA repogle single guide 
negative_controls= repogle_sgRNA_GI[sgRNA_output_columns][(( repogle_sgRNA_GI["gene"]=="NegCtrl3" ) & ( repogle_sgRNA_GI["gene_B"]=="NegCtrl4")) | (( repogle_sgRNA_GI["gene"]=="NegCtrl4" ) & ( repogle_sgRNA_GI["gene_B"]=="NegCtrl3" ))]


dataframe=negcntrls_hCRISPRi
#Use the guides that have the lowest abs(gamma) from CRISPRi
output=[]

for num in range(0,19):
    num_2=((num*2)+1)
    distance="NaN"
    index=list(dataframe["gene"])[num]
    protospacerA=list(dataframe["protospacer_A"])[num]
    protospacerB=list(dataframe["protospacer_A"])[num_2]
    target_A=list(dataframe["sgID"])[num]
    target_B=list(dataframe["sgID"])[num_2]
    row = [index,distance, protospacerA,protospacerB,target_A,target_B]
    output.append(list(row))

negative_controls = pd.DataFrame(output, columns=sgRNA_output_columns).append(negative_controls)
print(negative_controls.head())

               gene distance         protospacer_A         protospacer_B  \
0  negative_control      NaN  GTAAAGTTGTCGGCAATTGC  GCGCTGGTAAAGGCGGAAAG   
1  negative_control      NaN  GCGCTGGTAAAGGCGGAAAG  GGATGGGCCCTGTTAATGCC   
2  negative_control      NaN  GTCTTCGCGCGGAAGGTCAC  GGCCGAGGTCACCGCAGCAT   
3  negative_control      NaN  GGATGGGCCCTGTTAATGCC  GAGTGTCGCCATCGCGGCAA   
4  negative_control      NaN  GCTACCCCTGATTGCCAGCC  GCTCCGACCCCCGAGATGAG   

              target_A             target_B  
0  non-targeting_00715  non-targeting_00373  
1  non-targeting_00373  non-targeting_02295  
2  non-targeting_03351  non-targeting_01500  
3  non-targeting_02295  non-targeting_02771  
4  non-targeting_00202  non-targeting_00303


###Check if single promoter using CAGE IDR 
##Extract list of genes with 1 TSS
CAGE=pd.read_csv("./CAGE_TSS_TPM/CAGErefTSS.txt")
CAGE_list_onepromoter=CAGE["ensembl.gene"].value_counts()[CAGE["ensembl.gene"].value_counts()==1].index

###Calculate TPM_calculations using stringtie using TPN_count.sh
##Read in
TPM_REP1=pd.read_csv("./CAGE_TSS_TPM/170223_BSF--000Aligned.sortedByCoord.out_abundance.tsv",sep="\t")
TPM_REP2=pd.read_csv("./CAGE_TSS_TPM/170223_BSF--001Aligned.sortedByCoord.out_abundance.tsv",sep="\t")
TPM_REP3=pd.read_csv("./CAGE_TSS_TPM/170223_BSF--012Aligned.sortedByCoord.out_abundance.tsv",sep="\t")

TPM=TPM_REP1.merge(TPM_REP2, on=['Gene ID', 'Gene Name', 'Reference', 'Strand','Start', 'End'],how="inner").merge(TPM_REP3, on=['Gene ID', 'Gene Name', 'Reference', 'Strand','Start', 'End'],how="inner")
TPM.columns=['Ensembl_ID', 'Gene Name', 'Reference', 'Strand', 'Start', 'End',
       'Coverage_REP1', 'FPKM_REP1', 'TPM_REP1', 'Coverage_REP2', 'FPKM_REP2', 'TPM_REP2', 'Coverage_REP3', 'FPKM_REP3', 'TPM_REP3']

##Calculate the mean TPM and filter for genes with greatr than 10
TPM["Mean_TPM"]=TPM[["TPM_REP1","TPM_REP2","TPM_REP3"]].mean(axis=1)
TPM_GREATER_THAN10=TPM["Ensembl_ID"][TPM["Mean_TPM"]> 10]

##Extract TOPBP1 sgRNA sequence from repogle
##the repogle guide is the same as the top ones found in the weissman list!!! So no need to do any of this 
# repogle_sgRNA_GI=pd.read_excel("./sgRNA_repogle_distance/sgRNA_repogle.xlsx",sheet_name="Supplementary Table 4",skiprows=1)
# listpos_genes=["TOPBP1"]
# repogle_sgRNA_POS=repogle_sgRNA_GI[single_sgRNA_output_columns][repogle_sgRNA_GI["gene"].isin(listpos_genes)].groupby("gene").head(n=1)

hCRISPRi_sgRNA_POS=hCRISPRi.sort_values("absGAMMA", ascending=False)

mg = mygene.MyGeneInfo()
#https://docs.mygene.info/en/latest/doc/query_service.html
#The list of available fields
##Merge for known gene ID 
##Finding the ensembl ID for all of the gene names
t=mg.querymany(hCRISPRi_sgRNA_POS["gene"].drop_duplicates(), fields='ensembl.gene', scopes='symbol', as_dataframe=True);
t["Gene_symbol"]=t.index
t_sub=t[["ensembl.gene","Gene_symbol","_score"]].dropna()
t_sub=t_sub.loc[t_sub['ensembl.gene'].str.startswith("ENSG"), ('ensembl.gene','_score','Gene_symbol')]
t_sub=t_sub.sort_values("_score",ascending=False).groupby("Gene_symbol").head(n=1)



##Filter for 1 TSS with gage
hCRISPRi_sgRNA_POS= hCRISPRi_sgRNA_POS.merge(t_sub, right_on="Gene_symbol", left_on="gene", how='left')
hCRISPRi_sgRNA_POS=hCRISPRi_sgRNA_POS[(hCRISPRi_sgRNA_POS["ensembl.gene"].isin(CAGE_list_onepromoter) )]

# ###TPM greater than 10
hCRISPRi_sgRNA_POS=hCRISPRi_sgRNA_POS[hCRISPRi_sgRNA_POS["ensembl.gene"].isin(TPM_GREATER_THAN10) ] 
##Extract top gamma genes from hCRISPRi 2.1 
##These are the top genes that are also do not have alt 5' on vast db
listpos_genes=["RPL31" ,"TOPBP1","ATF5","GINS1","SNRPD2"]

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-18906...done.
Finished.
17934 input query terms found dup hits:
	[('DNAJC19', 10), ('MED30', 10), ('NEDD1', 10), ('NOP2', 10), ('UBA1', 10), ('CAD', 10), ('INTS2', 1
552 input query terms found no hit:
	['C3orf17', 'CASC5', 'LPPR2', 'negative_control', 'SGOL1', 'MRP63', 'MINOS1-NBL1', 'C14orf166', 'C9o
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


hCRISPRi_sgRNA_POS_list=hCRISPRi_sgRNA_POS[hCRISPRi_sgRNA_POS["gene"].isin(listpos_genes)].append(hCRISPRi[hCRISPRi["gene"]=="TOPBP1"].sort_values("absGAMMA", ascending=False))
hCRISPRi_sgRNA_POS_list=hCRISPRi_sgRNA_POS_list.groupby("gene").head(n=2)
hCRISPRi_sgRNA_POS_list=hCRISPRi_sgRNA_POS_list[single_sgRNA_output_columns]
hCRISPRi_sgRNA_POS_list["type"]="positive_control"

fasta=[]
##Create a fasta file for all of the files 
for index,row in hCRISPRi_sgRNA_POS_list.iterrows(): 
    fasta.append((">"+row[3]))
    fasta.append((row[2]))

fasta_file = pd.DataFrame(fasta)
fasta_file.to_csv("./annotations/dual_guide_positive_controls.fa",sep="\t",index=None, header=None)

#os.system("makeblastdb -in /Users/helenking/Desktop/reference/GRCh38.primary_assembly.genome.fa -dbtype nucl")
##run blat to find the location of each of the sequences
os.system('blastn -word_size 11 -num_alignments 1 -num_descriptions 1 -outfmt "6 qseqid sseqid sstart send sstrand" -query ./annotations/dual_guide_positive_controls.fa -db /Users/helenking/Desktop/reference/GRCh38.primary_assembly.genome.fa -out ./annotations/dualguide_positive_controls_blat.txt')


hCRISPRi_sgRNA_POS_list_blat=pd.read_table("./annotations/dualguide_positive_controls_blat.txt",header=None, names=["sgID","chr","start","end","strand_let"])

hCRISPRi_sgRNA_POS_list_blat["strand"]=np.where(hCRISPRi_sgRNA_POS_list_blat["strand_let"]=="plus","+","-")
hCRISPRi_sgRNA_POS_list=hCRISPRi_sgRNA_POS_list_blat.merge(hCRISPRi_sgRNA_POS_list,on="sgID").drop(["strand_let"],axis=1)
#ensure same columns 
hCRISPRi_sgRNA_POS_list=hCRISPRi_sgRNA_POS_list[single_sgRNA_output_columns_wloc]
hCRISPRi_sgRNA_POS_list.to_csv("./dual_guide/hCRISPRi_sgRNA_list.txt")


##For two more sgRNAs targeting positive controls
##Need to extract region of TSS and then run FlashFry
TSS_CRISPRi=pd.read_csv("./hCRISPRi_Weissman/TSS_annotations_CRISPRi_hg38.sorted.bed", sep="\t",header=None)

TSS_CRISPRi.columns=["chr","TSSstart","TSSend","Promoter","Genesymbol","strand"]
TSS_CRISPRi_POS=TSS_CRISPRi[TSS_CRISPRi["Genesymbol"].isin(listpos_genes)].groupby("Genesymbol").head(1)

TSS_CRISPRi_POS["Start_target"]=np.where(TSS_CRISPRi_POS["strand"]=='+',( TSS_CRISPRi_POS["TSSstart"] -00 ), (TSS_CRISPRi_POS["TSSstart"]-300))
TSS_CRISPRi_POS["End_target"]= np.where(TSS_CRISPRi_POS["strand"]=='+', (TSS_CRISPRi_POS["TSSstart"]+300), (TSS_CRISPRi_POS["TSSstart"] +00))


TSS_CRISPRi_POS["chr"]=TSS_CRISPRi_POS["chr"].str.split("chr").str.get(1)
TSS_CRISPRi_POS["Coord"]=TSS_CRISPRi_POS["chr"].astype("str")+":"+TSS_CRISPRi_POS["Start_target"].astype("str")+"-"+TSS_CRISPRi_POS["End_target"].astype("str")

TSS_CRISPRi_POS[["Coord","Genesymbol"]].to_csv("./dual_guide/flashfry_positive.txt", index=None, header=None, sep=",")

###Find only
print(TSS_CRISPRi_POS)


######WHY IS THERE AN ERROR WITH THE INDEX FILE!!!! I even did md5sum w/ reference file!! and I NEVER do that
#lol realised it was chr #klassic
##Run Flashfry on for tests
%run ./scripts/KRISPR_kreator/KRISPR_kreator.py flashfry --input_coords "./dual_guide/flashfry_positive.txt" --outputfile "./dual_guide/flashfry_positive_output.txt" --chromHMM_file "./annotations/MCF7_PROM_CHROHMM.bed" --snps_txtfile "./variant_calling_GATK/sites.txt" --snps_output "./variant_calling_GATK/sites_extracted_positive.txt"  --fiveprime "" --threeprime "" --U6_add ""

      chr   TSSstart     TSSend Promoter Genesymbol strand  Start_target  \
15802  19   45691915   45691968     P1P2     SNRPD2      -      45691615   
16051  19   49928875   49928960     P1P2       ATF5      +      49928875   
17428   2  101002267  101002299     P1P2      RPL31      +     101002267   
18918  20   25407657   25407736     P1P2      GINS1      +      25407657   
21827   3  133661852  133661869     P1P2     TOPBP1      -     133661552   

       End_target                  Coord  
15802    45691915   19:45691615-45691915  
16051    49929175   19:49928875-49929175  
17428   101002567  2:101002267-101002567  
18918    25407957   20:25407657-25407957  
21827   133661852  3:133661552-133661852  
If you are reading this you are not a total bozo - at least you have provided all the parameters
SNRPD2

ATF5


RPL31


GINS1


TOPBP1


os.system("zsh -i ./scripts/CRISRDO_sgRNA_identifier_file.sh ./dual_guide/flashfry_positive.txt ./dual_guide/crisprdo_positive_output.txt")

0


%run ./scripts/KRISPR_kreator/KRISPR_kreator.py filter --flashfry_output "./dual_guide/flashfry_positive_output.txt" --hCRISPRi_input "./hCRISPRi_Weissman/hCRISPRi_growthphenotypes.csv" --flashfry_output_top_subset_df_file "./dual_guide/positiveflashfry_subset.txt" --flashfry_output_top_df_file "./dual_guide/positiveflashfry.txt" --previouslyfound "./dual_guide/hCRISPRi_sgRNA_list.txt" --listgenes True --label "positive_control" --number_sgRNAs 2 --crisprdo_file "./dual_guide/crisprdo_positive_output.txt"

positive_controls=pd.read_csv("./dual_guide/positiveflashfry_subset.txt",sep="\t").append(hCRISPRi_sgRNA_POS_list)

If you are reading this you are not a total bozo - at least you have provided all the parameters
Number of RE sites: nan    337
Name: RE_Site, dtype: int64
(337, 27)
check found previously False    322
True      15
Name: foundpreviously, dtype: int64
(12, 9) shape of previouisly found df
True    232
Name: foundcrisprdo, dtype: int64
(0, 10) shape of crisprdo found df
<bound method IndexOpsMixin.value_counts of GINS1     2
SNRPD2    2
RPL31     2
TOPBP1    2
ATF5      2
Name: GeneSymbol, dtype: int64>
(232, 30)
   contig     start      stop                   target  \
2      19  45691658  45691681  ATAACGCCCTGGGCTGGGCGCGG   
3      19  45691663  45691686  ATCAGATAACGCCCTGGGCTGGG   
4      19  45691664  45691687  TATCAGATAACGCCCTGGGCTGG   
5      19  45691668  45691691  GCTTTATCAGATAACGCCCTGGG   
6      19  45691669  45691692  AGCTTTATCAGATAACGCCCTGG   

                               context overflow orientation  \
2  TATCAGATAACGCCCTGGGCTGGGCGCGGGGGCTC       OK         RVS   
3  AGCTTTATCAGATAACGCCCTGGGCTGGGCGCGGG       OK         RVS   
4  CAGCTTTATCAGATAACGCCCTGGGCTGGGCGCGG       OK         RVS   
5  GGGACAGCTTTATCAGATAACGCCCTGGGCTGGGC       OK         RVS   
6  GGGGACAGCTTTATCAGATAACGCCCTGGGCTGGG       OK         RVS   

   Doench2014OnTarget  Moreno-Mateos2015OnTarget dangerous_GC  ...  \
2            0.434767                   0.534582         NONE  ...   
3            0.050833                   0.462022         NONE  ...   
4            0.091358                   0.190618         NONE  ...   
5            0.628303                   0.559415         NONE  ...   
6            0.143891                   0.032904         NONE  ...   

  AggregateRankedScore_topX otCount       Oligo_FWD_direction  \
2                       NaN     565  GATAACGCCCTGGGCTGGGCGCGG   
3                       NaN      83  GATCAGATAACGCCCTGGGCTGGG   
4                       NaN      78  GTATCAGATAACGCCCTGGGCTGG   
5                       1.0      43   GCTTTATCAGATAACGCCCTGGG   
6                       NaN      38  GAGCTTTATCAGATAACGCCCTGG   

    Oligo_reverse_direction  SNPs_Number GeneSymbol  RE_Site  \
2  CCGCGCCCAGCCCAGGGCGTTATC            0     SNRPD2      nan   
3  CCCAGCCCAGGGCGTTATCTGATC            0     SNRPD2      nan   
4  CCAGCCCAGGGCGTTATCTGATAC            0     SNRPD2      nan   
5   CCCAGGGCGTTATCTGATAAAGC            0     SNRPD2      nan   
6  CCAGGGCGTTATCTGATAAAGCTC            0     SNRPD2      nan   

  foundinCRISPRiv2.1  foundpreviously  foundcrisprdo  
2              False            False           True  
3              False            False           True  
4              False            False           True  
5              False            False           True  
6              False            False           True  

[5 rows x 30 columns]


df=positive_controls
df.index=df["gene"]

###Reset them so the first then third guide is added to the dual guide.
output=[]
dualguides_per_gene=2
alphabet=['a','b','c','d','e','f']

for index,row in df.iterrows():
    df_subset=df[df["gene"]==index]
    for num in range(0,dualguides_per_gene,1):
        # print(num)
        coord_2=num+2
        # print(index)
        # distance=(df_subset["end"].iloc[coord_2])-(df_subset["start"].iloc[num])
        if df_subset["strand"][num]=="+":
            distance=abs(df_subset["end"].iloc[num]-df_subset["start"].iloc[coord_2])
        else:
            distance=abs(df_subset["end"].iloc[num]-df_subset["start"].iloc[coord_2])
        
        protospacer_A=list(df_subset["protospacer_A"])[num]
        protospacer_B=list(df_subset["protospacer_A"])[coord_2]
        target_A="sg"+index+alphabet[(num*2)]
        target_B="sg"+index+alphabet[(num*2)+1]
        row = [index,distance, protospacer_A,protospacer_B,target_A,target_B]
        output.append(list(row))

positive_controls = pd.DataFrame(output, columns=sgRNA_output_columns).drop_duplicates()


####Run Integration integration_pacbiotss_proactiv_published_only.ipynb which will have the input coords for flashfry
#Run Flashfry on candidates
%run ./scripts/KRISPR_kreator/KRISPR_kreator.py flashfry --input_coords "./dual_guide/flashfry.txt" --outputfile "./dual_guide/flashfry_output.txt" --chromHMM_file "./annotations/MCF7_PROM_CHROHMM.bed" --snps_txtfile "./variant_calling_GATK/sites.txt" --snps_output "./variant_calling_GATK/sites_extracted.txt" --fiveprime "" --threeprime "" --U6_add ""

If you are reading this you are not a total bozo - at least you have provided all the parameters
STAU2

GPBP1


PSMC5


ESR1


SRSF5


STAG2


TPD52L1


PRRC2C


APP


HNRNPU


ADAR


RBM47


YWHAZ


NR2F2


SP1


P4HB


NCOR2


RC3H2


PKM


XRCC5


CHD8


NBN


GATA3


CCND1


CHD4


FHL2


NFE2L2


SET


UPF3B


GTF2F1


SMARCA4


DHX30


MYBBP1A


BRIP1


SLTM


TP53


ARNT


SAFB


ZNF3


EEF2


CTNNB1


STAT3


AHCYL1


PA2G4


KDM2A


TGIF1


CALR


NAP1L1


CSNK1E


CUX1


SIN3A


NME2


ZNF217


BZW1


JARID2


os.system("zsh -i ./scripts/CRISRDO_sgRNA_identifier_file.sh ./dual_guide/flashfry.txt ./dual_guide/crisprdo_NP_output.txt")

0


%run ./scripts/KRISPR_kreator/KRISPR_kreator.py filter --flashfry_output "./dual_guide/flashfry_output.txt" --hCRISPRi_input "./hCRISPRi_Weissman/hCRISPRi_growthphenotypes.csv" --flashfry_output_top_subset_df_file "./dual_guide/MPflashfry_subset.txt" --flashfry_output_top_df_file "./dual_guide/MPflashfry_output.txt" --label "MP" --crisprdo_file "./dual_guide/crisprdo_NP_output.txt"

If you are reading this you are not a total bozo - at least you have provided all the parameters
Number of RE sites: nan      3519
BstXI       4
BsmBI       2
Name: RE_Site, dtype: int64
(3519, 27)
False    1297
True      252
Name: foundcrisprdo, dtype: int64
(494, 10) shape of crisprdo found df
<bound method IndexOpsMixin.value_counts of ESR1       4
STAG2      4
NR2F2      4
BZW1       4
GPBP1      4
DHX30      4
PRRC2C     4
STAT3      4
CALR       4
P4HB       4
SET        4
TGIF1      4
FHL2       4
ZNF217     4
NBN        4
ARNT       4
SLTM       4
AHCYL1     4
JARID2     4
CTNNB1     4
ADAR       4
XRCC5      4
YWHAZ      4
RC3H2      4
PSMC5      4
SIN3A      4
NAP1L1     4
CHD4       4
RBM47      4
GTF2F1     4
UPF3B      4
PKM        4
HNRNPU     4
BRIP1      4
TP53       4
APP        4
SRSF5      4
SP1        4
NFE2L2     4
GATA3      4
ZNF3       4
SMARCA4    4
SAFB       4
CCND1      4
TPD52L1    4
NME2       4
CUX1       4
MYBBP1A    4
CHD8       4
EEF2       4
PA2G4      4
CSNK1E     4
KDM2A      4
STAU2      4
NCOR2      4
Name: GeneSymbol, dtype: int64>
(1549, 31)
   contig     start      stop                   target  \
19      8  73747217  73747240  GGACGAGGAGACTGCGGACCGGG   
20      8  73747218  73747241  GGGACGAGGAGACTGCGGACCGG   
35      8  73747257  73747280  GCCCACCCTTGGCAGACGAGGGG   
36      8  73747258  73747281  GGCCCACCCTTGGCAGACGAGGG   
37      8  73747259  73747282  AGGCCCACCCTTGGCAGACGAGG   

                                context overflow orientation  \
19  CGCCGGGGACGAGGAGACTGCGGACCGGGCGCGCT       OK         RVS   
20  GCGCCGGGGACGAGGAGACTGCGGACCGGGCGCGC       OK         RVS   
35  CCCCAGGCCCACCCTTGGCAGACGAGGGGGCGGGG       OK         RVS   
36  GCCCCAGGCCCACCCTTGGCAGACGAGGGGGCGGG       OK         RVS   
37  TGCCCCAGGCCCACCCTTGGCAGACGAGGGGGCGG       OK         RVS   

    Doench2014OnTarget  Moreno-Mateos2015OnTarget dangerous_GC  ... otCount  \
19            0.055895                   0.651484         NONE  ...     196   
20            0.079149                   0.569365         NONE  ...     182   
35            0.414708                   0.478278         NONE  ...      85   
36            0.373005                   0.465587         NONE  ...     286   
37            0.074858                   0.578442         NONE  ...     284   

         Oligo_FWD_direction   Oligo_reverse_direction SNPs_Number  \
19   GGACGAGGAGACTGCGGACCGGG   CCCGGTCCGCAGTCTCCTCGTCC           0   
20   GGGACGAGGAGACTGCGGACCGG   CCGGTCCGCAGTCTCCTCGTCCC           0   
35   GCCCACCCTTGGCAGACGAGGGG   CCCCTCGTCTGCCAAGGGTGGGC           0   
36   GGCCCACCCTTGGCAGACGAGGG   CCCTCGTCTGCCAAGGGTGGGCC           0   
37  GAGGCCCACCCTTGGCAGACGAGG  CCTCGTCTGCCAAGGGTGGGCCTC           0   

    GeneSymbol RE_Site  foundinCRISPRiv2.1 GC_content  polyN  foundcrisprdo  
19       STAU2     nan               False   0.739130      0          False  
20       STAU2     nan               False   0.739130      0          False  
35       STAU2     nan                True   0.565217      0          False  
36       STAU2     nan                True   0.565217      0          False  
37       STAU2     nan                True   0.565217      0          False  

[5 rows x 31 columns]


MP_flashfry=pd.read_csv("./dual_guide/MPflashfry_subset.txt",sep="\t")
MP_flashfry["gene"]=MP_flashfry["gene"]+"_MP"

MP_flashfry=MP_flashfry[MP_flashfry["gene"]!="MRPS5"]
df=MP_flashfry
df.index=df["gene"]

###Reset them so the first then third guide is added to the dual guide.
output=[]
dualguides_per_gene=2
alphabet=['a','b','c','d','e','f']

for index,row in df.iterrows():
    df_subset=df[df["gene"]==index]
    for num in range(0,dualguides_per_gene,1):
        # print(num)
        coord_2=num+2
        # print(index)
        # distance=(df_subset["end"].iloc[coord_2])-(df_subset["start"].iloc[num])
        if df_subset["strand"][num]=="+":
            distance=abs(df_subset["end"].iloc[num]-df_subset["start"].iloc[coord_2])
        else:
            distance=abs(df_subset["end"].iloc[num]-df_subset["start"].iloc[coord_2])
        
        protospacer_A=list(df_subset["protospacer_A"])[num]
        protospacer_B=list(df_subset["protospacer_A"])[coord_2]
        target_A="sg"+index+alphabet[(num*2)]
        target_B="sg"+index+alphabet[(num*2)+1]
        row = [index,distance, protospacer_A,protospacer_B,target_A,target_B]
        output.append(list(row))

MP_flashfry = pd.DataFrame(output, columns=sgRNA_output_columns).drop_duplicates()


#Run Flashfry on candidates
%run ./scripts/KRISPR_kreator/KRISPR_kreator.py flashfry --input_coords "./dual_guide/APflashfry.txt" --outputfile "./dual_guide/AP_flashfry_output.txt" --chromHMM_file "./annotations/MCF7_PROM_CHROHMM.bed" --snps_txtfile "./variant_calling_GATK/sites.txt" --snps_output "./variant_calling_GATK/sites_extracted.txt" --fiveprime "" --threeprime "" --U6_add ""

If you are reading this you are not a total bozo - at least you have provided all the parameters
STAU2

GPBP1


PSMC5


ESR1


SRSF5


STAG2


TPD52L1


PRRC2C


APP


HNRNPU


ADAR


RBM47


YWHAZ


NR2F2


SP1


P4HB


NCOR2


RC3H2


PKM


XRCC5


CHD8


NBN


GATA3


CCND1


CHD4


FHL2


NFE2L2


SET


UPF3B


GTF2F1


SMARCA4


DHX30


MYBBP1A


BRIP1


SLTM


TP53


ARNT


SAFB


ZNF3


EEF2


CTNNB1


STAT3


AHCYL1


PA2G4


KDM2A


TGIF1


CALR


NAP1L1


CSNK1E


CUX1


SIN3A


NME2


ZNF217


BZW1


JARID2


os.system("zsh -i ./scripts/CRISRDO_sgRNA_identifier_file.sh ./dual_guide/APflashfry.txt ./dual_guide/crisprdo_AP_output.txt")

0


%run ./scripts/KRISPR_kreator/KRISPR_kreator.py filter --flashfry_output "./dual_guide/AP_flashfry_output.txt" --hCRISPRi_input "./hCRISPRi_Weissman/hCRISPRi_growthphenotypes.csv" --flashfry_output_top_subset_df_file "./dual_guide/APflashfry_output_subset.txt" --flashfry_output_top_df_file "./dual_guide/APflashfry_output.txt" --label "AP" --crisprdo "./dual_guide/crisprdo_AP_output.txt"


AP_flashfry=pd.read_csv("./dual_guide/APflashfry_output_subset.txt",sep="\t")

If you are reading this you are not a total bozo - at least you have provided all the parameters
Number of RE sites: nan      2241
BstXI       4
Name: RE_Site, dtype: int64
(2241, 27)
False    1217
True      146
Name: foundcrisprdo, dtype: int64
(194, 10) shape of crisprdo found df
<bound method IndexOpsMixin.value_counts of ESR1       4
TGIF1      4
NBN        4
PA2G4      4
NAP1L1     4
NR2F2      4
BZW1       4
KDM2A      4
DHX30      4
PRRC2C     4
STAT3      4
CALR       4
SIN3A      4
FHL2       4
SAFB       4
GPBP1      4
ZNF217     4
SLTM       4
AHCYL1     4
JARID2     4
CTNNB1     4
ADAR       4
XRCC5      4
YWHAZ      4
RC3H2      4
PSMC5      4
CHD4       4
STAG2      4
RBM47      4
ZNF3       4
UPF3B      4
PKM        4
HNRNPU     4
ARNT       4
TP53       4
APP        4
P4HB       4
SRSF5      4
SP1        4
SET        4
GATA3      4
NFE2L2     4
STAU2      4
SMARCA4    4
TPD52L1    4
NME2       4
MYBBP1A    4
CUX1       4
CCND1      4
CHD8       4
EEF2       4
GTF2F1     4
BRIP1      4
CSNK1E     4
NCOR2      4
Name: GeneSymbol, dtype: int64>
(1363, 31)
  contig     start      stop                   target  \
0      8  73582542  73582565  AAATGGCAGTCTGTCTGCAATGG   
1      8  73582600  73582623  CCAGCTATTCTGTCAGTTCCAGG   
2      8  73582600  73582623  CCTGGAACTGACAGAATAGCTGG   
3      8  73582615  73582638  GTTCCAGGAAAGCAAACACCAGG   
4      8  73582618  73582641  AGTCCTGGTGTTTGCTTTCCTGG   

                               context overflow orientation  \
0  TGTATTAAATGGCAGTCTGTCTGCAATGGCTTAAC       OK         FWD   
1  AGTTTTCCAGCTATTCTGTCAGTTCCAGGAAAGCA       OK         FWD   
2  TGCTTTCCTGGAACTGACAGAATAGCTGGAAAACT       OK         RVS   
3  CTGTCAGTTCCAGGAAAGCAAACACCAGGACTCCT       OK         FWD   
4  CAAAGGAGTCCTGGTGTTTGCTTTCCTGGAACTGA       OK         RVS   

   Doench2014OnTarget  Moreno-Mateos2015OnTarget dangerous_GC  ... otCount  \
0            0.092817                   0.515983         NONE  ...     220   
1            0.035143                   0.419056         NONE  ...     193   
2            0.367871                   0.371760         NONE  ...     201   
3            0.533675                   0.388051         NONE  ...     327   
4            0.009039                   0.284143         NONE  ...     354   

        Oligo_FWD_direction   Oligo_reverse_direction  SNPs_Number  \
0  GAAATGGCAGTCTGTCTGCAATGG  CCATTGCAGACAGACTGCCATTTC            0   
1  GCCAGCTATTCTGTCAGTTCCAGG  CCTGGAACTGACAGAATAGCTGGC            0   
2  GCCTGGAACTGACAGAATAGCTGG  CCAGCTATTCTGTCAGTTCCAGGC            0   
3   GTTCCAGGAAAGCAAACACCAGG   CCTGGTGTTTGCTTTCCTGGAAC            0   
4  GAGTCCTGGTGTTTGCTTTCCTGG  CCAGGAAAGCAAACACCAGGACTC            0   

   GeneSymbol RE_Site  foundinCRISPRiv2.1 GC_content  polyN  foundcrisprdo  
0       STAU2     nan               False   0.565217      0          False  
1       STAU2     nan               False   0.391304      0          False  
2       STAU2     nan               False   0.608696      0          False  
3       STAU2     nan               False   0.652174      0          False  
4       STAU2     nan               False   0.347826      0          False  

[5 rows x 31 columns]


##Note that MRPS5 even with widened gap the AP shows only 1 promoter that is 
AP_flashfry=AP_flashfry[AP_flashfry["gene"]!="MRPS5"]
AP_flashfry["gene"]=AP_flashfry["gene"]+"_AP"
df=AP_flashfry
df.index=df["gene"]

###Reset them so the first then third guide is added to the dual guide.
output=[]
dualguides_per_gene=2
alphabet=['a','b','c','d','e','f']

for index,row in df.iterrows():
    df_subset=df[df["gene"]==index]
    for num in range(0,dualguides_per_gene,1):
        # print(num)
        coord_2=num+2
        # print(index)
        # distance=(df_subset["end"].iloc[coord_2])-(df_subset["start"].iloc[num])
        if df_subset["strand"][num]=="+":
            distance=abs(df_subset["end"].iloc[num]-df_subset["start"].iloc[coord_2])
        else:
            distance=abs(df_subset["end"].iloc[num]-df_subset["start"].iloc[coord_2])
        
        protospacer_A=list(df_subset["protospacer_A"])[num]
        protospacer_B=list(df_subset["protospacer_A"])[coord_2]
        target_A="sg"+index+alphabet[(num*2)]
        target_B="sg"+index+alphabet[(num*2)+1]
        row = [index,distance, protospacer_A,protospacer_B,target_A,target_B]
        output.append(list(row))

AP_flashfry = pd.DataFrame(output, columns=sgRNA_output_columns).drop_duplicates()


##append to singleRNA w/negative control
##append positive control x2
##append MP 
##append AP
sgRNA_candidates=negative_controls.append(positive_controls
).append(MP_flashfry).append(AP_flashfry)


##CR3cs1/CR1cs1
# With PCR Adapter
sgRNA_candidates["sgRNA_w_const_PCR"]="TCACAACTACACCAGAAGCCACCTTGTTGG"+sgRNA_candidates["protospacer_A"]+"GTTTCAGAGCGAGACGTGTTTGATCTCGGGCCGTCTCAGAAACATGG"+sgRNA_candidates["protospacer_B"]+"GTTTAAGAGCTAAGCTGGCAACACTTTGACGAAGA"

sgRNA_candidates.to_excel("./altprom_dualguide_fullexp/sgRNA_candidates.xlsx")


{
    "tags": [
        "hide_input",
    ]
}

sns.set_theme(style="whitegrid", palette="muted")

sgRNA_candidates["category"]=sgRNA_candidates["gene"].str.split("_").str.get(1)

# Draw a categorical scatterplot to show each observation
ax = sns.boxplot(data=sgRNA_candidates[sgRNA_candidates["category"]!="control"], x="distance", y="category",palette="ch:rot=-.25,hue=1,light=.75")
ax.set(ylabel="Promoter Type", xlabel="Distance Between Two Protospacer")

[Text(0, 0.5, 'Promoter Type'),
 Text(0.5, 0, 'Distance Between Two Protospacer')]


{
    "tags": [
        "hide_input",
    ]
}

AP_flashfry=pd.read_csv("./dual_guide/APflashfry_output.txt",sep="\t")
AP_flashfry["type"]="AP"
MP_flashfry=pd.read_csv("./dual_guide/MPflashfry_output.txt",sep="\t")
MP_flashfry["type"]="MP"

P_flashfry_output=MP_flashfry.append(AP_flashfry)

P_flashfry_output[["type","found"]].value_counts()

type  found             
MP    foundinCRISPRiv2.1    136
AP    foundcrisprdo          65
MP    foundcrisprdo          49
AP    foundinCRISPRiv2.1     44
dtype: int64


{
    "tags": [
        "hide_input",
    ]
}

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 6))

plot_flashfry=P_flashfry_output[["Doench2014OnTarget","Moreno-Mateos2015OnTarget","GC_content"]].melt()

ax = sns.boxplot(x="variable", y="value", data=plot_flashfry, whis=np.inf, palette="ch:rot=-.25,hue=1,light=.75")
ax = sns.swarmplot(x="variable", y="value", data=plot_flashfry, color=".2", palette="ch:rot=-.5,hue=1,light=.4")

/Users/helenking/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 30.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)


blat_fasta=[]

for index,row in sgRNA_candidates.iterrows():
    inp_list=list((">",row["target_A"],"_",row["distance"]))
    name_out=''.join(map(str, inp_list)) 
    blat_fasta.append(name_out)
    blat_fasta.append((row["protospacer_A"]))
    inp_list=list((">",row["target_B"],"_",row["distance"]))
    name_out=''.join(map(str, inp_list)) 
    blat_fasta.append(name_out)
    blat_fasta.append((row["protospacer_B"]))

# blat_fasta
blat_fasta=pd.DataFrame(blat_fasta)
blat_fasta.drop_duplicates().to_csv("./dual_guide/fasta_files/finalguide.fa", index=None, header=None)


blat_fasta=[]

for index,row in sgRNA_candidates.iterrows():
    if row["gene"] == "negative_control":
        continue
    else:
        inp_list=list((">",row["target_A"],"_",row["distance"]))
        name_out=''.join(map(str, inp_list)) 
        blat_fasta.append(name_out)
        blat_fasta.append((row["protospacer_A"]))
        inp_list=list((">",row["target_B"],"_",row["distance"]))
        name_out=''.join(map(str, inp_list)) 
        blat_fasta.append(name_out)
        blat_fasta.append((row["protospacer_B"]))

blat_fasta=pd.DataFrame(blat_fasta)
blat_fasta.drop_duplicates().to_csv("./dual_guide/fasta_files/finalguide.fa", index=None, header=None)

Type of control	% of sgRNAs	# of Genes	# of sgRNAs
Positive		5	10
Negative		Non-Targeting	25
Full KD Genes	45	118	118
Promoter KD Genes	45	118	118

Design sgRNA dual guides for MP targets, AP targets, positive and negative controls¶