3C_whippet_pseudobulk

Calculating the TPM values for the P1 & P2 promoters and then calculating the knockdown efficiency for off-target and on-target targets, as well as generating visualisations of the data.

The provided R code serves as the core validation engine, quantifying transcript-level changes to prove that CRISPR interference (CRISPRi) can selectively repress one promoter without affecting others in the same gene.

Isoform Quantification: It utilises Whippet outputs to quantify expression in Transcripts Per Million (TPM), allowing for a high-resolution view of which specific promoter is active in which cell.
On-Target vs. Off-Target Precision: It calculates the percentage knockdown for transcripts initiated by the targeted promoter (On-target) versus transcripts from untargeted promoters of the same gene (Off-target)
Quality Filtering: The script establishes strict criteria for “Successful KD” (e.g., P2 knockdown < 40%) and ensures that background expression in non-targeting control (NTC) populations is sufficient (NTC TPM ≥ 0.5) to make reliable conclusions.

Load libraries and define functions.

Define the function and the files.

rel_dir <- '../../'
dir <- '../../files/whippet_dedup'
files <- list.files(dir, pattern = ".gz", full.names = TRUE)
table_name <- file.path(dir, "all_data.txt")


# Print the list of files
print(files[1:3])

## [1] "../../files/whippet_dedup/ADAR_AP.deduplicated.isoform.tpm.gz"  
## [2] "../../files/whippet_dedup/ADAR_MP.deduplicated.isoform.tpm.gz"  
## [3] "../../files/whippet_dedup/AHCYL1_AP.deduplicated.isoform.tpm.gz"

Read through entire whippet output and create gene table if it does not exist. Here the off-target and on-target P1 and P2 knockdown percentages are calculated. Other information such as the transcripts assigned to the P1 anad P2 promoters, the distance from the TSS and the negative control TPMs are also calculated. Finally, boolean values for if the KD is significant, if there is a significant NTC count (> 0.5 TPM in negative control population), if KD is looser (less than the 85th percentile) or strict (both P1 and P2 < 0% KD) are also calculated.

#if gene_table.csv exists then skip 
if (file.exists(paste0(dir,"/gene_table.csv"))){
  print("gene_table.csv exists")
  #load the gene_table.csv
  gene_table <- read.csv(paste0(dir,"/gene_table.csv")) 
  } else {
    print("gene_table.csv does not exist")   
  # Read or create the table
  if (file.exists(table_name)) {
    all_data <- read.table(table_name, header = TRUE, sep = "\t", row.names = NULL)
  } else {
    #map the fiunction process to every element in list of files and merge the otuputs to simplest df use applyr
    all_data <- lapply(files, process_file)
    #flatten and merge on isforom column
    all_data <- Reduce(function(x, y) merge(x, y, by = "Isoform", all = TRUE), all_data)
    #instead of cbind use merge on Isoform
    write.table(all_data, table_name, sep = "\t", row.names = FALSE, quote = FALSE)
  }
  
  #tx2gene has list of transcripts and their associated genes 
  tx2gene_loc <- "files/kallisto/transcriptome/tx2gene.txt"
  tx2gene <- read_delim(tx2gene_loc, delim = "\t", col_names = c("transcript_id", "gene_id","gene_name","transcript_name","chromosome","start","end","strand"))
  print(sum((tx2gene$start - tx2gene$end) > 0))
  tx2gene$transcript_index <- 1:nrow(tx2gene)
  tx2gene$tss <- ifelse(tx2gene$strand == "+", tx2gene$start, tx2gene$end)
  
  
  #read in the cells that are assigned, place into metadata find
  #read in the bed file which conrtains p1 and p2 locations
  bed_loc <- "cellranger_input/candidate_AP_MP.bed"
  bed <- read_delim(bed_loc, delim = "\t", col_names = c("chromosome", "start", "end", "name", "strand"))
  print(sum((bed$start - bed$end) > 0))
  bed$tss <- ifelse(bed$strand == "+", bed$start, bed$end)
  #do an example for esr1 mp and ap subpopulations
  #split _ and get last 
  bed$promoter_type <- str_split(bed$name, "_") %>% map_chr(~last(.x))
  #get the second to last element
  bed$gene_name <- str_split(bed$name, "_") %>% map_chr(~.x[length(.x)-1])
  bed$promoter_gene <- paste0(bed$gene_name,"_",bed$promoter_type)
  #check the number of gene name prior to merge
  print(length(unique(bed$gene_name)))
  print(length(unique(bed$promoter_gene)))
  #merge the dataframe on gene name 
  
  #merge the bed and tx2gene on gene name
  bed <- merge(bed, tx2gene, by = "gene_name", suffixes = c("_target","_assigned"), how="inner")
  print(length(unique(bed$gene_name)))
  print(length(unique(bed$promoter_gene)))
  #filter for for rows that only have 
  #calculate column that has the distance from the tss
  bed$distance <- abs(bed$tss_target - bed$tss_assigned)
  #filter for rows that have a distance of less than 1000
  bed <- bed %>% group_by(transcript_id) %>% mutate(min_distance = distance == min(distance))
  print(length(unique(bed$transcript_id))==length(bed$transcript_id[bed$min_distance == TRUE]))
  ggplot(bed) + 
    labs(x = "Distance from TSS", y = "Count") + #add denistty plot
    geom_density(aes(x=distance, color=min_distance), alpha = 0.5)
  
  #then find the numbers that assign to those transcripts
  #get the number from the matrix number for those 
  bed <- bed %>% filter(min_distance == TRUE)
  #only keep genes with both P1 and P2
  all_data_names <- stringr::str_split_i(colnames(all_data), "Tpm_", 2) 
  #use these transcripts 
  bed <- bed %>% group_by(gene_name) %>% filter(promoter_gene %in% unique(all_data_names)) %>% filter(n_distinct(promoter_type) == 2)
  bed <- bed %>% filter(distance < 1000)
  #split transcript ID by . and get the first element name column ISoform
  bed$Isoform <- str_split_fixed(bed$transcript_id, "\\.", 2)[,1]

  #merge the bed and all_data on Isoform
  all_data <- merge(all_data, bed, by = "Isoform")
  head(all_data)
  
  #create empty table gene_table with columsn for both mp and ap
  gene_table_ontarget <- data.frame(gene = character(), percent_knockdown_mp = numeric(), distance_mp = numeric(), Isoform_mp = character(), transcript_name_mp = character(), neg_ctrl_tpm_mp= character(), percent_knockdown_ap = numeric(), distance_ap = numeric(), Isoform_ap = character(), transcript_name_ap = character(), neg_ctrl_tpm_ap= character(), stringsAsFactors = FALSE)
  gene_table_offtarget <- data.frame(gene = character(), percent_knockdown_mp = numeric(), distance_mp = numeric(), Isoform_mp = character(), transcript_name_mp = character(),  neg_ctrl_tpm_mp= character(),percent_knockdown_ap = numeric(), distance_ap = numeric(), Isoform_ap = character(), transcript_name_ap = character(),  neg_ctrl_tpm_ap= character(),stringsAsFactors = FALSE)
  gene_table_total <- data.frame(gene = character(), percent_knockdown_mp_total = numeric(), percent_knockdown_ap = numeric(), stringsAsFactors = FALSE)
  #loop through every gene 
  #unique gene names
  for (gene in unique(all_data$gene_name)){
    print(gene)
    gene_prom_mp <- paste0(gene, "_MP")
    gene_prom_mp_tpm <- paste0("Tpm_", gene_prom_mp)
    gene_prom_ap <- paste0(gene, "_AP")
    gene_prom_ap_tpm <- paste0("Tpm_", gene_prom_ap)
    negative_column <- "Tpm_Negative"
    #subset the data for the gene
    #firrst for main promoter
    gene_data_mp <- all_data %>% filter(promoter_gene == gene_prom_mp) 
    #then for the alternative promoter
    gene_data_ap <- all_data %>% filter(promoter_gene == gene_prom_ap)
    #check if all the sum(gene_data_mp[[gene_prom_mp_tpm]]) or sum(gene_data_mp[[gene_prom_ap_tpm]]) for every column is greater than the minimum filter tpm
    # if (( > filter) & (sum(gene_data_ap[[negative_column]]) > filter) ){
    #   print("skipped")
    #   next
    # }
    print(sum(gene_data_mp[[negative_column]]))
    print(sum(gene_data_ap[[negative_column]]))
    #calculate the % knockdown between negative control and ESR1_MP population by summing the Tpm and then dividing by the negative control
    #fill NA wuth 0 in gene_data_ap[[gene_prom_ap_tpm]] and gene_data_mp[[gene_prom_mp_tpm]] and negative control and gene_data_mp[[gene_prom_ap_tpm]]
    total_mp <- sum(gene_data_mp[[gene_prom_mp_tpm]]) + sum(gene_data_mp[[gene_prom_ap_tpm]])
    total_ap <- sum(gene_data_ap[[gene_prom_mp_tpm]]) + sum(gene_data_ap[[gene_prom_ap_tpm]])
    total_percentage_knockdown_mp <- (total_mp - sum(gene_data_mp[[negative_column]]))/sum(gene_data_mp[[negative_column]])*100
    total_percentage_knockdown_ap <- (total_ap - sum(gene_data_ap[[negative_column]]))/sum(gene_data_ap[[negative_column]])*100
    gene_data_mp_percent_knockdown_mp <- ( sum(gene_data_mp[[gene_prom_mp_tpm]])-sum(gene_data_mp[[negative_column]]))/sum(gene_data_mp[[negative_column]])*100
    gene_data_ap_percent_knockdown_ap <- ( sum(gene_data_ap[[gene_prom_ap_tpm]])-sum(gene_data_ap[[negative_column]]))/sum(gene_data_ap[[negative_column]])*100
    gene_data_mp_percent_knockdown_ap <- ( sum(gene_data_mp[[gene_prom_ap_tpm]])-sum(gene_data_mp[[negative_column]]))/sum(gene_data_mp[[negative_column]])*100
    gene_data_ap_percent_knockdown_mp <- ( sum(gene_data_ap[[gene_prom_mp_tpm]])-sum(gene_data_ap[[negative_column]]))/sum(gene_data_ap[[negative_column]])*100
    # whenever sum(gene_data_mp[[gene_prom_mp_tpm]]) == sum(gene_data_mp[[negative_column]])) make the value equal 0
    gene_data_mp_percent_knockdown_mp[sum(gene_data_mp[[gene_prom_mp_tpm]]) == sum(gene_data_mp[[negative_column]])] <- 0
    gene_data_ap_percent_knockdown_ap[sum(gene_data_ap[[gene_prom_ap_tpm]]) == sum(gene_data_ap[[negative_column]])] <- 0
    gene_data_mp_percent_knockdown_ap[sum(gene_data_mp[[gene_prom_ap_tpm]]) == sum(gene_data_mp[[negative_column]])] <- 0
    gene_data_ap_percent_knockdown_mp[sum(gene_data_ap[[gene_prom_mp_tpm]]) == sum(gene_data_ap[[negative_column]])] <- 0
    
   
    gene_list_kd <- list(gene,total_percentage_knockdown_mp,total_percentage_knockdown_ap)
    gene_list_ontarget <- list(gene, sum(gene_data_mp[[gene_prom_mp_tpm]]),
                               gene_data_mp_percent_knockdown_mp,median(gene_data_mp$distance),str_flatten_comma(gene_data_mp$Isoform),str_flatten_comma(gene_data_mp$transcript_name),sum(gene_data_mp[[negative_column]]), 
                               sum(gene_data_ap[[gene_prom_ap_tpm]]),
                               gene_data_ap_percent_knockdown_ap,median(gene_data_ap$distance),str_flatten_comma(gene_data_ap$Isoform),str_flatten_comma(gene_data_ap$transcript_name),sum(gene_data_ap[[negative_column]]))
    gene_list_offtarget <- list(gene, sum(gene_data_mp[[gene_prom_ap_tpm]]),
                                gene_data_mp_percent_knockdown_ap,median(gene_data_mp$distance),str_flatten_comma(gene_data_mp$Isoform),str_flatten_comma(gene_data_mp$transcript_name),sum(gene_data_mp[[negative_column]]), 
                                sum(gene_data_ap[[gene_prom_mp_tpm]]),
                                gene_data_ap_percent_knockdown_mp,median(gene_data_ap$distance),str_flatten_comma(gene_data_ap$Isoform),str_flatten_comma(gene_data_ap$transcript_name),sum(gene_data_ap[[negative_column]]))
    #add list to table keep column names
    print(gene_prom_ap_tpm)
    gene_table_ontarget <- rbind(gene_table_ontarget, gene_list_ontarget)
    gene_table_offtarget <- rbind(gene_table_offtarget, gene_list_offtarget)
    gene_table_total <- rbind(gene_table_total, gene_list_kd)
  }
  
  #change gene_table names 
  column_names <- c("gene", "sum_MP","percent_knockdown_MP", "distance_MP", "Isoform_MP", "transcript_name_MP", "neg_ctrl_tpm_MP",
                    "sum_AP","percent_knockdown_AP", "distance_AP", "Isoform_AP", "transcript_name_AP","neg_ctrl_tpm_AP")
  colnames(gene_table_ontarget) <-column_names
  colnames(gene_table_offtarget) <- column_names
  colnames(gene_table_total) <- c("gene", "percent_knockdown_MP_total", "percent_knockdown_AP_total")
  gene_table <- merge(gene_table_ontarget, gene_table_offtarget, by = "gene", suffixes = c("_ontarget", "_offtarget"))
  gene_table <- merge(gene_table, gene_table_total, by = "gene")
  head(gene_table)
  #plot the data
  gene_table <- as.data.frame(gene_table)
  dim(gene_table)
  gene_table$successfulKD <- ifelse(gene_table$percent_knockdown_AP_ontarget < 40,TRUE,FALSE)
  quantile_mp_on <- quantile(gene_table$percent_knockdown_MP_ontarget, c(0.1,0.9))
quantile_ap_on <- quantile(gene_table$percent_knockdown_AP_ontarget, c(0.1,0.85))

gene_table$sig_ntc_count <- ifelse(gene_table$neg_ctrl_tpm_MP_ontarget >= 0.5 & gene_table$neg_ctrl_tpm_AP_ontarget >= 0.5,TRUE,FALSE)
gene_table$looser_KD <-  ifelse((  (gene_table$percent_knockdown_AP_ontarget  < quantile_ap_on[2])), TRUE,FALSE)
gene_table$strict_KD <- ifelse(((gene_table$percent_knockdown_MP_ontarget < 0 )&  (gene_table$percent_knockdown_AP_ontarget  < 0) ), TRUE,FALSE)

  #save gene_table as a file
  write.csv(gene_table,paste0(dir,"/gene_table.csv"))
  }

## [1] "gene_table.csv exists"

Create heatmap of the percent knockdown for both promoters with background of on-target and off-target effects.

gene_table_heatmap <-gene_table %>%
 dplyr::select(gene,percent_knockdown_MP_ontarget, percent_knockdown_AP_ontarget, percent_knockdown_MP_offtarget, percent_knockdown_AP_offtarget,percent_knockdown_MP_total,percent_knockdown_AP_total) 
#put column to rowname
rownames(gene_table_heatmap) <- gene_table_heatmap$gene
gene_table_heatmap <- gene_table_heatmap %>% dplyr::select(-gene)
breaks2 <- seq(-100, 100, length.out = 150)


pheatmap(gene_table_heatmap %>% 
           dplyr::select(percent_knockdown_MP_ontarget,percent_knockdown_MP_offtarget,percent_knockdown_MP_total),
         cluster_rows = TRUE, cluster_cols = FALSE, 
         show_rownames = TRUE, show_colnames = TRUE, main = "P1", 
         fontsize = 8, cellwidth = 8, cellheight = 8 , scale ="none" ,cutree_rows=5 ,colorRampPalette(brewer.pal(9,"Spectral"))(150),
         breaks = breaks2)

#explain the number present 
print(paste0("Number of genes in heatmap: ", nrow(gene_table_heatmap %>% 
           dplyr::select(percent_knockdown_MP_ontarget,percent_knockdown_MP_offtarget,percent_knockdown_MP_total) %>% drop_na()) ))

## [1] "Number of genes in heatmap: 42"

Repeat with P2-targeted guides. This is to show the difference in knockdown efficiency between the two promoters.

breaks2 <- seq(-100, 100, length.out = 150)

pheatmap(gene_table_heatmap %>% 
           dplyr::select(percent_knockdown_AP_ontarget,percent_knockdown_AP_offtarget,percent_knockdown_AP_total) %>% drop_na(),
         cluster_rows = TRUE, cluster_cols = FALSE, 
         show_rownames = TRUE, show_colnames = TRUE, main = "P2", 
         fontsize = 8, cellwidth = 8, cellheight = 10 , scale ="none" ,cutree_rows=5 ,colorRampPalette(brewer.pal(9,"Spectral"))(150),
         breaks = breaks2)

Comparison of the boolean assignments, looser KD, strict KD and significant NTC expression.

ggven_plot <- gene_table[c("looser_KD","strict_KD","sig_ntc_count")] %>%
  dplyr::rename(
    "Successful \n KD" = looser_KD,
    "Strict \n Successful KD" = strict_KD,
    "NTC Count\n >= 0.5 TPM" = sig_ntc_count
  )

venn_diagram <- ggvenn(ggven_plot, show_stats="cp", text_size=3)
#save venndiagram 
ggsave(paste0(rel_dir,"figures/venn_diagram.pdf"), plot = venn_diagram, width = 10, height = 10, units = "cm")
venn_diagram

Visualisation of the knockdown percentages for both promoters and both on-target and off-target effects.

gene_table_melt <-  gene_table %>% 
   filter((looser_KD ==TRUE)) %>%
  melt(id.vars = c("gene","neg_ctrl_tpm_AP_ontarget","neg_ctrl_tpm_MP_ontarget"),
                                       measure.vars = c("percent_knockdown_MP_ontarget", "percent_knockdown_AP_ontarget", 
                                                        "percent_knockdown_MP_offtarget", "percent_knockdown_AP_offtarget"))
gene_table_melt$target <- ifelse(grepl("ontarget", gene_table_melt$variable), "ontarget", "offtarget")
gene_table_melt$promoter <- ifelse(grepl("MP", gene_table_melt$variable), "P1", "P2")
ggplot(gene_table_melt) +
  geom_boxplot(aes(x=variable, y=value, fill=variable)) + 
  theme_minimal() + labs(x = "Promoter", y = "Percent Knockdown") + #rotate 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Plot the knockdown percentages for both promoters split by on-target and off-target for all 35 genes with NTC > 0.5.

a <- gene_table_melt %>% filter(promoter=="P1") %>%
  ggviolin( x = "target", y = "value",fill = "target",add = "boxplot",    ylab = "Percent Knockdown", xlab = "Promoter", 
           palette = c(palletee1[1],palletee1[2]),add.params = list(fill = "white") ) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  stat_compare_means( paired = TRUE) 
b <- gene_table_melt %>% filter(promoter=="P2") %>%
  ggviolin( x = "target", y = "value",fill = "target",add = "boxplot",    ylab = "Percent Knockdown", xlab = "Promoter", 
            palette = c(palletee2[1],palletee2[2]) ,add.params = list(fill = "white") ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  stat_compare_means( paired = TRUE) 

plot_grid(a, b,  ncol = 1)

Visualisation of the knockdown percentages for both promoters and both on-target and off-target effects.

gene_table_melt <-  gene_table %>% 
   filter((looser_KD ==TRUE) & (sig_ntc_count==TRUE) ) %>%
  melt(id.vars = c("gene","neg_ctrl_tpm_AP_ontarget","neg_ctrl_tpm_MP_ontarget"),
                                       measure.vars = c("percent_knockdown_MP_ontarget", "percent_knockdown_AP_ontarget", 
                                                        "percent_knockdown_MP_offtarget", "percent_knockdown_AP_offtarget"))
gene_table_melt$target <- ifelse(grepl("ontarget", gene_table_melt$variable), "ontarget", "offtarget")
gene_table_melt$promoter <- ifelse(grepl("MP", gene_table_melt$variable), "P1", "P2")
ggplot(gene_table_melt) +
  geom_boxplot(aes(x=variable, y=value, fill=variable)) + 
  theme_minimal() + labs(x = "Promoter", y = "Percent Knockdown") + #rotate 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Plot the knockdown percentages for both promoters split by on-target and off-target of the 31 genes that show “successful KD” which removes the top 15 quantiles.

a <- gene_table_melt %>% filter(promoter=="P1") %>%
  ggviolin( x = "target", y = "value",fill = "target",add = "boxplot",    ylab = "Percent Knockdown", xlab = "Promoter", 
           palette = c(palletee1[1],palletee1[2]),add.params = list(fill = "white") ) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  stat_compare_means( paired = TRUE) 
b <- gene_table_melt %>% filter(promoter=="P2") %>%
  ggviolin( x = "target", y = "value",fill = "target",add = "boxplot",    ylab = "Percent Knockdown", xlab = "Promoter", 
            palette = c(palletee2[1],palletee2[2]) ,add.params = list(fill = "white") ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  stat_compare_means( paired = TRUE) 

plot_grid(a, b,  ncol = 1)

Plot the knockdown percentages against the negative control TPM for both promoters split by on-target and off-target.

p<-ggscatter(gene_table_melt %>% filter(target=="ontarget"), x = "neg_ctrl_tpm_MP_ontarget", y = "value",
         color = "promoter", line.color = "gray", line.size = 0.4,
         palette = "npg")+
  ggtitle("P1") + theme_minimal() + labs(y = "Percent Knockdown") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

a <-ggscatter(gene_table_melt %>% filter(target=="ontarget"), x = "neg_ctrl_tpm_MP_ontarget", y = "value",
         color = "promoter", line.color = "gray", line.size = 0.4,
         palette = "npg")+
  ggtitle("P1") + theme_minimal() + labs(y = "Percent Knockdown") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


gene_table_melt %>% #filter for value between -100 and 100
  melt(id.vars = c("gene"),measure.vars=c("neg_ctrl_tpm_AP_ontarget","neg_ctrl_tpm_MP_ontarget")) %>%
    unique() %>%
  ggplot(aes(x=value, fill=variable)) + #log scale
  geom_density() +
  labs(x = "Negative Control TPM", y = "Density") + theme_minimal() +
    scale_x_continuous(trans='log10') + scale_fill_manual(values = c("#00AFBB", "#E7B800"))

Plot heatmap of the negative control TPMs for both promoters. Negative control TPMs represent the background expression of the targeted transcripts in the NTC population.

#create heatmap
matrix_ng_cntrl <- gene_table_melt %>% #filter for value between -100 and 100
  dplyr::select( c("gene","neg_ctrl_tpm_MP_ontarget", "neg_ctrl_tpm_AP_ontarget")) %>%
  unique()
rownames(matrix_ng_cntrl) <- matrix_ng_cntrl$gene
matrix_ng_cntrl <- as.matrix(matrix_ng_cntrl %>% dplyr::select(-c("gene")))
#make it so tthis is TPM
matrix_ng_cntrl <- log10(matrix_ng_cntrl + 1)
break2 <- seq(0, max(matrix_ng_cntrl), length.out = 150)

pheatmap(matrix_ng_cntrl, cluster_rows = TRUE, cluster_cols = FALSE,
         show_rownames = TRUE, show_colnames = TRUE, main = "Negative Control",
         fontsize = 8, cellwidth = 8, cellheight = 8 , scale ="none" ,cutree_rows=5 ,colorRampPalette(brewer.pal(9,"YlGnBu"))(150),
         breaks = break2)

Plot the knockdown percentages for both promoters split by on-target only.

gene_table_melt %>% filter(target=="ontarget") %>%
   ggviolin( x = "promoter", y = "value",fill = "promoter",add = "boxplot",    ylab = "Percent Knockdown", xlab = "Promoter", 
                    palette = c(palletee1[1],palletee2[1]),add.params = list(fill = "white") ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

t.test(gene_table$sum_MP_ontarget, gene_table$neg_ctrl_tpm_MP_ontarget, paired = TRUE)

## 
##  Paired t-test
## 
## data:  gene_table$sum_MP_ontarget and gene_table$neg_ctrl_tpm_MP_ontarget
## t = -3.8011, df = 41, p-value = 0.00047
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -270.44973  -82.77884
## sample estimates:
## mean difference 
##       -176.6143

t.test(gene_table$sum_AP_ontarget, gene_table$neg_ctrl_tpm_AP_ontarget, paired = TRUE)

## 
##  Paired t-test
## 
## data:  gene_table$sum_AP_ontarget and gene_table$neg_ctrl_tpm_AP_ontarget
## t = -1.8942, df = 41, p-value = 0.06526
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -30.7019295   0.9828819
## sample estimates:
## mean difference 
##       -14.85952

Calculation of the effect size for the on-target P1 knockdown.

eff_size_mp <- gene_table %>%
  dplyr::filter((looser_KD ==TRUE) & (sig_ntc_count==TRUE) ) %>%
  wilcox_effsize( data = ., formula = sum_MP_ontarget ~ neg_ctrl_tpm_MP_ontarget, paired = TRUE)
eff_size_mp

## # A tibble: 465 × 7
##    .y.             group1 group2 effsize    n1    n2 magnitude
##  * <chr>           <chr>  <chr>    <dbl> <int> <int> <ord>    
##  1 sum_MP_ontarget 1      14.2         1     1     1 large    
##  2 sum_MP_ontarget 1      17.8         1     1     1 large    
##  3 sum_MP_ontarget 1      17.9         1     1     1 large    
##  4 sum_MP_ontarget 1      22           1     1     1 large    
##  5 sum_MP_ontarget 1      25.2         1     1     1 large    
##  6 sum_MP_ontarget 1      30.3         1     1     1 large    
##  7 sum_MP_ontarget 1      45.8         1     1     1 large    
##  8 sum_MP_ontarget 1      46.4         1     1     1 large    
##  9 sum_MP_ontarget 1      59.7         1     1     1 large    
## 10 sum_MP_ontarget 1      68.2         1     1     1 large    
## # ℹ 455 more rows

Another method to confirm base TPM expression in th enegative control population and the proportion of expression.

gene_table_melt_all <-  gene_table %>%
  melt(id.vars = c("gene","neg_ctrl_tpm_AP_ontarget","neg_ctrl_tpm_MP_ontarget","successfulKD"),
       measure.vars = c("percent_knockdown_MP_ontarget", "percent_knockdown_AP_ontarget", 
                        "percent_knockdown_MP_offtarget", "percent_knockdown_AP_offtarget"))

#plot the data negative control tpm
a <- gene_table_melt_all %>% 
  filter(successfulKD==TRUE) %>%
  dplyr::select( c("gene","neg_ctrl_tpm_MP_ontarget", "neg_ctrl_tpm_AP_ontarget")) %>%
  unique() %>%
  melt(id.vars = c("gene"),measure.vars=c("neg_ctrl_tpm_MP_ontarget","neg_ctrl_tpm_AP_ontarget")) %>% #sort P1 to P2
  mutate(promoter_type = ifelse(variable == "neg_ctrl_tpm_MP_ontarget", "P1", "P2")) 

a$promoter_type <- factor(a$promoter_type, levels=c( "P2","P1"))
#change the scientific 

a_plot_a <-  a %>%
  ggplot(aes(y=value, x=promoter_type, fill = promoter_type)) + #log scale
  geom_violin() +
  labs(y = "Sum TPM of Transcripts Under Promoter \n in NTC Population \n in Genes Assigned as Successful KD",x="Targeted Promoter") + yscale("log10")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + #hide legend
  theme(legend.position = "none")+
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) + #chaange labels for x axis
  geom_boxplot(width = 0.2, fill = "white")+ #flip
  coord_flip()  

#plot the data negative control tpm
a <- gene_table_melt_all %>% 
  dplyr::select( c("gene", "successfulKD","neg_ctrl_tpm_MP_ontarget", "neg_ctrl_tpm_AP_ontarget")) %>%
  unique() %>%
  melt(id.vars = c("gene","successfulKD"),measure.vars=c("neg_ctrl_tpm_MP_ontarget","neg_ctrl_tpm_AP_ontarget")) %>% #sort P1 to P2
  mutate(promoter_type = ifelse(variable == "neg_ctrl_tpm_MP_ontarget", "P1", "P2")) 

a$promoter_type <- factor(a$promoter_type, levels=c( "P2","P1"))
# Count observations per group
counts <- a %>%
  group_by(promoter_type, successfulKD) %>%
  summarise(n = n(), .groups = "drop")



a_plot <-  a %>%
  ggplot(aes(y=value, x=promoter_type, fill = successfulKD)) + #log scale
  geom_violin( position = position_dodge(0.9)) +
  labs(y = "Sum TPM of Transcripts Under Promoter \n in NTC Population \n in Genes Assigned as Successful KD",x="Targeted Promoter") + yscale("log10")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + #hide legend
  scale_fill_manual(values = c(palletee1[1], palletee1[2],palletee2[1], palletee2[2])) + #chaange labels for x axis
  geom_boxplot(width = 0.2, position = position_dodge(0.9) )+ #flip
  coord_flip()  +
  geom_text(data = counts, 
            aes(x = promoter_type, y = 10000, label = paste0("n=", n)),
            position = position_dodge(0.9), size = 3) 

#change from scientific notation to visual
options(scipen=999)

p4 <- a %>% #filter for value between -100 and 100
  ggdotchart( x = "gene", y = "value",
              color = "promoter_type",           # change fill color by mpg_level
              palette = c("#00AFBB", "#E7B800"),            # flip 
              orientation = "vertical",  # change the orientation
              yscale="log10" , xlab = "TPM of the Targeted Transcripts \n in NTC Population", 
              ylab="" , 
  ) 

plot_grid(a_plot_a,a_plot, p4, ncol = 1, labels = c("A", "B", "C"))

Plot the cell counts and UMI counts per pseudobulk population to visualise the distribution of these values across all populations.

Plot the sum of TPMs per isoform across all samples to visualise the distribution of expression levels. This is to confirm that the data has enough depth to capture lowly expressed isoforms.

# Read or create the table
if (file.exists(table_name)) {
  all_data <- read.table(table_name, header = TRUE, sep = "\t", row.names = NULL)
} else {
  #map the fiunction process to every element in list of files and merge the otuputs to simplest df use applyr
  all_data <- lapply(files, process_file)
  #flatten and merge on isforom column
  all_data <- Reduce(function(x, y) merge(x, y, by = "Isoform", all = TRUE), all_data)
  #instead of cbind use merge on Isoform
  write.table(all_data, table_name, sep = "\t", row.names = FALSE, quote = FALSE)
}


rowsums <- all_data %>% select(-Isoform) %>% rowSums() %>% as.data.frame() %>%
  rename("Rowsums" = ".") %>% arrange(desc(Rowsums)) %>% #add 1 
  mutate(Rowsums = Rowsums + 1, logRowsums = log10(Rowsums))
rowsums <- rowsums %>%
mutate(index_col = as.integer(rownames(rowsums)), logIndex= log10(index_col))
  
#plot the logrowsums
ggplot(rowsums, aes(x = logIndex, y = logRowsums)) +
  geom_line() +
  theme_minimal() +
  labs(x = "Log10 Index of Isoform (Descending Sum TPM)", y = "log10 Sum Isoform TPM")

3C_whippet_pseudobulk

2025-03-20