This file represents the clinical validation stage of the study. It transitions from cellular experiments to real-world patient data to determine if the alternative promoter usage observed in the lab translates into differences in patient survival rates.
Here is a breakdown of the computational workflow and the clinical insights it provides:
Patient Stratification: The analysis focuses on breast cancer patients, specifically subsetting 415 Luminal A patients and 192 Luminal B patients from the TCGA/GTEx data sources.
Transcript Resolution: Instead of looking at total gene expression, the code specifically tracks transcripts driven by the P1 and P2 promoters of the Estrogen Receptor 1 (ESR1) gene.
Hazard Ratios (HR): The code calculates the Hazard Ratio with a 95% confidence interval. This determines the relative risk of death for patients with high vs. low expression of specific promoter-driven transcripts.
Median Cutoff: Patients are grouped into “High” or “Low” expression categories based on the group median cutoff for overall survival
library(UCSCXenaTools)
## =========================================================================================
## UCSCXenaTools version 1.6.1
## Project URL: https://github.com/ropensci/UCSCXenaTools
## Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
##
## If you use it in published research, please cite:
## Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
## from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
## Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
## =========================================================================================
## --Enjoy it--
library(survival)
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(survminer)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(tibble)
library(reticulate)
## Warning: package 'reticulate' was built under R version 4.4.3
library(ggpubr)
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
##
## get_legend
survivalcurve_transcript <- function(df_expression_tcga,df_survival_sub,transcript_1,highcutoff,lowcutoff,highcol,lowcol,returnfit=FALSE){
# print(transcript_1)
#this is the code that will be used to generate the survival curve
#subset df_expression_tcga for the samples that express transcript 1 and transcript 2 and the paitents in the csurvival data
#check that the transcript was subset
df_survival_sub = df_survival_sub[!duplicated(df_survival_sub),]
#check that the transcripts were oresent
#grep transcripts found in transcript_1 transcript_2
transcript_match <- rownames(df_expression_tcga)[grep(paste(transcript_1,sep="|"),rownames(df_expression_tcga))]
df_subset <- df_expression_tcga %>%
subset(rownames(df_expression_tcga) %in% c(transcript_match))
df_subset <- t(df_subset[colnames(df_subset) %in% rownames(df_survival_sub)])
#merge the survival data with the gene expression data
df <- merge(df_subset,df_survival_sub,by.x = "row.names",by.y = "row.names")
dim(df)
#high and low cutoffs are the quantiles that will be used to split the data
high_df = rownames(df[df[,transcript_match] > quantile(df[,transcript_match],as.numeric(highcutoff)/100),1,drop = F])
low_df = rownames(df[df[,transcript_match] < quantile(df[,transcript_match],as.numeric(lowcutoff)/100),1,drop = F])
high_num = length(high_df)
low_num = length(low_df)
title = "Overall Survival"
df[,"OS.time.month"] = as.numeric(df[,"OS.time"]) %/% 30 + 1
#assign the class to the high and low groups
df[rownames(df) %in% high_df,"CLASS"] = "high"
df[rownames(df) %in% low_df,"CLASS"] = "low"
df$CLASS = factor(df$CLASS,levels = c("low","high"))
#count the number of high and low table(df$CLASS)
#if no high or low then output
if (length(high_df) == 0 | length(low_df) == 0){
if (returnfit == TRUE){
return("No high or low groups found and returnfit is TRUE")
}
if (returnfit == FALSE){
list( transcript=transcript_match ,HR_cox=NA, pval_cox=NA, ci_cox=NA,p.val = NA, high_num = high_num, low_num = low_num, high_cutoff = highcutoff, low_cutoff = lowcutoff) -> summary
return(summary)
}
}
else{
#if the class is not present in the data then return NA
color = c(lowcol,highcol)
#make df$OSDAY numeric
#create a survival object first entry is the time and the status (dead/alive)
mod = Surv(df$OS.time.month,df$OS)
#fit it with the class of the survival data
mfit = survfit(mod~df$CLASS)
sur = survdiff(mod~df$CLASS)
p.val <- 1 - pchisq(sur$chisq, length(sur$n) - 1)
p.val = signif(p.val,2)
fit <- survfit(Surv(OS.time.month, OS) ~ CLASS, data = df)
results_coxph = coxph(mod~df$CLASS)%>%
tbl_regression(exp = TRUE)
#extract the hazard risk
results_coxph_hr = results_coxph$table_body$estimate[3]
results_coxph_hr_p = results_coxph$table_body$p.value[3]
#confidence interval
results_coxph_conf = results_coxph$table_body$ci[3]
#change and space to _
results_coxph_conf <- gsub(" ","_",results_coxph_conf)
#remvoe ,
results_coxph_conf <- gsub(",","",results_coxph_conf)
#summary of the survival curve and add to dataframe
list( transcript=transcript_match ,HR_cox=results_coxph_hr, pval_cox=results_coxph_hr_p, ci_cox=results_coxph_conf,p.val = p.val, high_num = high_num, low_num = low_num, high_cutoff = highcutoff, low_cutoff = lowcutoff) -> summary
if (returnfit == TRUE){
return(list(fit,df,summary))
}
if (returnfit == FALSE){
return(summary)
}
}
}
filter <- "LumA"
highcol="#ff0000"
lowcol="#0000ff"
ifmonth="month"
highcutoff=50
lowcutoff=50
ifhr="hr"
ifconf="conf"
outputdir="../../../alt-prom-crispr-fiveprime/ffiles/test/"
transcript_1="ENST00000440973" #luminal A vs B split on low vs high ENSG00000091831
transcript_2="ENST00000206249"
data(XenaData)
head(XenaData)
## # A tibble: 6 × 17
## XenaHosts XenaHostNames XenaCohorts XenaDatasets SampleCount DataSubtype Label
## <chr> <chr> <chr> <chr> <int> <chr> <chr>
## 1 https://… publicHub Breast Can… ucsfNeve_pu… 51 gene expre… Neve…
## 2 https://… publicHub Breast Can… ucsfNeve_pu… 57 phenotype Phen…
## 3 https://… publicHub Glioma (Ko… kotliarov20… 194 copy number Kotl…
## 4 https://… publicHub Glioma (Ko… kotliarov20… 194 phenotype Phen…
## 5 https://… publicHub Lung Cance… weir2007_pu… 383 copy number CGH
## 6 https://… publicHub Lung Cance… weir2007_pu… 383 phenotype Phen…
## # ℹ 10 more variables: Type <chr>, AnatomicalOrigin <chr>, SampleType <chr>,
## # Tags <chr>, ProbeMap <chr>, LongTitle <chr>, Citation <chr>, Version <chr>,
## # Unit <chr>, Platform <chr>
Get the dataset: phenotype - Molecular subtype hub: https://pancanatlas.xenahubs.net https://xenabrowser.net/datapages/?dataset=TCGASubtype.20170308.tsv&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443
## [1] 628 2
## [1] "Subtype patient list already exists"
Get the survival data. Specifically using the curated clinical data from the Pan-cancer Atlas paper titled “An Integrated TCGA Pan-Cancer Clinical Data Resource (TCGA-CDR) to drive high quality survival outcome analytics”. The paper highlights four types of carefully curated survival endpoints, and recommends the use of the endpoints of OS, PFI, DFI, and DSS for each TCGA cancer type. - OS: overall survial - PFI: progression-free interval - DSS: disease-specific survival - DFI: disease-free interval
Specifically using the transcript expression RNAseq data of TOIL RSEM tpm (n=10,535) UCSC Toil RNA-seq Recompute.
XenaGenerate(subset = XenaHostNames=="toilHub") %>%
XenaFilter(filterCohorts = "GTEx") %>%
XenaFilter(filterDatasets = "survival") -> df_survival
df_survival <- XenaQuery(df_survival)
## This will check url status, please be patient.
# XenaDownload(df_survival, destdir = "~/test/", method = "wget", extra = "-c", force = TRUE)
df_survival <- read.table("../../../alt-prom-crispr-fiveprime/files/test/TCGA_survival_data", header = TRUE, sep = "\t", row.names = 1)
print(dim(df_survival))
## [1] 10496 8
Download and extract the breast cancer control patients and their TPM Repear with the other BRCA patients
XenaGenerate(subset = XenaHostNames=="toilHub") %>%
XenaFilter(filterDatasets = "gtex_rsem_isoform_tpm") -> df_expression_tpm
df_expression_tpm <- XenaQuery(df_expression_tpm)
## This will check url status, please be patient.
##filter df_expression_tcga to only include the breast cancer patients
# py_run_file("./12_survival_curve.py")
#run the ipynb file to get the patient ids
df_expression_tcga <- read.table("../../../alt-prom-crispr-fiveprime/files/test/TcgaTargetGtex_rsem_isoform_tpm.filtered.tsv", header = TRUE, sep = "\t", row.names = 1)
#change colnames to have - instead of .
colnames(df_expression_tcga) <- gsub("\\.","-",colnames(df_expression_tcga))
#subset patients for luminal A
df_survival_sub <- unique(df_survival[rownames(df_subtype) %in% df_subtype$sample,])
df_expression_tcga <- df_expression_tcga[colnames(df_expression_tcga) %in% rownames(df_survival_sub),]
#repeat with transcript_2
final <- survivalcurve_transcript(df_expression_tcga,df_survival_sub,transcript_2,highcutoff,lowcutoff,highcol,lowcol, returnfit = TRUE)
ggsurvplot(final[1], data=final[2], risk.table = TRUE, conf.int=TRUE, pval=TRUE, legend.title="Expression of Transcript",
palette=c("dodgerblue2", "orchid2"),
pval.size = 3, pval.coord = c(0, 0.25), pval.method = TRUE, pval.threshold = 0.05,
title = transcript_2,
xlab="Time (months)", ylab="Survival probability"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [[1]]
## Ignoring unknown labels:
## • colour : "Expression of Transcript"
##
## attr(,"class")
## [1] "list" "ggsurvplot_list"
#iterate the survival curve over every transcript
transcript_1<- "ENST00000440973"
final <- survivalcurve_transcript(df_expression_tcga,df_survival_sub,transcript_1,highcutoff,lowcutoff,highcol,lowcol, returnfit = TRUE)
ggsurvplot(final[1], data=final[2], risk.table = TRUE, conf.int=TRUE, pval=TRUE, legend.title="Expression of Transcript",
palette=c("dodgerblue2", "orchid2"),
pval.size = 3, pval.coord = c(0, 0.25), pval.method = TRUE, pval.threshold = 0.05,
title = transcript_1,
xlab="Time (months)", ylab="Survival probability"
)
## [[1]]
## Ignoring unknown labels:
## • colour : "Expression of Transcript"
##
## attr(,"class")
## [1] "list" "ggsurvplot_list"
Plot the basal expression data for the transcripts
# Define Transcript Metadata for Mapping
p1_id <- "ENST00000440973"
p2_ids <- c("ENST00000338799", "ENST00000443427", "ENST00000456483",
"ENST00000406599", "ENST00000206249", "ENST00000427531")
#get the rownames that sart with p1_id and P2_isa
promoter_rows <- rownames(df_expression_tcga)[grep(paste(c(p1_id, p2_ids), collapse = "|"), rownames(df_expression_tcga))]
# --- 1. Plot Basal Expression Data ---
df_expression_all <- df_expression_tcga %>% #take top five
dplyr::filter(rownames(df_expression_tcga) %in% promoter_rows) %>%
tibble::rownames_to_column(var = "transcript") %>%
reshape2::melt(id.vars = "transcript") %>%
tidyr::drop_na()
p_exp <- ggviolin(df_expression_all, x = "transcript", y = "value",
fill = "lightgray", add = "boxplot",
xlab = "All Transcripts", ylab = "log TPM (Transcripts Per Million)") +
ggtitle("Basal Transcript Expression Distribution") +
theme_minimal() + #reverse
coord_flip()
p_exp