10 Using UCSC Xena for Survival Curve

This file represents the clinical validation stage of the study. It transitions from cellular experiments to real-world patient data to determine if the alternative promoter usage observed in the lab translates into differences in patient survival rates.

Here is a breakdown of the computational workflow and the clinical insights it provides:

Clinical Data Integration (TCGA and GTEx) The code utilizes the pyGEPIA2 package to access large-scale clinical datasets.

Patient Stratification: The analysis focuses on breast cancer patients, specifically subsetting 415 Luminal A patients and 192 Luminal B patients from the TCGA/GTEx data sources.

Transcript Resolution: Instead of looking at total gene expression, the code specifically tracks transcripts driven by the P1 and P2 promoters of the Estrogen Receptor 1 (ESR1) gene.

Statistical Modeling of Survival (Cox PH) To quantify the impact of promoter usage on patient outcomes, the notebook implements the Cox Proportional Hazards (Cox PH) model.

Hazard Ratios (HR): The code calculates the Hazard Ratio with a 95% confidence interval. This determines the relative risk of death for patients with high vs. low expression of specific promoter-driven transcripts.

Median Cutoff: Patients are grouped into “High” or “Low” expression categories based on the group median cutoff for overall survival

library(UCSCXenaTools)

## =========================================================================================
## UCSCXenaTools version 1.6.1
## Project URL: https://github.com/ropensci/UCSCXenaTools
## Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
## 
## If you use it in published research, please cite:
## Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
##   from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
##   Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
## =========================================================================================
##                               --Enjoy it--

library(survival)
library(readr)

## Warning: package 'readr' was built under R version 4.4.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(survminer)

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.4.3

## Loading required package: ggpubr

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.4.3

library(tibble)
library(reticulate)

## Warning: package 'reticulate' was built under R version 4.4.3

library(ggpubr)
library(gtsummary)

## Warning: package 'gtsummary' was built under R version 4.4.3

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(cowplot)

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggpubr':
## 
##     get_legend

survivalcurve_transcript <- function(df_expression_tcga,df_survival_sub,transcript_1,highcutoff,lowcutoff,highcol,lowcol,returnfit=FALSE){
    # print(transcript_1)
  #this is the code that will be used to generate the survival curve
  #subset df_expression_tcga for the samples that express transcript 1 and transcript 2 and the paitents in the csurvival data
  #check that the transcript was subset
  df_survival_sub = df_survival_sub[!duplicated(df_survival_sub),]
  #check that the transcripts were oresent 
  #grep transcripts found in transcript_1 transcript_2
  transcript_match <- rownames(df_expression_tcga)[grep(paste(transcript_1,sep="|"),rownames(df_expression_tcga))]
  df_subset <- df_expression_tcga %>% 
    subset(rownames(df_expression_tcga) %in% c(transcript_match)) 
  df_subset <- t(df_subset[colnames(df_subset) %in% rownames(df_survival_sub)])
  #merge the survival data with the gene expression data
  df <- merge(df_subset,df_survival_sub,by.x = "row.names",by.y = "row.names")
  dim(df)
  
  #high and low cutoffs are the quantiles that will be used to split the data
  high_df = rownames(df[df[,transcript_match] > quantile(df[,transcript_match],as.numeric(highcutoff)/100),1,drop = F])
  low_df = rownames(df[df[,transcript_match] < quantile(df[,transcript_match],as.numeric(lowcutoff)/100),1,drop = F])
  high_num = length(high_df)
  low_num = length(low_df)
  title = "Overall Survival"
  df[,"OS.time.month"] = as.numeric(df[,"OS.time"]) %/% 30 + 1
  
  
  #assign the class to the high and low groups
  df[rownames(df) %in% high_df,"CLASS"] = "high"
  df[rownames(df) %in% low_df,"CLASS"] = "low"
  
  df$CLASS = factor(df$CLASS,levels = c("low","high"))
  #count the number of high and low   table(df$CLASS)
  #if no high or low then output 
  if (length(high_df) == 0 | length(low_df) == 0){
      if (returnfit == TRUE){
    return("No high or low groups found and returnfit is TRUE")
  }
  if (returnfit == FALSE){
     list( transcript=transcript_match ,HR_cox=NA, pval_cox=NA, ci_cox=NA,p.val = NA, high_num = high_num, low_num = low_num, high_cutoff = highcutoff, low_cutoff = lowcutoff) -> summary
    return(summary)
  }
    

  }
  else{
     #if the class is not present in the data then return NA
  color = c(lowcol,highcol)
  #make df$OSDAY numeric 
  #create a survival object first entry is the time and the status (dead/alive)
  mod = Surv(df$OS.time.month,df$OS)
  #fit it with the class of the survival data 
  mfit = survfit(mod~df$CLASS)
  sur = survdiff(mod~df$CLASS)
  p.val <- 1 - pchisq(sur$chisq, length(sur$n) - 1)
  p.val = signif(p.val,2)
  fit <- survfit(Surv(OS.time.month, OS) ~ CLASS, data = df)
  results_coxph = coxph(mod~df$CLASS)%>% 
  tbl_regression(exp = TRUE)

  #extract the hazard risk 
  results_coxph_hr = results_coxph$table_body$estimate[3]
  results_coxph_hr_p = results_coxph$table_body$p.value[3]
  #confidence interval
  results_coxph_conf = results_coxph$table_body$ci[3]
  #change and space to _
  results_coxph_conf <- gsub(" ","_",results_coxph_conf)
  #remvoe , 
  results_coxph_conf <- gsub(",","",results_coxph_conf)
  #summary of the survival curve and add to dataframe 
  list( transcript=transcript_match ,HR_cox=results_coxph_hr, pval_cox=results_coxph_hr_p, ci_cox=results_coxph_conf,p.val = p.val, high_num = high_num, low_num = low_num, high_cutoff = highcutoff, low_cutoff = lowcutoff) -> summary
  if (returnfit == TRUE){
    return(list(fit,df,summary))
  }
  if (returnfit == FALSE){
    return(summary)
  }
  }
}

filter <- "LumA"
highcol="#ff0000"
lowcol="#0000ff"
ifmonth="month"
highcutoff=50
lowcutoff=50
ifhr="hr"
ifconf="conf"
outputdir="../../../alt-prom-crispr-fiveprime/ffiles/test/"
transcript_1="ENST00000440973" #luminal A vs B split on low vs high ENSG00000091831
transcript_2="ENST00000206249" 

data(XenaData)
head(XenaData)

## # A tibble: 6 × 17
##   XenaHosts XenaHostNames XenaCohorts XenaDatasets SampleCount DataSubtype Label
##   <chr>     <chr>         <chr>       <chr>              <int> <chr>       <chr>
## 1 https://… publicHub     Breast Can… ucsfNeve_pu…          51 gene expre… Neve…
## 2 https://… publicHub     Breast Can… ucsfNeve_pu…          57 phenotype   Phen…
## 3 https://… publicHub     Glioma (Ko… kotliarov20…         194 copy number Kotl…
## 4 https://… publicHub     Glioma (Ko… kotliarov20…         194 phenotype   Phen…
## 5 https://… publicHub     Lung Cance… weir2007_pu…         383 copy number CGH  
## 6 https://… publicHub     Lung Cance… weir2007_pu…         383 phenotype   Phen…
## # ℹ 10 more variables: Type <chr>, AnatomicalOrigin <chr>, SampleType <chr>,
## #   Tags <chr>, ProbeMap <chr>, LongTitle <chr>, Citation <chr>, Version <chr>,
## #   Unit <chr>, Platform <chr>

Get the dataset: phenotype - Molecular subtype hub: https://pancanatlas.xenahubs.net https://xenabrowser.net/datapages/?dataset=TCGASubtype.20170308.tsv&host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

## [1] 628   2

## [1] "Subtype patient list already exists"

Get the survival data. Specifically using the curated clinical data from the Pan-cancer Atlas paper titled “An Integrated TCGA Pan-Cancer Clinical Data Resource (TCGA-CDR) to drive high quality survival outcome analytics”. The paper highlights four types of carefully curated survival endpoints, and recommends the use of the endpoints of OS, PFI, DFI, and DSS for each TCGA cancer type. - OS: overall survial - PFI: progression-free interval - DSS: disease-specific survival - DFI: disease-free interval

Specifically using the transcript expression RNAseq data of TOIL RSEM tpm (n=10,535) UCSC Toil RNA-seq Recompute.

XenaGenerate(subset = XenaHostNames=="toilHub") %>%
  XenaFilter(filterCohorts = "GTEx")  %>% 
    XenaFilter(filterDatasets = "survival") -> df_survival
 df_survival  <- XenaQuery(df_survival)

## This will check url status, please be patient.

 # XenaDownload(df_survival, destdir = "~/test/", method = "wget", extra = "-c", force = TRUE)

 df_survival <- read.table("../../../alt-prom-crispr-fiveprime/files/test/TCGA_survival_data", header = TRUE, sep = "\t", row.names = 1)
 print(dim(df_survival))

## [1] 10496     8

Download and extract the breast cancer control patients and their TPM Repear with the other BRCA patients

XenaGenerate(subset = XenaHostNames=="toilHub") %>%
  XenaFilter(filterDatasets = "gtex_rsem_isoform_tpm")  -> df_expression_tpm
 df_expression_tpm  <- XenaQuery(df_expression_tpm)

## This will check url status, please be patient.

##filter df_expression_tcga to only include the breast cancer patients
# py_run_file("./12_survival_curve.py")
#run the ipynb file to get the patient ids
df_expression_tcga <- read.table("../../../alt-prom-crispr-fiveprime/files/test/TcgaTargetGtex_rsem_isoform_tpm.filtered.tsv", header = TRUE, sep = "\t", row.names = 1)
#change colnames to have - instead of .
colnames(df_expression_tcga) <- gsub("\\.","-",colnames(df_expression_tcga))
#subset patients for luminal A 
df_survival_sub <- unique(df_survival[rownames(df_subtype) %in% df_subtype$sample,])
df_expression_tcga <- df_expression_tcga[colnames(df_expression_tcga) %in% rownames(df_survival_sub),]

#repeat with transcript_2
final <- survivalcurve_transcript(df_expression_tcga,df_survival_sub,transcript_2,highcutoff,lowcutoff,highcol,lowcol, returnfit = TRUE)
ggsurvplot(final[1], data=final[2], risk.table = TRUE, conf.int=TRUE, pval=TRUE, legend.title="Expression of Transcript",
            palette=c("dodgerblue2", "orchid2"),
           pval.size = 3, pval.coord = c(0, 0.25), pval.method = TRUE, pval.threshold = 0.05,
           title = transcript_2,
           xlab="Time (months)", ylab="Survival probability"
           )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## [[1]]

## Ignoring unknown labels:
## • colour : "Expression of Transcript"

## 
## attr(,"class")
## [1] "list"            "ggsurvplot_list"

#iterate the survival curve over every transcript
transcript_1<- "ENST00000440973"
final <- survivalcurve_transcript(df_expression_tcga,df_survival_sub,transcript_1,highcutoff,lowcutoff,highcol,lowcol, returnfit = TRUE)

ggsurvplot(final[1], data=final[2], risk.table = TRUE, conf.int=TRUE, pval=TRUE, legend.title="Expression of Transcript",
            palette=c("dodgerblue2", "orchid2"),
           pval.size = 3, pval.coord = c(0, 0.25), pval.method = TRUE, pval.threshold = 0.05,
           title = transcript_1,
           xlab="Time (months)", ylab="Survival probability"
           )

## [[1]]

## Ignoring unknown labels:
## • colour : "Expression of Transcript"

## 
## attr(,"class")
## [1] "list"            "ggsurvplot_list"

Plot the basal expression data for the transcripts

# Define Transcript Metadata for Mapping
p1_id <- "ENST00000440973"
p2_ids <- c("ENST00000338799", "ENST00000443427", "ENST00000456483", 
            "ENST00000406599", "ENST00000206249", "ENST00000427531")

#get the rownames that sart with p1_id and P2_isa
promoter_rows <- rownames(df_expression_tcga)[grep(paste(c(p1_id, p2_ids), collapse = "|"), rownames(df_expression_tcga))]
# --- 1. Plot Basal Expression Data ---
df_expression_all <- df_expression_tcga %>% #take top five 
  dplyr::filter(rownames(df_expression_tcga) %in% promoter_rows) %>%
  tibble::rownames_to_column(var = "transcript") %>%
  reshape2::melt(id.vars = "transcript") %>%
  tidyr::drop_na() 

p_exp <- ggviolin(df_expression_all, x = "transcript", y = "value",
                  fill = "lightgray", add = "boxplot",
                  xlab = "All Transcripts", ylab = "log TPM (Transcripts Per Million)") +
  ggtitle("Basal Transcript Expression Distribution") +
  theme_minimal() + #reverse
  coord_flip()
  
p_exp

10 Using UCSC Xena for Survival Curve

2025-03-20