Code for reproducing the results shown in the manuscript.
### Loading functions. ### inst <- rownames(utils::installed.packages()) cran <- c("devtools","R.utils","Matrix","glmnet","pROC","BiocManager","ashr") # "googledrive", "httpuv" if(!all(cran %in% inst)){ for(i in seq_along(cran)){ if(!cran[i] %in% inst){ install.packages(cran[i]) } } } bioc <- c("edgeR","TCGAbiolinks") if(!all(bioc %in% inst)){ #source("http://bioconductor.org/biocLite.R") for(i in seq_along(bioc)){ if(!bioc[i] %in% inst){ #biocLite(bioc[i]) BiocManager::install(bioc[i]) } } } #if(!"ashr" %in% inst){ # devtools::install_github("stephens999/ashr") #} user <- Sys.getenv("USERNAME") path <- file.path("C:","Users",user,"Desktop","palasso") if(user=="arra"){path <- "C:/Users/arra/Desktop/MATHS/palasso_desktop"} if(user==""){path <- "/virdir/Scratch/arauschenberger/palasso"} setwd(path) folders <- c("data","results") invisible(sapply(folders,function(x) if(!dir.exists(x)){dir.create(x)})) if(user!="arra"){ devtools::install_github("kkdey/CorShrink") # ref="a9f6ba0" devtools::install_github("rauschenberger/palasso") # ref="4a995a2" } if(FALSE){ # The functions <<save>>, <<file.exists>> and <<file.remove>> access the hard disk, but also try to access googledrive. save <- function(object,file){ base::save(object,file=file) tryCatch(expr=googledrive::drive_upload(media=file,path=file), error=function(x) NULL) #Sys.sleep(0.5) } file.exists <- function(file){ offline <- base::file.exists(file) online <- FALSE if(!offline){ d <- googledrive::as_dribble(x=file) online <- tryCatch(expr=googledrive::some_files(d), error=function(x) FALSE) #Sys.sleep(0.5) } return(offline|online) } file.remove <- function(file){ base::file.remove(file) tryCatch(expr=googledrive::drive_trash(file=file), error=function(x) NULL) #Sys.sleep(0.5) } # The function <<update>> moves files from the research cloud to the remote drive. Given both paths, it first verifies whether the folders SIM, GDC and CCA are available, and then copies all missing files from the research cloud to the remote drive. # from: path to the origin # to: path to the destination update <- function(from,to){ dir <- c("SIM","GDC","CCA") if(any(!dir.exists(file.path(from,dir)))){stop("Invalid.")} if(any(!dir.exists(file.path(to,dir)))){stop("Invalid.")} pb <- utils::txtProgressBar(min=0,max=1,style=3) for(i in seq_along(dir)){ files0 <- dir(file.path(from,dir[i])) files1 <- dir(file.path(to,dir[i])) names <- files0[!files0 %in% files1] for(j in seq_along(names)){ utils::setTxtProgressBar(pb=pb,value=(i-1)/3+(j*i)/(3*length(names))) file.copy(from=file.path(from,dir[i],names[j]), to=file.path(to,dir[i],names[j]), copy.date=TRUE) } utils::setTxtProgressBar(pb=pb,value=i/3) } } # update(from="results",to="//tsclient/N/palasso/results") }
Last data download started on 2019-03-26 (R version 3.5.3).
### Downloading "Isoform Expression Quantification". ### #rm(list=ls()) #<<functions>> directory <- file.path(path,"data") setwd(directory) # Retrieving cancer types: project <- TCGAbiolinks::getGDCprojects()$id project <- project[grepl(x=project,pattern="TCGA")] # Downloading isoform expression data: y <- X <- list() for(i in seq_along(project)){ query <- TCGAbiolinks::GDCquery(project=project[i], data.category="Transcriptome Profiling", data.type="Isoform Expression Quantification") TCGAbiolinks::GDCdownload(query,method="api",directory=directory) trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE)) X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory) X[[i]][,c("miRNA_ID","reads_per_million_miRNA_mapped", "cross-mapped","miRNA_region")] <- NULL y[[i]] <- rep(project[i],times=length(unique(X[[i]]$barcode))) } save(list=c("y","X"),file=file.path(path,"data","isoform_raw.RData")) load(file.path(path,"data","isoform_raw.RData"),verbose=TRUE) # Merging isoform expression data: Xs <- do.call(what=rbind,args=X) # sparse matrix y <- do.call(what="c",args=y) # Transform to matrix Xs$isoform_coords <- gsub(pattern="hg38:chr",replacement="",x=Xs$isoform_coords) samples <- unique(Xs$barcode) covariates <- unique(Xs$isoform_coords) row <- match(Xs$barcode,samples) col <- match(Xs$isoform_coords,covariates) X <- Matrix::sparseMatrix(i=row,j=col,x=Xs$read_count,dimnames=list(samples,covariates)) # Order by molecular location split <- strsplit(x=colnames(X),split=":|-") chr <- sapply(split,function(x) x[[1]]) pos <- sapply(split,function(x) x[[2]]) order <- order(chr,pos) X <- X[,order] if(FALSE){ # testing i <- sample(seq_len(nrow(Xs)),size=1) Xs$read_count[i] X[Xs$barcode[i],Xs$isoform_coords[i]] } save(list=c("y","X"),file=file.path(path,"data","isoform_all.RData"))
### Downloading "miRNA Expression Quantification". ### #rm(list=ls()) #<<functions>> directory <- file.path(path,"data") setwd(directory) # Downloading data. project <- TCGAbiolinks::getGDCprojects()$id project <- project[grepl(x=project,pattern="TCGA")] y <- X <- list() for(i in seq_along(project)){ query <- TCGAbiolinks::GDCquery(project=project[i], data.category="Transcriptome Profiling", data.type="miRNA Expression Quantification") TCGAbiolinks::GDCdownload(query,method="api",directory=directory) trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE)) data <- TCGAbiolinks::GDCprepare(query,directory=directory) X[[i]] <- t(data[,c(seq(from=2,to=ncol(data),by=3))]) y[[i]] <- rep(project[i],times=nrow(X[[i]])) } save(list=c("y","X"),file=file.path(path,"data","miRNA_raw.RData")) load(file.path(path,"data","miRNA_raw.RData")) X <- do.call(what=rbind,args=X) y <- do.call(what="c",args=y) rownames(X) <- gsub(pattern="read_count_",replacement="",x=rownames(X)) save(list=c("y","X"),file=file.path(path,"data","miRNA_all.RData"))
### Downloading "Gene Expression Quantification". ### #rm(list=ls()) #<<functions>> directory <- file.path(path,"data") setwd(directory) # Retrieving cancer types: project <- TCGAbiolinks::getGDCprojects()$id project <- project[grepl(x=project,pattern="TCGA")] # Downloading data: memory.limit(size=16000) # Activate virtual memory in system control! y <- X <- list() for(i in seq_along(project)){ query <- TCGAbiolinks::GDCquery(project=project[i], data.category="Transcriptome Profiling", data.type="Gene Expression Quantification", workflow.type="HTSeq - Counts"); gc() TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory); gc() trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE)); gc() X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory); gc() y[[i]] <- rep(project[i],times=ncol(X[[i]])); gc() } save(list=c("y","X"),file=file.path(path,"data","gene_raw.RData")) load(file.path(path,"data","gene_raw.RData")) genes <- SummarizedExperiment::rowData(X[[1]]) mart <- biomaRt::useMart("ensembl",dataset="hsapiens_gene_ensembl") # char <- biomaRt::getBM(attributes=c("ensembl_gene_id","chromosome_name","transcript_start","gene_biotype"),filters=c("biotype","chromosome_name"),values=list("protein_coding",c(1:22,"X")),mart=mart) select <- genes$ensembl_gene_id[genes$ensembl_gene_id %in% char$ensembl_gene_id] X <- lapply(X,function(x) t(SummarizedExperiment::assays(x)$"HTSeq - Counts"[select,])) X <- do.call(what=rbind,args=X) y <- do.call(what="c",args=y) save(list=c("y","X"),file=file.path(path,"data","gene_all.RData"))
### Downloading "Copy Number Variation". ### #rm(list=ls()) #<<functions>> directory <- file.path(path,"data") setwd(directory) project <- TCGAbiolinks::getGDCprojects()$id project <- project[grepl(x=project,pattern="TCGA")] y <- X <- list() for(i in seq_along(project)){ query <- TCGAbiolinks::GDCquery(project=project[i], data.category="Copy Number Variation", data.type="Masked Copy Number Segment") TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory) trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE)) X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory) y[[i]] <- rep(project[i],times=length(unique(X[[i]]$Sample))) # correct? } save(list=c("y","X"),file=file.path(path,"data","CNV_raw.RData")) load(file.path(path,"data","CNV_raw.RData"),verbose=TRUE) # Merging CNV data: Xs <- do.call(what=rbind,args=X) # sparse matrix y <- do.call(what="c",args=y) #table(Xs$Sample) # Prepare cutting. cut <- list() cut$chr <- c(1:22,"X") cut$start <- sapply(cut$chr,function(x) min(Xs$Start[Xs$Chromosome==x])) cut$end <- sapply(cut$chr,function(x) max(Xs$End[Xs$Chromosome==x])) cut$length <- cut$end-cut$start cut$dist <- sum(cut$length)/10000 cut$num <- round(cut$length/cut$dist) # Create covariates. cov <- list() cov$p <- sum(cut$num) cov$chromosome <- unlist(sapply(cut$chr,function(i) rep(i,times=cut$num[i]))) cov$location <- unlist(sapply(cut$chr,function(i) round(seq(from=cut$start[i],to=cut$end[i],length.out=cut$num[i])))) cov$name <- paste0(cov$chromosome,":",cov$location) # Create indices for each covariate. index <- rep(list(integer()),times=cov$p) pb <- utils::txtProgressBar(min=0,max=cov$p,style=3) for(j in seq_len(cov$p)){ utils::setTxtProgressBar(pb=pb,value=j) index[[j]] <- which((Xs$Chromosome==cov$chromosome[j]) & (Xs$Start<=cov$location[j]) & (cov$location[j]<=Xs$End)) # consider < } # Expand indices to matrix. X <- matrix(0,nrow=length(unique(Xs$Sample)),ncol=cov$p, dimnames=list(unique(Xs$Sample),cov$name)) for(j in seq_along(index)){ mean <- Xs$Segment_Mean[index[[j]]] i <- Xs$Sample[index[[j]]] X[i,j] <- mean } if(FALSE){ # test sample <- sample(rownames(X),size=1) covariate <- sample(colnames(X),size=1) split <- strsplit(covariate,split=":")[[1]] a <- X[sample,covariate] b <- Xs$Segment_Mean[(Xs$Sample==sample) & (Xs$Chromosome==split[1]) & (Xs$Start<=as.numeric(split[2])) & (as.numeric(split[2]) < Xs$End)] all(a==b) } save(list=c("y","X","index"),file=file.path(path,"data","CNV_all.RData"))
### Extracting samples of interest. ### #rm(list=ls()) #<<functions>> type <- c("isoform","miRNA","CNV","gene") for(i in seq_along(type)){ cat(type[i],"\n") load(file.path(path,"data",paste0(type[i],"_all.RData")),verbose=TRUE) # TCGA barcode barcode <- rownames(X) code <- sapply(barcode,function(x) strsplit(x,split="-")) code <- as.data.frame(do.call(what=rbind,args=code)) colnames(code) <- c("project","TSS","participant","sample_vial", "portion_analyte","plate","center") code$sample <- substr(code$sample_vial,start=1,stop=2) code$vial <- substr(code$sample_vial,start=3,stop=3) code$portion <- substr(code$portion_analyte,start=1,stop=2) code$analyte <- substr(code$portion_analyte,start=3,stop=3) code$sample_vial <- code$portion_analyte <- NULL # solid tumour (except blood for LAML) solid <- (code$sample=="01" | (y=="TCGA-LAML" & code$sample=="03")) X <- X[solid,] y <- y[solid] # unique samples unique <- !duplicated(substr(rownames(X),start=1,stop=12)) X <- X[unique,] y <- y[unique] save(list=c("y","X"),file=file.path(path,"data",paste0(type[i],"_sub.RData"))) } # isoform: n=9'794, p=194'595, k=32 # miRNA: n=9'794, p=1'881, k=32 # gene: n=9'830, p=19'602, k=33 # CNV: n=10'578, p=10'000, k=33 ## Understanding barcodes: # overview: https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode # details: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables ## Understanding replicate samples: # http://gdac.broadinstitute.org/runs/sampleReports/latest/READ_Replicate_Samples.html
Last data analysis started on 2019-04-12 (R version 3.5.3).
### Analysing the TCGA data. ### #rm(list=ls()) #<<functions>> for(type in c("miRNA","isoform","CNV","gene")){ library(Matrix) for(k in c(207,sample(528))){ set.seed(k); cat(k," ") if(type %in% c("isoform","miRNA") & k > 496){next} if(type %in% c("CNV","gene") & k > 528){next} # searching for missing cancer-cancer combinations rm(list=setdiff(ls(),c("k","type","path","save","file.exists","file.remove"))); gc() file0 <- paste0("results/",type,"_start_",k,".RData") file1 <- paste0("results/",type,"_loss_",k,".RData") if(file.exists(file0)||file.exists(file1)){next} save(object=k,file=file0) load(paste0("data/",type,"_sub.RData")) # indicating the cancer-cancer combination cancer <- substring(text=unique(y),first=6) comb <- utils::combn(x=cancer,m=2) select <- paste0("TCGA-",comb[,k]) y <- ifelse(y==select[1],1,ifelse(y==select[2],0,NA)) rm(cancer,select) # removing other cancer types cond <- !is.na(y) y <- y[cond] X <- X[cond,] rm(cond) # pre-processing if(type %in% c("isoform","miRNA")){ x <- palasso:::.prepare(X,cutoff="zero") } else if(type=="gene"){ x <- palasso:::.prepare(X,cutoff="knee") } else if(type=="CNV"){ x <- list(X=X,Z=sign(X)) x <- lapply(x,function(x) scale(x)) attributes(x)$info <- data.frame(n=nrow(X),p=ncol(X),prop=mean(x$Z==0)) } rm(X) # cross-validation loss <- tryCatch(expr=palasso:::.predict(y=y,X=x,nfolds.int=10),error=function(e) palasso:::.predict(y=y,X=x,nfolds.int=10)) # information loss$info <- cbind(k=k, y0=comb[2,k], y1=comb[1,k], n0=sum(y==0), n1=sum(y==1), attributes(x)$info, loss$info) # refit object <- palasso::palasso(y=y,X=x,nfolds=10,family="binomial",standard=TRUE,elastic=TRUE,shrink=TRUE) model <- c(names(object),"elastic", paste0("paired.",c("adaptive","standard","combined"))) for(max in c(10,50,Inf)){ temp <- list() temp$nzero <- data.frame(model=model,x=NA,z=NA) for(i in seq_along(model)){ coef <- palasso:::coef.palasso(object=object,model=model[i],max=max) temp$nzero$x[i] <- sum(coef$x!=0) temp$nzero$z[i] <- sum(coef$z!=0) } temp$select <- palasso:::subset.palasso(x=object,max=max, model="paired.adaptive")$palasso$select temp$weights <- palasso:::weights.palasso(object=object,max=max, model="paired.adaptive") temp$coef <- palasso:::coef.palasso(object=object,max=max, model="paired.adaptive") loss[[paste0("fit",max)]] <- temp } save(object=loss,file=file1) file.remove(file0) } index <- sum(grepl(dir(),pattern="sessionInfo")) sink(paste0("sessionInfo",index+1,".txt")) date() utils::sessionInfo() devtools::session_info() sink() }
# The function <<collect>> loads all files from PATH including PATTERN in the file name, loads OBJECT into a list, and executes a function call. #<<functions>> # path: folder # pattern: character, or NULL (all files) # object: character vector, or NULL (all objects) # what: function call collect <- function(path=getwd(),pattern="",object=NULL,what="rbind"){ OBJECT = object # identify files files <- dir(path) files <- files[grepl(x=files,pattern=pattern)] files <- files[grepl(x=files,pattern=".RData")] number <- gsub(pattern=paste0(pattern,"|.RData"),replacement="",x=files) files <- files[order(as.numeric(number))] # trial1 names <- gsub(pattern=".RData",replacement="",x=files) # trial1 if(length(files)==0){stop("Invalid datasets.")} # load data all <- list() for(i in seq_along(files)){ x <- load(file.path(path,files[i])) x <- eval(parse(text=x)) if(is.null(OBJECT)){ all[[i]] <- x names(all)[i] <- names[i] } else { for(j in seq_along(OBJECT)){ all[[OBJECT[j]]][[i]] <- x[[OBJECT[j]]] names(all[[OBJECT[j]]])[i] <- names[i] } } } # fuse data if(is.null(OBJECT)){ all <- do.call(what=what,args=all) } else { all <- lapply(all,function(x) do.call(what=what,args=x)) } return(all) } LOSS <- list() type <- c("gene","isoform","miRNA","CNV") #type <- "miRNA" for(i in seq_along(type)){ LOSS[[type[i]]] <- collect(path="results", pattern=paste0(type[i],"_loss_"), object=c("deviance","auc","class","info", paste0("fit",c(10,50,Inf)))) } for(i in seq_along(LOSS)){ for(j in 1:3){ colnames(LOSS[[i]][[j]])[colnames(LOSS[[i]][[j]])=="paired.adaptive"] <- "paired" } } #type <- "gene" #a <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","paired"] #b <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","elastic"] #mean(a<b)
### Testing for significant differences. ### #rm(list=ls()) #<<functions>> #<<collect>> row <- c("gene","isoform","miRNA","CNV") col <- c("10","Inf") lay <- c("standard_x","standard_z","standard_xz", "adaptive_x","adaptive_z","adaptive_xz", "elastic") # added "elastic" M <- array(NA,dim=c(length(row),length(col),length(lay)),dimnames=list(row,col,lay)) for(i in seq_along(row)){ loss <- LOSS[[row[i]]][c("info","deviance")] y0 <- as.character(loss$info$y0) y1 <- as.character(loss$info$y1) cancer <- sort(unique(c(y0,y1))) Z <- palasso:::.design(x=cancer) for(j in seq_along(col)){ # differences cond <- rownames(loss$deviance)==col[j] for(k in seq_along(lay)){ fill <- loss$deviance[cond,lay[k]] - loss$deviance[cond,"paired"] X <- matrix(NA,nrow=length(cancer),ncol=length(cancer), dimnames=list(cancer,cancer)) X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- fill X[lower.tri(X)] <- NA # p-values pvalue <- rep(NA,times=max(Z)) for(l in seq_len(max(Z))){ x <- as.numeric(X[Z==l]) if(col[j]=="10"){ alternative <- "greater" # Never use "two.sided"! } if(col[j]=="Inf"){ alternative <- "less" # Never use "two.sided"! } pvalue[l] <- stats::wilcox.test(x=x,alternative=alternative, exact=FALSE)$p.value } # Simes M[i,j,k] <- palasso:::.combine(pvalue,method="simes") } } } # Table SIG: significance constraint <- "10" table <- format(M[,constraint,1:6],digits=1,scientific=FALSE) for(i in seq_len(nrow(table))){ for(j in seq_len(ncol(table))){ if(M[i,constraint,j]>=0.05){ table[i,j] <- paste0("{\\color{gray}{",table[i,j],"}}") } } } one <- c("","\\text{standard}","","","\\text{adaptive}","") two <- paste0("\\text{",c("x","z","xz","x","z","xz"),"}") rownames(table) <- paste0("\\text{",rownames(table),"}") table <- rbind(one,two,table,deparse.level=0) rownames(table)[1] <- "~" xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c")) xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
### Comparison with elastic net. ### #rm(list=ls()) #<<functions>> #<<collect>> row <- c("gene","isoform","miRNA","CNV") col <- c("10","50","Inf") better <- worse <- less <- matrix(NA,nrow=length(row),ncol=length(col), dimnames=list(row,col)) for(i in seq_along(row)){ for(j in seq_along(col)){ # proportion of improvements (cross-validation) cond <- rownames(LOSS[[row[i]]]$deviance)==col[j] loss <- LOSS[[row[i]]]$deviance[cond,c("paired","elastic")] better[i,j] <- round(mean(loss[,"paired"]<loss[,"elastic"]),digits=2) worse[i,j] <- round(mean(loss[,"paired"]>loss[,"elastic"]),digits=2) # average difference in nzero (refitted models) df_paired <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x) sum(x$nzero[x$nzero[,"model"]=="paired.adaptive",c("x","z")])) df_elastic <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x) sum(x$nzero[x$nzero[,"model"]=="elastic95",c("x","z")])) df_diff <- df_elastic-df_paired less[i,j] <- round(mean(df_diff),digits=2) #graphics::hist(df_diff,main=paste(row[i],col[j]),xlim=c(-1,1)*max(abs(df_diff))) } } better worse less
### Analysing the refitted models. ### #rm(list=ls()) #<<functions>> #<<collect>> # Table SEL: selected model nzero <- paste0("fit",c(5,10,Inf)) model <- c(paste0("standard_",c("x","z","xz")), paste0("adaptive_",c("x","z","xz")), "between_xz","within_xz") type <- c("gene","isoform","miRNA","CNV") table <- array(NA,dim=c(length(nzero),length(model),length(type)), dimnames=list(nzero,model,type)) for(i in seq_along(nzero)){ for(j in seq_along(model)){ for(k in seq_along(type)){ sub <- LOSS[[type[k]]][[nzero[i]]] table[i,j,k] <- sum(sub[,"select"]==model[j]) } } } colSums(table["fit10",,]) # CHECK WHETHER COMPLETE! table <- round(prop.table(table["fit10",,],margin=2),digits=2) table <- t(table) table <- table[,apply(table,2,function(x) any(x!=0))] rownames(table) <- paste0("\\text{",rownames(table),"}") xtable <- xtable::xtable(table,align=c("r","|","c","c","c","c")) xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity) # selected weights and covariates type <- c("gene","isoform","miRNA","CNV") group <- c("x","z") model <- c(paste0("standard_",c("x","z","xz")), paste0("adaptive_",c("x","z","xz")), "paired.adaptive","elastic") # added "elastic" , paste0("elastic",c(100,75,50,25)) weights10 <- weightsInf <- matrix(NA,nrow=length(group),ncol=length(type), dimnames=list(group,type)) coef10 <- coefInf <- array(NA,dim=c(length(group),length(type),length(model)), dimnames=list(group,type,model)) for(i in seq_along(group)){ for(j in seq_along(type)){ weights10[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fit10[,"weights"],colMeans)) weightsInf[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fitInf[,"weights"],colMeans)) for(k in seq_along(model)){ coef10[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fit10[,"nzero"], function(x) sum(x[x$model==model[k],group[i]]))) coefInf[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fitInf[,"nzero"], function(x) sum(x[x$model==model[k],group[i]]))) } } } # coef10["x",,]+coef10["z",,] # with sparsity constraint round(prop.table(weights10,margin=2),2) round(prop.table(coef10[,,"paired.adaptive"],margin=2),2) round(colSums(coef10[,,"paired.adaptive"]),2) # natural sparsity round(prop.table(weightsInf,margin=2),2) round(prop.table(coefInf[,,"paired.adaptive"],margin=2),2) round(colSums(coefInf[,,"paired.adaptive"]),2) round(colSums(coefInf[,,"elastic"])/colSums(coefInf[,,"paired.adaptive"]),1) # multiple nzero of elastic net and paired lasso # Table NSC: number of non-zero coefficients table <- round(coefInf["x",,]+coefInf["z",,]) colnames(table)[7] <- "paired" one <- c("","\\text{standard}","","","\\text{adaptive}","","\\text{paired}","\\text{elastic}") two <- paste0("\\text{",c("x","z","xz","x","z","xz","xz","xz"),"}") rownames(table) <- paste0("\\text{",rownames(table),"}") table <- rbind(one,two,table,deparse.level=0) rownames(table)[1] <- "~" xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c","|","c","|","c")) xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
### FIGURE CSW ### #rm(list=ls()) #<<functions>> set.seed(1) overfit <- TRUE # simulate n <- 10 cx <- stats::rbeta(n=n,shape1=0.9,shape2=1) cz <- stats::rbeta(n=n,shape1=0.4,shape2=0.9) # collection x <- list() y <- list() # adaptive weights (X only) x[[1]] <- rep(1,times=n) if(overfit){x[[1]] <- cx} y[[1]] <- rep(0,times=n) # adaptive weights (Z only) x[[2]] <- rep(0,times=n) y[[2]] <- rep(1,times=n) if(overfit){y[[2]] <- cz} # adaptive weights (X and Z) x[[3]] <- y[[3]] <- rep(0.5,times=n) if(overfit){x[[3]] <- cx} if(overfit){y[[3]] <- cz} # within-pair weights x[[4]] <- cx^2/(cx+cz) y[[4]] <- cz^2/(cx+cz) # visualisation graphics::par(mfrow=c(1,4),mar=c(4.5,0.5,0.5,0.5),oma=c(0,2,0,0)) for(i in seq_len(4)){ palasso:::plot_pairs(x=x[[i]],y=y[[i]],lwd=4) if(i==1){ graphics::mtext(text="covariate pair",side=2,line=1) } }
### FIGURE DIA ### #rm(list=ls()) #<<functions>> ellipse <- function(x,y,a=0.2,b=0.25,border=NA){ n <- max(c(length(x),length(y))) if(length(x)==1){x <- rep(x,times=n)} if(length(y)==1){y <- rep(y,times=n)} if(length(border)==1){border <- rep(border,times=n)} for(i in seq_len(n)){ angle <- seq(from=0,to=2*pi,length=100) xs <- x[i] + a * cos(angle) ys <- y[i] + b * sin(angle) graphics::polygon(x=xs,y=ys,col=grey(0.9),border=border[i]) } } cancer <- c("ACC","BLCA","BRCA","UVM") number <- c(80,409,1078,80) col <- grDevices::rgb(red=0,green=0,blue=sample(seq(from=75,to=255,length.out=length(number))),maxColorValue=255) lwd <- log(2*number)-2 lwd = pmax(5*number/max(number),1) x1 <- 1.3; x2 <- 2; x3 <- 3 set.seed(1) # first layer (bubble) graphics::plot.new() graphics::par(mar=c(0,0,0,0)) graphics::plot.window(xlim=c(1.1,3.3),ylim=c(-1.6,1.6)) ellipse(x=x1,y=0) graphics::text(x=x1,y=0,labels="TCGA",font=2,adj=c(0.5,0)) graphics::text(x=x1,y=0,labels="n=9794",adj=c(0.5,1.2),cex=0.9) # second layer (colon) y1 <- seq(from=1,to=-1,length.out=length(cancer)+1) graphics::text(x=x2,y=y1[length(cancer)],labels="...",font=2,srt=90) y1 <- y1[-length(cancer)] # first-second layer (connect) graphics::segments(x0=x1+0.2,y0=0,x1=x2-0.2,y1=y1,lwd=lwd,col=col) # second layer (bubble) ellipse(x=x2,y=y1,a=0.2,b=0.2) graphics::text(x=x2,y=y1,labels=cancer,font=2,col=col,adj=c(0.5,0)) graphics::text(x=x2,y=y1,labels=paste0("n=",number,""),adj=c(0.5,1.2),cex=0.9) comb <- utils::combn(x=seq_along(y1),m=2) # third layer (colon) y2 <- seq(from=1.5,to=-1.5,length.out=ncol(comb)+1) graphics::text(x=x3,y=y2[ncol(comb)],labels="...",font=2,srt=90) y2 <- y2[-ncol(comb)] # second-third layer (connect) graphics::segments(x0=2.2,y0=y1[comb[1,]],x1=2.7,y1=y2,lwd=lwd[comb[1,]],col=col[comb[1,]]) graphics::segments(x0=2.2,y0=y1[comb[2,]],x1=2.7,y1=y2,lwd=lwd[comb[2,]],col=col[comb[2,]]) # third layer (bubble) ellipse(x=x3,y=y2,a=0.3,b=0.22) graphics::text(x=x3,y=y2,labels=paste0(cancer[comb[1,]]," "), font=2,col=col[comb[1,]],adj=c(1,0)) graphics::text(x=x3,y=y2,labels=paste0(" ",cancer[comb[2,]]), font=2,col=col[comb[2,]],adj=c(0,0)) graphics::text(x=x3,y=y2,labels=":",font=2,adj=c(0.5,0)) labels <- apply(comb,2,function(x) sum(number[x])) labels <- paste0("n=",labels,"") graphics::text(x=x3,y=y2,labels=labels,adj=c(0.5,1.2),cex=0.9)
### FIGURE CLA ### #rm(list=ls()) #<<functions>> graphics::par(mfrow=c(4,2),oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5)) for(type in c("gene","isoform","miRNA","CNV")){ loss <- LOSS[[type]][c("deviance","auc","class")] choice <- "paired" loss <- lapply(loss,function(x) x[,c(paste0("standard_",c("x","z","xz")),paste0("adaptive_",c("x","z","xz")),choice)]) for(constraint in c("10")){ # c("5","10","Inf") # change sub <- lapply(loss,function(x) x[rownames(x)==constraint,]) palasso:::plot_score(sub$deviance,choice=choice) change <- sub$deviance[,7]-sub$deviance[,-7] palasso:::plot_box(change,ylab="change",zero=TRUE,choice=NA) # info info <- list() info$select <- names(which.min(apply(sub$deviance,2,median)[-7])) info$DEV_paired <- median(sub$deviance[,choice]) info$DEV_select <- median(sub$deviance[,info$select]) info$improve <- mean(sub$deviance[,info$select]>sub$deviance[,choice]) info$AUC_paired <- median(sub$auc[,choice]) info$CLASS_paired <- median(sub$class[,choice]) print(as.data.frame(info)) # important } }
### FIGURE DEC ### #rm(list=ls()) #<<functions>> #graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(1,1)) graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(2,2)) for(type in c("gene","isoform","miRNA","CNV")){ models <- c(paste0("standard_",c("x","z","xz")), paste0("adaptive_",c("x","z","xz")),"paired") constraint <- c("3","4","5","10","15","20","25","50","Inf") loss <- LOSS[[type]]["deviance"] loss <- lapply(loss,function(x) x[,models]) table <- matrix(NA,nrow=length(constraint),ncol=length(models), dimnames=list(constraint,models)) for(i in seq_along(constraint)){ sub <- lapply(loss,function(x) x[rownames(x)==constraint[i],]) table[i,] <- apply(sub$deviance,2,median) } # table <- log(table) graphics::plot.new() graphics::plot.window(xlim=c(1,length(constraint)),ylim=range(table)) graphics::box() constraint[constraint=="Inf"] <- "n" graphics::axis(side=2) graphics::axis(side=1,at=seq_along(constraint),labels=constraint,tick=FALSE,line=-1) for(k in c(1,2)){ for(i in seq_along(models)){ lty <- ifelse(i%in%c(1,2,3),3,ifelse(i%in%c(4,5,6),2,1)) col <- ifelse(i==7,"#00007F","#FF3535") pch <- ifelse(i%in%c(1,4),"x",ifelse(i%in%c(2,5),"z",1)) if(k==1){ graphics::lines(table[,i],col=col,lty=lty,lwd=2) graphics::points(table[,i],col="white",pch=16,cex=1.2) } else { graphics::points(table[,i],col=col,pch=1,font=2) } } } } graphics::title(ylab="deviance",line=0.0,outer=TRUE) graphics::title(xlab="sparsity constraint",ylab="deviance",line=0.0,outer=TRUE)
### FIGURE CNV ### #rm(list=ls()) #<<functions>> graphics::par(oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5)) graphics::layout(matrix(c(1,1,2,2),nrow=1)) loss <- LOSS[[type]][c("deviance","auc","class")] loss <- lapply(loss,function(x) x[rownames(x)=="10",]) model <- c(paste0("standard_",c("x","z","xz")), paste0("adaptive_",c("x","z","xz"))) diff <- loss$auc[,"paired"]-loss$auc[,model] palasso:::plot_box(diff,zero=TRUE,invert=TRUE,ylab="change") diff <- loss$class[,"paired"]-loss$class[,model] palasso:::plot_box(diff,zero=TRUE,ylab="change")
### FIGURE MAP ### #rm(list=ls()) #<<functions>> loss <- LOSS[["CNV"]][c("info","auc")] cancer <- sort(unique(c(levels(loss$info$y0),levels(loss$info$y1)))) X <- matrix(NA,nrow=length(cancer),ncol=length(cancer),dimnames=list(cancer,cancer)) #Z <- palasso:::.design(x=cancer) y0 <- as.character(loss$info$y0) y1 <- as.character(loss$info$y1) X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- loss$auc[rownames(loss$auc)=="10","paired"] graphics::par(mar=c(0.5,3.0,3.0,0.5)) dimnames(X) <- lapply(dimnames(X),function(x) paste0(" ",x," ")) palasso:::plot_table(X=X,margin=-1,labels=FALSE,las=2,cex=0.7) #sort(rowMeans(X,na.rm=TRUE),decreasing=TRUE)[1:2] # keep!
### FIGURE COM ### # 32 cancer types for isoform and miRNA # 33 cancer types for gene and CNV #rm(list=ls()) #<<functions>> for(type in c("miRNA")){ cancer <- sort(unique(as.character(unlist(LOSS[[type]]$info[,c("y0","y1")])))) n <- length(cancer) z <- as.numeric(palasso:::.design(x=n)) x <- rep(seq_len(n),each=n) y <- rep(seq(from=n,to=1,by=-1),times=n) pch <- z pch[pch==0] <- NA pex <- c(".","O","*","+","o","-","'","x") # colour base <- grDevices::colorRampPalette(colors=c('darkblue','blue','red','darkred'))(n) col <- rep(NA,times=length(z)) col[z==0] <- "white" for(i in seq_len(n)){ col[z==i] <- base[i] } graphics::par(mfrow=c(1,1),mar=c(0,0,2,2)) graphics::plot.new() graphics::plot.window(xlim=c(1,n),ylim=c(1,n)) graphics::points(x=x[pch<=25],y=y[pch<=25], pch=pch[pch<=25],col=col[pch<=25],cex=0.9) graphics::points(x=x[pch>25],y=y[pch>25], pch=pex[(pch-25)[pch>25]],col=col[pch>25],cex=0.9) graphics::segments(x0=0,x1=n+1,y0=n+1) graphics::segments(x0=n+1,y0=n+1,y1=0) graphics::segments(x0=0,x1=n+1,y0=n+1,y1=0,lty=2) graphics::mtext(text=cancer,side=3,at=1:n,las=2,cex=0.7) graphics::mtext(text=cancer,side=4,at=n:1,las=2,cex=0.7) }