Skip to contents

We obtained our results on 2025-01-03 using R version 4.4.0 (2024-04-24, “Puppy Cup”) running on a Dell Latitude 5440 laptop with a 13th Gen Intel(R) Core(TM) i7-1365U processor (1800 Mhz) and the operating system Microsoft Windows 10 Enterprise. The total computation time was around 2 hours (excluding data download, without parallel computing).

Working environment

This vignette includes code chunks with a long computation time (e.g., analysing simulated and experimental data) and code chunks with a short computation time (e.g., generating figures and tables). The logical options sim.app and fig.tab determine whether the chunks for (i) running the simulation and application or (ii) generating figures and tables are evaluated, respectively. Running this vignette with sim.app=FALSE and fig.tab=TRUE means that figures and tables will be generated from previously obtained results. Running this vignette with sim.app=TRUE and fig.tab=TRUEand means that all results will be reproduced (including data download and data processing). It is also possible to execute individual chunks (e.g., for reproducing a specific figure or table), but then it is important to provide the required inputs (e.g., “Requires: file X in folder Y, execution of chunk Z”).

knitr::opts_chunk$set(echo=TRUE,eval=FALSE)
sim.app <- FALSE # reproduce simulation and application?
fig.tab <- FALSE # reproduce figures and tables?

This chunk verifies the working environment. The working directory, specified by the object path, must contain the R functions in “package/R/functions.R” as well as the folders “results” and “manuscript”. Alternatively, the R functions can be loaded from the R package sparselink. This chunk also installs missing R packages from CRAN or Bioconductor.

path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows)
#path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac)

dir <- c("results","manuscript","package/R/functions.R")
for(i in seq_along(dir)){
  if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){
    stop(paste0("Require folder/file'",dir[i],"'."))
  } 
}
source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package.

inst <- rownames(utils::installed.packages())
pkgs <- c("knitr","rmarkdown","glmnet","BiocManager")
for(i in seq_along(pkgs)){
  if(!pkgs[i]%in%inst){
    utils::install.packages(pkgs[i])
  }
}
pkgs <- c("recount3","edgeR")
for(i in seq_along(pkgs)){
  if(!pkgs[i]%in%inst){
    BiocManager::install(pkgs[i])
  }
}

blue <- "blue"; red <- "red"

if(exists("sim.app")&exists("fig.tab")){
  if(!sim.app&fig.tab){
    files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData")
    for(i in seq_along(files)){
      if(!file.exists(file.path(path,"results",files[i]))){
        stop("File",files[i],"is missing.")
      }
    }
  }
}

Methods

This chunk generates the figure for the methods section.

  • Requires: execution of chunk setup

  • Execution time: 11 second

  • Ensures: file fig_flow.eps in folder manuscript

#<<setup>>

grDevices::postscript(file=file.path(path,"manuscript","fig_flow.eps"),width=6,height=2.5)
graphics::par(mfrow=c(1,1),mar=c(0,0,0,0))
graphics::plot.new()
graphics::plot.window(xlim=c(-0.2,1.0),ylim=c(0.0,1.0))
cex <- 0.8

pos <- data.frame(left=0.2,right=0.8,top=0.8,centre=0.45,bottom=0.1)
mar <- data.frame(vertical=0.08,horizontal=0.08,dist=0.04)

graphics::text(labels=paste("problem",1:2),x=c(pos$left,pos$right),y=pos$top+2*mar$vertical,font=2,col=c(blue,red),cex=cex)
graphics::text(labels=expression(hat(beta)["j,1"]^{init}),x=pos$left,y=pos$top,col=blue)
graphics::text(labels=expression(hat(beta)["j,2"]^{init}),x=pos$right,y=pos$top,col=red)

graphics::arrows(x0=rep(c(pos$left,pos$right),each=2),x1=rep(c(pos$left,pos$right),times=2)+c(-mar$horizontal,-mar$horizontal,mar$horizontal,mar$horizontal),y0=pos$top-mar$vertical,y1=pos$centre+mar$vertical,length=0.1,col=rep(c(blue,red),each=2),lwd=2)

graphics::text(labels=expression(w["j,1"]^{int}),x=pos$left-mar$horizontal-mar$dist,y=pos$centre,col=blue)
graphics::text(labels=expression(w["p+j,1"]^{int}),x=pos$left-mar$horizontal+mar$dist,y=pos$centre,col=blue)
graphics::text(labels=expression(w["j,1"]^{ext}),x=pos$left+mar$horizontal-mar$dist,y=pos$centre,col=red)
graphics::text(labels=expression(w["p+j,1"]^{ext}),x=pos$left+mar$horizontal+mar$dist,y=pos$centre,col=red)

graphics::text(labels=expression(w["j,2"]^{ext}),x=pos$right-mar$horizontal-mar$dist,y=pos$centre,col=blue)
graphics::text(labels=expression(w["p+j,2"]^{ext}),x=pos$right-mar$horizontal+mar$dist,y=pos$centre,col=blue)
graphics::text(labels=expression(w["j,2"]^{int}),x=pos$right+mar$horizontal-mar$dist,y=pos$centre,col=red)
graphics::text(labels=expression(w["p+j,2"]^{int}),x=pos$right+mar$horizontal+mar$dist,y=pos$centre,col=red)

graphics::arrows(x0=c(pos$left,pos$right),y0=pos$centre-mar$vertical,y1=pos$bottom+mar$vertical,col=c(blue,red),length=0.1,lwd=2)
graphics::text(labels=expression(hat(beta)["j,1"]^{final}==hat(gamma)["j,1"]-hat(gamma)["p+j,1"]),x=pos$left,y=pos$bottom,col=blue)
graphics::text(labels=expression(hat(beta)["j,2"]^{final}==hat(gamma)["j,2"]-hat(gamma)["p+j,2"]),x=pos$right,y=pos$bottom,col=red)

graphics::text(x=-0.1,y=c(pos$top,pos$bottom),labels=paste("stage",1:2),font=2,cex=cex)
grDevices::dev.off()

Simulation

This chunk performs the simulation.

  • Requires: execution of chunk setup

  • Execution time: 0.5 hours

  • Ensures: files simulation_transfer.RData (transfer learning), simulation_multiple.RData (multi-task learning) and info_sim.txt (session information) in folder results

#<<setup>>

alpha.init <- 0.95; type <- "exp";  repetitions <- 10

for(mode in c("transfer","multiple")){
  
  grid <- expand.grid(prob.separate=c(0.0,0.025,0.05),prob.common=c(0.0,0.025,0.05),family="gaussian")
  grid <- grid[rep(seq_len(nrow(grid)),each=repetitions),] #
  grid$seed <- seq_len(nrow(grid))
  grid$family <- as.character(grid$family)
  deviance <- auc <- time <- mse.coef <- mse.zero <- mse.nzero <- sel.num <- sel.coef <- sel.count <- hyperpar <- list()
  for(i in seq(from=1,to=nrow(grid))){
    set.seed(seed=grid$seed[i])
    cat("i=",i,"\n")
    if(mode=="transfer"){
      data <- sim.data.transfer(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i])
      method <- c("glm.separate","glm.glmtrans","sparselink")
    } else if(mode=="multiple"){
      #--- multi-task learning ---
      data <- sim.data.multiple(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i])
      method <- c("glm.separate","glm.mgaussian","sparselink") # add glmnet "mgaussian" (only for linear case) or "MTPS" (not sparse)?
    }
    
    result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid$family[i],method=method,alpha.init=alpha.init,type=type,alpha=1)
    hyperpar[[i]] <- result$hyperpar
    time[[i]] <- result$time
    auc[[i]] <- result$auc
    deviance[[i]] <- result$deviance
    sel.num[[i]] <- t(sapply(result$coef,function(x) colSums(x!=0)))
    sel.count[[i]] <- t(sapply(result$coef,function(x) rowMeans(count_matrix(truth=sign(data$beta),estim=sign(x))))) # Add na.rm=TRUE?
    
    sel.coef[[i]] <- t(sapply(result$coef,function(x) colMeans(sign(x)!=sign(data$beta))))
    # CONTINUE HERE: consider sparsity, true positives, false negatives, signs
    
    mse.coef[[i]] <- t(sapply(result$coef,function(x) colMeans((data$beta-x)^2)))
    mse.zero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta==0)*(data$beta-x))^2)))
    mse.nzero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta!=0)*(data$beta-x))^2)))
  }
  save(grid,deviance,auc,sel.num,sel.count,sel.coef,mse.coef,mse.zero,mse.nzero,time,file=file.path(path,"results",paste0("simulation_",mode,".RData")))
}

writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),      sessioninfo::session_info()),con=paste0(path,"/results/info_sim.txt"))

The following chunk generate the figures for the simulation study.

  • Requires: execution of chunk setup, files simulation_transfer.RData and simulation_multiple.RData in folder results (generated by chunk simulation)

  • execution time: 11 second

  • Ensures: files fig_sim_multiple.eps and fig_sim_transfer.eps in folder manuscript

#<<setup>>

caption <- paste(c("\\textbf{Multi-task learning.}","\\textbf{Transfer learning.}"),"Comparison of different measures (rows) between an available method (red) and the proposed method (blue) in different simulation settings (columns), based on the average of three problems",c("(tasks)","(datasets)"),"for each repetition out of ten. Measures: performance metric (mean squared error on hold-out data, as a fraction of the one from standard lasso regression; a point below the dashed line means that",c("multi-task","transfer"),"learning improves predictions), sparsity (number of non-zero coefficients), precision (number of coefficients with correct signs divided by number of non-zero coefficients). The arrows point in the direction of improvement. Settings: percentage of features with a common effect for all problems ($\\pi_\\theta$), percentage of features with a specific effect for each problem ($\\pi_\\delta$).",c("\\label{fig_sim_multiple}","\\label{fig_sim_transfer}"))

figure_change <- function(){
  
  mode <- paste0(100*grid$prob.common,"%\n",100*grid$prob.separate,"%")
  
  graphics::par(mfrow=c(3,1),mar=c(3,3,1,1))
  
  label <- function(){
    cex <- 0.5
    at <- 0.3
    graphics::mtext(text=expression(pi[theta]==phantom(.)),side=1,line=0.2,at=at,cex=cex)
    graphics::mtext(text=expression(pi[delta]==phantom(.)),side=1,line=1.2,at=at,cex=cex)
  }
  
  #--- predictive performance ---
  means <- t(sapply(X=deviance,FUN=rowMeans))
  means <- means/means[,"glm.separate"]
  change(x=mode,y0=means[,model.ref],y1=means[,model.own],main="metric",increase=FALSE)
  graphics::abline(h=1,lty=2,col="grey")
  label()
  
  #--- sparsity ---
  nzero <- sapply(X=sel.num,FUN=rowMeans)
  change(x=mode,y0=nzero[model.ref,],y1=nzero[model.own,],main="sparsity",increase=FALSE)
  graphics::abline(h=0,lty=2,col="grey")
  label()
  
  #--- precision ---
  precision <- sapply(X=sel.count,FUN=function(x) x[,"precision"])
  precision[is.na(precision)] <- 0
  change(x=mode,y0=precision[model.ref,],y1=precision[model.own,],main="precision",increase=TRUE)
  graphics::abline(h=0,lty=2,col="grey")
  label()
  
}

grDevices::postscript(file=file.path(path,"manuscript","fig_sim_multiple.eps"),width=6.5,height=6)
load(file.path(path,paste0("results/simulation_multiple.RData")))
model.ref <- "glm.mgaussian"
model.own <- "sparselink"
figure_change()
grDevices::dev.off()

grDevices::postscript(file=file.path(path,"manuscript","fig_sim_transfer.eps"),width=6.5,height=6)
load(file.path(path,paste0("results/simulation_transfer.RData")))
model.ref <- "glm.glmtrans"
model.own <- "sparselink"
figure_change()
grDevices::dev.off()

Sample size and sparsity (under development)

# Effect of sample size in source or target dataset (TL), effect of sample size (MTL).
#<<setup>>

alpha.init <- 0.95; type <- "exp";  repetitions <- 10 # increase to 10
grid <- metric <- list()

for(mode in c("TL-source","TL-target","MTL-size","TL-prop","MTL-prop")){
  metric[[mode]] <- list()
  cand <- c(20,40,60,80,100)
  if(mode=="TL-source"){
    grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=cand,n_target=50)
  } else if(mode=="TL-target"){
    grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=50,n_target=cand) 
  } else if(mode=="MTL-size"){
    grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n0=cand) 
  } else if(mode %in% c("TL-prop","MTL-prop")){
    cand <- c(0.025,0.05,0.10,0.15,0.20)
    grid[[mode]] <- expand.grid(prob.common=cand,prob.separate=NA,family="gaussian",n_source=50,n_target=50,n0=50)
  } else {
    stop("Wrong mode.")
  }
  grid[[mode]] <- grid[[mode]][rep(seq_len(nrow(grid[[mode]])),each=repetitions),]
  grid[[mode]]$seed <- seq_len(nrow(grid[[mode]]))
  grid[[mode]]$family <- as.character(grid[[mode]]$family)
  cond <- is.na(grid[[mode]]$prob.separate)
  grid[[mode]]$prob.separate[cond] <- 0.5*grid[[mode]]$prob.common[cond]
  
  for(i in seq(from=1,to=nrow(grid[[mode]]))){
      set.seed(seed=grid$seed[i])
      cat("i=",i,"\n")
      if(mode %in% c("TL-source","TL-target","TL-prop")){
        n0 <- rep(c(grid[[mode]]$n_source[i],grid[[mode]]$n_target[i]),times=c(2,1))
        data <- sim.data.transfer(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=n0)
        method <- c("glm.separate","glm.glmtrans","sparselink","glm.xrnet")
      } else if(mode %in% c("MTL-size","MTL-prop")){
        data <- sim.data.multiple(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=grid[[mode]]$n0[i])
        method <- c("glm.separate","glm.mgaussian","sparselink","glm.spls")
      } else {
        stop("Wrong mode.")
      }
      result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid[[mode]]$family[i],method=method,alpha.init=alpha.init,type=type,alpha=1,trial=TRUE)
      metric[[mode]][[i]] <- result$deviance
    }
}

save(grid,metric,file=file.path(path,"results","simulation_devel.RData"))

Logarithmic transformation of mean squared error calculated on hold-out data (yy-axis), averaged across all three problems (TL-prop, MTL-size, MTL-prop) Left: Sample size of source data sets or sample size of target data set (left) or proportion of features with common effects and proportion of features with separate effects (right). standard lasso regression (dotted black line), proposed method (solid blue line), competing methods (dashed orange and red lines, namely ). Added competing methods (orange) tend to perform worse than previously included competing methods (red).

load(file.path(path,"results","simulation_devel.RData"))

graphics::par(mfrow=c(1,3),mar=c(4.5,4.5,1.5,1),oma=c(0,0,0,0))
#graphics::layout(mat=matrix(data=c(1,1,2,2,3,3,0,4,4,0,5,5),ncol=2))
for(mode in c("MTL-size","TL-source","TL-target")){
  if(mode %in% c("TL-source","TL-target")){
    mse <- sapply(metric[[mode]],function(x) x[,3])
  } else {
    mse <- sapply(metric[[mode]],function(x) rowMeans(x))
  }
  if(mode %in% c("TL-source","TL-target","TL-prop")){
    col <- c("glm.separate"="black","glm.glmtrans"="red","glm.xrnet"="orange","sparselink"="blue")
    lty <- c("glm.separate"=3,"glm.glmtrans"=2,"glm.xrnet"=2,"sparselink"=1)
  } else if(mode %in% c("MTL-size","MTL-prop")) {
    col <- c("glm.separate"="black","glm.mgaussian"="red","glm.spls"="orange","sparselink"="blue")
    lty <- c("glm.separate"=3,"glm.mgaussian"=2,"glm.spls"=2,"sparselink"=1)
  }
  if(mode=="TL-source"){
    params <- grid[[mode]]$n_source
  } else if(mode=="TL-target"){
    params <- grid[[mode]]$n_target
  } else if(mode=="MTL-size"){
    params <- grid[[mode]]$n0
  } else if(mode %in% c("TL-prop","MTL-prop")){
    params <- grid[[mode]]$prob.common
  }
  unique <- unique(params)
  #params <- grid[[mode]]$prob.separate
  graphics::plot.new()
  graphics::plot.window(xlim=range(params),ylim=range(log(mse)))
  graphics::box()
  #graphics::title(main=mode)
  if(mode=="MTL-size"){
    graphics::title(main="MTL - varying sample size")
  } else if(mode=="TL-source"){
    graphics::title(main="TL - varying source sample size")
    xlab <- paste0("source sample size (n1=n2)")
    graphics::legend(x="topleft",legend="n3=xxx (target sample size)",bty="n")
  } else if(mode=="TL-target"){
    graphics::title(main="TL - varying target sample size")
    xlab <- paste0("target sample size (n3)")
    graphics::legend(x="topleft",legend="n1=n2=xxx (source sample size)",bty="n")
  }
  graphics::title(ylab="log MSE",line=2.5,xlab=xlab)
  if(mode %in% c("TL-prop","MTL-prop")){
    graphics::axis(side=1,at=unique,labels=paste0(100*unique,"%"))
  } else {
    graphics::axis(side=1,at=unique)
  }
  graphics::axis(side=2)
  
  for(i in names(col)){
    val <- tapply(X=mse[i,],INDEX=params,FUN=function(x) mean(x))
    graphics::lines(x=unique,y=log(val),col=col[i],type="o",pch=16,lty=lty[i])
  }
}

# Change settings so that competing methods are useful.

Application

Data preparation

This chunk defines the references and the project identifiers for the application.

  • Requires: nothing

  • Execution time: 11 second

  • Ensures: list project in working environment

project <- list()
project$IBD <- c("Tew (2016)"="SRP063496",
                 "Haberman (2019)"="SRP129004",
                 "Verstockt (2019)"="ERP113396",
                 "Verstockt (2020)"="ERP114636",
                 "Boyd (2018)"="SRP100787")
project$RA <- c("Baker (2019)"="SRP169062",
                "Moncrieffe (2017)"="SRP074736",
                "Goldberg (2018)"="SRP155483")
extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091

This chunk downloads the data for the application.

  • Requires: execution of chunks setup and define_projects

  • Execution time: depends on internet speed and cached files

  • Ensures: files recount3_data.RData (data sets) and info_data.txt (system information) in folder results

#<<setup>>
#<<define_projects>>

data <- list()
for(i in c(unlist(project),extra)){
  data[[i]] <- recount3::create_rse_manual(
    project=i,
    project_home="data_sources/sra",
    organism="human",
    annotation = "gencode_v26",
    type="gene")
}
save(data,file=file.path(path,"results/recount3_data.RData"))

writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
      sessioninfo::session_info()),con=paste0(path,"/results/info_data.txt"))

This chunk preprocesses the data.

  • Requires: execution of chunks setup and define_projects, file recount3_data.RData (generated by chunk download_data)

  • Execution time: 55 seconds

  • Ensures: lists y (targets) and x (features) in working environment

#<<setup>>
#<<define_projects>>

load(file.path(path,"results/recount3_data.RData"))

#- - - - - - - - - - - - - - -
#- - - extract features  - - - 
#- - - - - - - - - - - - - - -

# extract features
x <- list()
for(i in c(unlist(project),extra)){
  counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts)
  colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name
  x[[i]] <- counts
}

# select most expressed protein-coding genes (for all TL projects together)
select <- list()
total <- numeric()
for(i in unlist(project)){
  total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering
  #total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # alternative: variance filtering
}
type <- SummarizedExperiment::rowData(data[[i]])$gene_type
cond <- type=="protein_coding"
total[,!cond] <- 0
rank <- apply(X=total,MARGIN=1,FUN=rank)
mean_rank <- rowMeans(rank)
temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # double-check (remove?) second condition
for(i in unlist(project)){
  select[[i]] <- temp
}

# select most expressed protein-coding genes (for MTL project)
mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean)
warning("change number in next line")
temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[100] # increase to 2000!
select[[extra]] <- temp

# pre-processing
for(i in c(unlist(project),extra)){
  lib.size <- Matrix::rowSums(x[[i]])
  x[[i]] <- x[[i]][,select[[i]],drop=FALSE]
  norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size)
  gamma <- norm.factors*lib.size/mean(lib.size)
  gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]]))
  x[[i]] <- x[[i]]/gamma
  x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform
  x[[i]] <- scale(x[[i]]) # scale because of different datasets!?
}

#- - - - - - - - - - - - - -
#- - - extract targets - - -
#- - - - - - - - - - - - - -

# extract information on samples
frame <- list()
for(i in c(unlist(project),extra)){
  list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|")
  data[[i]]$sra.experiment_attributes
  # What about sra.experiment_attributes?
  n <- length(list)
  cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1]))
  ncol <- length(cols)
  frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols))
  for(j in seq_len(n)){
    for(k in seq_len(ncol)){
      vector <- list[[j]]
      which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k])
      string <- vector[which]
      if(length(string)==0){next}
      frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2]
    }
  }
  frame[[i]] <- as.data.frame(frame[[i]])
}

# extract binary outcome
y <- z <- list()
for(i in unlist(project)){
  # CONTINUE HERE!!!
  if(i=="ERP113396"){
    y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid")))
  } else if(i=="ERP114636"){
    y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid")))
    warning("Inverting response and non-response!")
  } else if(i=="SRP100787"){
    y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid")))
  } else if(i=="SRP129004"){
    y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid")))
    suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`)))
  } else if(i=="SRP063496"){
    y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid")))
  } else if(i=="SRP169062"){
    y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid")))
  } else if(i=="SRP155483"){
    y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid")))
    z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid")))
  } else if(i=="SRP074736"){
    y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid")))
  }
}

# overlap
for(j in unlist(project)){
  is.na <- is.na(y[[j]])
  if(length(is.na)!=nrow(x[[j]])){stop()}
  y[[j]] <- y[[j]][!is.na]
  if(!is.null(z[[j]])){
    if(is.vector(z[[j]])){
      z[[j]] <- z[[j]][!is.na]
    } else {
      z[[j]] <- z[[j]][!is.na,]
    }
  }
  x[[j]] <- x[[j]][!is.na,]
}

Data exploration

This chunk performs the exploratory data analysis.

  • Requires: execution of chunks setup, define_projects and preprocess_data

  • Execution time: 0.50.5 minutes

  • Ensures: files explore_data.RData (results) and info_explore.txt (session information) in folder results

#<<setup>>
#<<define_projects>>
#<<preprocess_data>>

set.seed(1)
alpha.holdout <- 0
alpha.crossval <- 1
family <- "binomial"
nfolds <- 10
codes <- unlist(project)
coef <- matrix(data=NA,nrow=ncol(x[[1]]),ncol=length(codes),dimnames=list(NULL,codes))
auc <- auc.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes))
foldid <- make.folds.trans(y=y,family="binomial",nfolds=nfolds)

ridge <- lasso <- list()
for(i in seq_along(codes)){
  ridge[[i]] <- glmnet::cv.glmnet(x=x[[codes[i]]],y=y[[codes[i]]],family=family,alpha=alpha.holdout,foldid=foldid[[i]])
  coef[,i] <- stats::coef(ridge[[i]],s="lambda.min")[-1]
  for(j in seq_along(codes)){
    if(i==j){
      y_hat <- rep(x=NA,times=length(y[[i]]))
      for(k in seq_len(nfolds)){
        holdout <- foldid[[i]]==k
        temp <- glmnet::cv.glmnet(x=x[[codes[i]]][!holdout,],y=y[[codes[i]]][!holdout],family=family,alpha=alpha.crossval)
        y_hat[holdout] <- predict(object=temp,newx=x[[codes[i]]][holdout,],s="lambda.min",type="response")
      }
    } else {
      y_hat <- as.numeric(predict(object=ridge[[i]],newx=x[[j]],s="lambda.min",type="response"))
    }
    auc[i,j] <- pROC::auc(response=y[[codes[j]]],predictor=y_hat,direction="<",levels=c(0,1))
    auc.pvalue[i,j] <- stats::wilcox.test(rank(y_hat)~y[[codes[[j]]]],alternative="less",exact=FALSE)$p.value
  }
}

save(coef,auc,auc.pvalue,codes,file=file.path(path,"results","explore_data.RData"))

writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
      sessioninfo::session_info()),con=paste0(path,"/results/info_explore.txt"))

This chunk generates the tables for the exploratory data analysis.

  • Requires: execution of chunk setup, file explore_data.RData in folder results (generated by chunk explore_apply)

  • execution time: 11 second

  • Ensures: files tab_cor.tex and tab_auc.tex in folder manuscript

#<<setup>>
#if(any(unlist(project)!=names(refs))){stop("not compatible")}

load(file.path(path,"results/explore_data.RData"))
names <- gsub(pattern="IBD.|RA.",replacement="",x=names(unlist(project)))
codes <- colnames(coef)
cor.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes))
for(i in seq_along(codes)){
  for(j in seq_along(codes)){
    cor.pvalue[i,j] <- stats::cor.test(x=coef[,i],y=coef[,j],method="spearman",exact=FALSE)$p.value
  }
}
diag(cor.pvalue) <- NA

insert.space <- function(table,cut){
  index.left <- index.top <- seq_len(cut)
  index.right <- index.bottom <- seq(from=cut+1,to=ncol(table))
  top <- cbind(table[index.top,index.left],"",table[index.top,index.right])
  bottom <- cbind(table[index.bottom,index.left],"",table[index.bottom,index.right])
  out <- rbind(top,"",bottom)
  colnames(out)[colnames(out)==""] <- " "
  return(out)
}

table <- stats::cor(coef,method="spearman")
rownames(table) <- colnames(table) <- names
black <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05)
star <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05/choose(n=length(codes),k=2))
nonnegative <- table>=0
table <- format(round(table,digits=2),digits=2,trim=TRUE)
table[nonnegative] <- paste0("\\phantom{-}",table[nonnegative])
table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}")
table[star] <- paste0(table[star],"$^\\star$")
table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}")
#table[nonnegative] <- paste0("-",table[nonnegative])
diag(table) <- "-"
table <- insert.space(table=table,cut=5)
xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Spearman correlation coefficients between the ridge regression coefficients from different datasets. Pairwise combinations of datasets with significantly correlated regression coefficients are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/28$). We expect a correlation coefficient close to $0$ for unrelated problems and close to $1$ for identical problems.",label="tab_cor")
xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_cor.tex")) #add.to.row=list(pos=list(5),command="\\hdashline \n")

table <- auc
rownames(table) <- colnames(table) <- names
table <- format(round(table,digits=2),digits=2)
black <- auc.pvalue<=0.05
star <- auc.pvalue<=0.05/(length(codes)*length(codes))
diag(table) <- paste0("(",diag(table),")")
table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}")
table[star] <- paste0(table[star],"$^\\star$")
table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}")
table <- insert.space(table=table,cut=5)
xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Out-of-sample area under the receiver operating characteristic curve (\\textsc{roc-auc}) from logistic ridge regression trained on the dataset in the row and tested on the dataset in the column (off-diagonal entries), or cross-validated \\textsc{roc-auc} from logistic lasso regression trained and tested on the same dataset by $10$-fold external cross-validation (diagonal entries, between brackets). The \\textsc{roc-auc} of a random classifier is $0.5$, while that of a perfect classifier is $1.0$. Entries on and off the diagonal are not comparable. Predictions that are significantly better than random predictions (according to the one-sided Mann-Whitney $U$ test for testing whether the ranks of the predicted probabilities are significantly higher for the cases than for the controls) are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/64$).",label="tab_auc")
xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_auc.tex"))

Transfer learning

This chunk performs the transfer learning analysis.

  • Requires: execution of chunks setup and define_projects, file recount3_data.RData in folder results (generated by chunk download_data), execution of chunk preprocess_data

  • Execution time: 1.5 hours

  • Ensures: application.RData (results) and info_app.txt (session information) in folder results

#<<setup>>
#<<define_projects>>
#<<preprocess_data>>

alpha.init <- 0.95; type <- "exp"

result <- list()
for(i in names(project)){
  cat("project:",i,"\n")
  result[[i]] <- list()
  for(j in seq_len(5)){ # 5 repetitions of 10-fold CV
    set.seed(j)
    codes <- project[[i]]
    result[[i]][[j]] <- crossval(y=y[codes],X=x[codes],family="binomial",method=c("glm.separate","glm.glmtrans","sparselink","glm.common","glm.xrnet"),nfolds=10,alpha=1,alpha.init=alpha.init,type=type)
  }
}
save(result,project,file=file.path(path,"results","application.RData"))

writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
      sessioninfo::session_info()),con=paste0(path,"/results/info_app.txt"))

This chunk generates the figure on the predictive performance.

  • Requires: execution of chunk setup, file application.RData in folder results (generated by chunk transfer_apply)

  • Execution time: 11 second

  • Ensures: file fig_app.eps in folder manuscript

#<<setup>>

grDevices::postscript(file=file.path(path,"manuscript","fig_app.eps"),width=6.5,height=4)
graphics::par(mfrow=c(2,1),mar=c(4,2,1,1),oma=c(0,0,0,0))
load(file.path(path,paste0("results/application.RData")))

reference <- "glm.glmtrans"
experimental <- "sparselink"

# predictivity
metric <- lapply(result,function(x) do.call(what="rbind",args=lapply(x,function(x) x$auc))) # DEV and AUC need different directions (increase=FALSE/TRUE)!
metric <- do.call(what="rbind",args=metric)
metric <- metric/metric[,"glm.separate"]
#xlab <- refs[rownames(metric)]
#names <- gsub(pattern="",replacement="\n",x=unlist(project))

label <- gsub(pattern="IBD.|RA.",replacement="",x=gsub(pattern=" ",replacement="\n",x=names(unlist(project))))
index <- match(x=rownames(metric),table=unlist(project))

xlab <- label[index]
change(x=xlab,y0=metric[,reference],y1=metric[,experimental],main="metric",increase=TRUE,cex.main=0.8)
graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2)
graphics::abline(h=0.5,lty=2,col="grey")
graphics::abline(h=1,lty=2,col="grey")

# sparsity
nzero <- lapply(result,function(x) lapply(x,function(x) sapply(x$refit$coef,function(x) colSums(x!=0))))
nzero <- do.call(what="rbind",args=do.call(what="c",args=nzero))
change(x=xlab,y0=nzero[,reference],y1=nzero[,experimental],main="sparsity",increase=FALSE,cex.main=0.8)
graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2)
graphics::abline(h=0,lty=2,col="grey")
grDevices::dev.off()

# percentage change
# (reported in section 4 "application" subsection 4.3 "transfer learning")

disease <- ifelse(rownames(metric) %in% project$IBD,"IBD",ifelse(rownames(metric) %in% project$RA,"RA",NA))

#round(100*colMeans(metric)-100,digits=2)
round(100*colMeans(metric[disease=="IBD",])-100,digits=2)
round(100*colMeans(metric[disease=="RA",])-100,digits=2)

colMeans(nzero)
colMeans(nzero[disease=="IBD",])
colMeans(nzero[disease=="RA",])

#--- revision: report AUC ---
Reduce(f="+",x=lapply(result$IBD,function(x) x$auc))/length(result$IBD)
lapply(result,function(x) round(colMeans(Reduce(f="+",x=lapply(x,function(x) x$auc))/length(result$IBD)),digits=3))

This chunk generates the figure on feature selection.

  • Requires: execution of chunk setup, file application.RData in folder results (generated by chunk transfer_apply)

  • Execution time: 11 second

  • Ensures: file fig_coef.eps in folder manuscript

#<<setup>>

load(file.path(path,"results","application.RData"))

coefs <- list()
for(i in seq_along(result$IBD)){
  coefs[[i]] <- result$IBD[[i]]$refit$coef$sparselink
  colnames(coefs[[i]]) <- names(project$IBD)
  rownames(coefs[[i]]) <- rownames(result$IBD[[1]]$refit$coef$glm.glmtrans) # try to avoid this
}

any <- rowSums(sapply(coefs,function(x) apply(x,1,function(x) any(x!=0))))!=0
for(i in seq_along(result$IBD)){
  coefs[[i]] <- coefs[[i]][any,]
}
table <- Reduce(f="+",x=coefs)/5

cex <- 0.7

grDevices::postscript(file=file.path(path,"manuscript","fig_coef.eps"),width=7,height=4)
graphics::par(mfrow=c(1,1),mar=c(2.5,4.5,0.5,1.5),oma=c(0,0,0,0))
graphics::plot.new()
graphics::plot.window(xlim=c(0.6,ncol(table)+0.4),ylim=c(0.5,nrow(table)+0.5))
col <- apply(table,1,function(x) ifelse(all(x<=0),"blue",ifelse(all(x>=0),"red","black")))
colnames <- gsub(x=colnames(table),pattern=" ",replacement="\n")
graphics::mtext(text=colnames,side=1,at=seq_len(ncol(table)),cex=cex,line=1)
rownames <- rownames(table)
graphics::mtext(text=rownames,side=2,at=seq_len(nrow(table)),las=2,cex=cex,line=0.7,col=col)
star <- rowSums(table!=0)>1
graphics::mtext(text="*",side=2,at=which(star),line=-0.3)
graphics::mtext(text=ifelse(col=="blue","-",ifelse(col=="red","+",".")),side=4,at=seq_len(nrow(table)),las=2,cex=cex,line=0.5,col=col)
for(i in seq_len(nrow(table))){
  for(j in seq_len(ncol(table))){
    for(k in 1:5){
      col <- ifelse(coefs[[k]][i,j]<0,"blue",ifelse(coefs[[k]][i,j]>0,"red","white"))
      cex <- pmax(sqrt(5*abs(coefs[[k]][i,j])),0.2)
      graphics::points(x=j-(-3+k)*0.17,y=i,col=col,cex=cex,pch=16)
    }
  }
}
graphics::abline(v=seq(from=0.5,to=5.5,by=1))
grDevices::dev.off()

Data-based simulation (under development)

rm(list=ls())
path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows)
#path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac)

dir <- c("results","manuscript","package/R/functions.R")
for(i in seq_along(dir)){
  if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){
    stop(paste0("Require folder/file'",dir[i],"'."))
  } 
}
source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package.

inst <- rownames(utils::installed.packages())
pkgs <- c("knitr","rmarkdown","glmnet","BiocManager")
for(i in seq_along(pkgs)){
  if(!pkgs[i]%in%inst){
    utils::install.packages(pkgs[i])
  }
}
pkgs <- c("recount3","edgeR")
for(i in seq_along(pkgs)){
  if(!pkgs[i]%in%inst){
    BiocManager::install(pkgs[i])
  }
}

blue <- "blue"; red <- "red"

if(exists("sim.app")&exists("fig.tab")){
  if(!sim.app&fig.tab){
    files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData")
    for(i in seq_along(files)){
      if(!file.exists(file.path(path,"results",files[i]))){
        stop("File",files[i],"is missing.")
      }
    }
  }
}
project <- list()
project$IBD <- c("Tew (2016)"="SRP063496",
                 "Haberman (2019)"="SRP129004",
                 "Verstockt (2019)"="ERP113396",
                 "Verstockt (2020)"="ERP114636",
                 "Boyd (2018)"="SRP100787")
project$RA <- c("Baker (2019)"="SRP169062",
                "Moncrieffe (2017)"="SRP074736",
                "Goldberg (2018)"="SRP155483")
extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091
#<<setup>>
#<<define_projects>>

load(file.path(path,"results/recount3_data.RData"))

#- - - - - - - - - - - - - - -
#- - - extract features  - - - 
#- - - - - - - - - - - - - - -

# extract features
x <- list()
for(i in c(unlist(project),extra)){
  counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts)
  colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name
  x[[i]] <- counts
}

# select most expressed protein-coding genes (for all TL projects together)
select <- list()
total <- numeric()
for(i in unlist(project)){
  total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering
  #total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # alternative: variance filtering
}
type <- SummarizedExperiment::rowData(data[[i]])$gene_type
cond <- type=="protein_coding"
total[,!cond] <- 0
rank <- apply(X=total,MARGIN=1,FUN=rank)
mean_rank <- rowMeans(rank)
temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # double-check (remove?) second condition
for(i in unlist(project)){
  select[[i]] <- temp
}

# select most expressed protein-coding genes (for MTL project)
mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean)
warning("change number in next line")
temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[100] # increase to 2000!
select[[extra]] <- temp

# pre-processing
for(i in c(unlist(project),extra)){
  lib.size <- Matrix::rowSums(x[[i]])
  x[[i]] <- x[[i]][,select[[i]],drop=FALSE]
  norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size)
  gamma <- norm.factors*lib.size/mean(lib.size)
  gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]]))
  x[[i]] <- x[[i]]/gamma
  x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform
  x[[i]] <- scale(x[[i]]) # scale because of different datasets!?
}

#- - - - - - - - - - - - - -
#- - - extract targets - - -
#- - - - - - - - - - - - - -

# extract information on samples
frame <- list()
for(i in c(unlist(project),extra)){
  list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|")
  data[[i]]$sra.experiment_attributes
  # What about sra.experiment_attributes?
  n <- length(list)
  cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1]))
  ncol <- length(cols)
  frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols))
  for(j in seq_len(n)){
    for(k in seq_len(ncol)){
      vector <- list[[j]]
      which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k])
      string <- vector[which]
      if(length(string)==0){next}
      frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2]
    }
  }
  frame[[i]] <- as.data.frame(frame[[i]])
}

# extract binary outcome
y <- z <- list()
for(i in unlist(project)){
  # CONTINUE HERE!!!
  if(i=="ERP113396"){
    y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid")))
  } else if(i=="ERP114636"){
    y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid")))
    warning("Inverting response and non-response!")
  } else if(i=="SRP100787"){
    y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid")))
  } else if(i=="SRP129004"){
    y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid")))
    suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`)))
  } else if(i=="SRP063496"){
    y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid")))
  } else if(i=="SRP169062"){
    y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid")))
  } else if(i=="SRP155483"){
    y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid")))
    z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid")))
  } else if(i=="SRP074736"){
    y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid")))
  }
}

# overlap
for(j in unlist(project)){
  is.na <- is.na(y[[j]])
  if(length(is.na)!=nrow(x[[j]])){stop()}
  y[[j]] <- y[[j]][!is.na]
  if(!is.null(z[[j]])){
    if(is.vector(z[[j]])){
      z[[j]] <- z[[j]][!is.na]
    } else {
      z[[j]] <- z[[j]][!is.na,]
    }
  }
  x[[j]] <- x[[j]][!is.na,]
}
source(file.path(path,"package/R/development.R"))

x <- x$SRP100787
y <- y$SRP100787

#-----------
#--- MTL ---
#-----------

prob <- c(0.00,0.05,0.10,0.15,0.20)

q <- 2
n <- nrow(x)
auc_mtl <- list()
for(i in seq_along(prob)){
  auc_mtl[[i]] <- list()
  iter <- 1
  for(j in seq_len(iter)){
    Y <- matrix(data=NA,nrow=n,ncol=q)
    for(k in seq_len(q)){
      Y[,k] <- abs(y - stats::rbinom(n=n,size=1,prob=prob[i]))
    }
    auc_mtl[[i]][[j]] <- cv.multiple(y=Y,X=x,method=c("glm.separate","sparselink"),family="binomial",alpha.init=0.95,alpha=1,type="exp")$auc
  }
}
# MTL is beneficial under prob=0.15

#----------
#--- TL ---
#----------

auc_tl <- list()
iter <- 3
cond <- rep(x=NA,times=n)
Y <- X <- list()
for(i in seq_len(iter)){
  n0 <- round(0.3*sum(y==0))
  n1 <- round(0.3*sum(y==1))
  cond[y==0] <- sample(rep=c(0,1),times=c(n0,n-n0)) #sample(rep(x=c(0,1),length.out=sum(y==0)))
  cond[y==1] <- sample(rep=c(0,1),times=c(n1,n-n1)) #sample(rep(x=c(0,1),length.out=sum(y==1)))
  Y <- list(y1=y[cond==0],y2=y[cond==1])
  X <- list(x1=x[cond==0,],x2=x[cond==1,])
  auc_tl[[i]] <- crossval(y=Y,X=X,family="binomial",alpha.init=0.95,alpha=1,type="exp")$auc
}

rowMeans(sapply(auc_mtl,colMeans))
rowMeans(sapply(auc_tl,colMeans))

#glm.separate   sparselink 
#   0.7054149    0.7067895 
#glm.separate glm.glmtrans   sparselink   glm.common 
#   0.6155303    0.7424242    0.7113636    0.7856061 
# Increase or decrease prop=0.1 to make MTL useful?

Multi-task learning (under development)

rm(list=ls())
path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows)
#path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac)

dir <- c("results","manuscript","package/R/functions.R")
for(i in seq_along(dir)){
  if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){
    stop(paste0("Require folder/file'",dir[i],"'."))
  } 
}
source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package.

inst <- rownames(utils::installed.packages())
pkgs <- c("knitr","rmarkdown","glmnet","BiocManager")
for(i in seq_along(pkgs)){
  if(!pkgs[i]%in%inst){
    utils::install.packages(pkgs[i])
  }
}
pkgs <- c("recount3","edgeR")
for(i in seq_along(pkgs)){
  if(!pkgs[i]%in%inst){
    BiocManager::install(pkgs[i])
  }
}

blue <- "blue"; red <- "red"

if(exists("sim.app")&exists("fig.tab")){
  if(!sim.app&fig.tab){
    files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData")
    for(i in seq_along(files)){
      if(!file.exists(file.path(path,"results",files[i]))){
        stop("File",files[i],"is missing.")
      }
    }
  }
}
project <- list()
project$IBD <- c("Tew (2016)"="SRP063496",
                 "Haberman (2019)"="SRP129004",
                 "Verstockt (2019)"="ERP113396",
                 "Verstockt (2020)"="ERP114636",
                 "Boyd (2018)"="SRP100787")
project$RA <- c("Baker (2019)"="SRP169062",
                "Moncrieffe (2017)"="SRP074736",
                "Goldberg (2018)"="SRP155483")
extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091
#<<setup>>
#<<define_projects>>

load(file.path(path,"results/recount3_data.RData"))

#- - - - - - - - - - - - - - -
#- - - extract features  - - - 
#- - - - - - - - - - - - - - -

# extract features
x <- list()
for(i in c(unlist(project),extra)){
  counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts)
  colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name
  x[[i]] <- counts
}

# select most expressed protein-coding genes (for all TL projects together)
select <- list()
total <- numeric()
for(i in unlist(project)){
  total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering
  #total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # alternative: variance filtering
}
type <- SummarizedExperiment::rowData(data[[i]])$gene_type
cond <- type=="protein_coding"
total[,!cond] <- 0
rank <- apply(X=total,MARGIN=1,FUN=rank)
mean_rank <- rowMeans(rank)
temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # double-check (remove?) second condition
for(i in unlist(project)){
  select[[i]] <- temp
}

# select most expressed protein-coding genes (for MTL project)
mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean)
warning("change number in next line")
temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[100] # increase to 2000!
select[[extra]] <- temp

# pre-processing
for(i in c(unlist(project),extra)){
  lib.size <- Matrix::rowSums(x[[i]])
  x[[i]] <- x[[i]][,select[[i]],drop=FALSE]
  norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size)
  gamma <- norm.factors*lib.size/mean(lib.size)
  gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]]))
  x[[i]] <- x[[i]]/gamma
  x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform
  x[[i]] <- scale(x[[i]]) # scale because of different datasets!?
}

#- - - - - - - - - - - - - -
#- - - extract targets - - -
#- - - - - - - - - - - - - -

# extract information on samples
frame <- list()
for(i in c(unlist(project),extra)){
  list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|")
  data[[i]]$sra.experiment_attributes
  # What about sra.experiment_attributes?
  n <- length(list)
  cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1]))
  ncol <- length(cols)
  frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols))
  for(j in seq_len(n)){
    for(k in seq_len(ncol)){
      vector <- list[[j]]
      which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k])
      string <- vector[which]
      if(length(string)==0){next}
      frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2]
    }
  }
  frame[[i]] <- as.data.frame(frame[[i]])
}

# extract binary outcome
y <- z <- list()
for(i in unlist(project)){
  # CONTINUE HERE!!!
  if(i=="ERP113396"){
    y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid")))
  } else if(i=="ERP114636"){
    y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid")))
    warning("Inverting response and non-response!")
  } else if(i=="SRP100787"){
    y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid")))
  } else if(i=="SRP129004"){
    y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid")))
    suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`)))
  } else if(i=="SRP063496"){
    y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid")))
  } else if(i=="SRP169062"){
    y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid")))
  } else if(i=="SRP155483"){
    y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid")))
    z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid")))
  } else if(i=="SRP074736"){
    y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid")))
  }
}

# overlap
for(j in unlist(project)){
  is.na <- is.na(y[[j]])
  if(length(is.na)!=nrow(x[[j]])){stop()}
  y[[j]] <- y[[j]][!is.na]
  if(!is.null(z[[j]])){
    if(is.vector(z[[j]])){
      z[[j]] <- z[[j]][!is.na]
    } else {
      z[[j]] <- z[[j]][!is.na,]
    }
  }
  x[[j]] <- x[[j]][!is.na,]
}
source(file.path(path,"package/R/development.R"))

#- - - - - - - - - - 
#- - prepare data - -
#- - - - - - - - - - 

# explore SRP048801

project <- "ERP104864"
if(project=="SRP155483"){
  X <- x$SRP155483
  vars <- c("disease activity","das score")
  Y <- frame$SRP155483[,vars]
  sapply(unique(Y$`disease activity`),function(x) range(as.numeric(Y$`das score`[Y$`disease activity`==x])))
  warning("DAS score defines disease activity!")
  problem <- list()
  problem$unique <- colnames(Y)
} else if(project=="SRP129004"){
  X <- x$SRP129004
  vars <- c("histology severity score","total mayo score")
  Y <- frame$SRP129004[,vars]
  names <- intersect(rownames(X),rownames(Y))
  X <- X[names,]
  Y <- Y[names,]
  cond <- frame$SRP129004[names,"diagnosis"]=="Ulcerative Colitis"
  X <- X[cond,]
  Y <- Y[cond,]
  problem <- list()
  problem$unique <- colnames(Y)
} else if(project=="ERP104864"){
  X <- x$ERP104864
  vars <- c("CCP","CRP","crp","DAS28","ESR","esr","HAQ","VAS","SWOLLEN","TENDER") # "inflammatory score" dropped due to NA # "RF" dropped because binary
  Y <- frame$ERP104864[,vars]
  pathotype <- frame$ERP104864$pathotype
  if(any(rownames(X)!=rownames(Y))){stop()}
  #Y <- Y[Y$pathotype=="lymphoid",vars]
  if(any(!is.na(Y$CRP)&!is.na(Y$crp))){
    stop("Invalid.")
  } else {
    Y$CRP[is.na(Y$CRP)] <- Y$crp[is.na(Y$CRP)]
    Y$crp <- NULL
  }
  if(any(!is.na(Y$ESR)&!is.na(Y$esr))){
    stop("Invalid.")
  } else {
    Y$ESR[is.na(Y$ESR)] <- Y$esr[is.na(Y$ESR)]
    Y$esr <- NULL
  }
  problem <- list()
  #problem$joint <- c("SWOLLEN","TENDER")
  #problem$proms <- c("VAS","HAQ")
  #problem$labor <- c("CRP","ESR") # CCP might be too different (check cor)
  problem$trial <- c("CRP","ESR","SWOLLEN")
  problem$trial <- c("DAS28","SWOLLEN")
  #problem$all <- colnames(y)
}

for(i in seq_len(ncol(Y))){
  class(Y[[i]]) <- "numeric"
}

cond <- apply(X=Y,MARGIN=1,FUN=function(x) all(!is.na(x))) #& pathotype=="lymphoid" # subtypes seem to be defined based on clinical scores
#Y <- as.matrix(Y)[cond,] # temporary: binarisation
y <- scale(as.matrix(Y)[cond,])
x <- X[cond,]

#- - - - - - - - - - - - - 
#- - explore similarity - -
#- - - - - - - - - - - - - 

cor <- stats::cor(y,use="pairwise.complete.obs",method="spearman")
round(x=cor,digits=2)
graphics::image(t(cor)[,ncol(cor):1])
#stats::heatmap(x=as.matrix(Y),Rowv=NA)
d <- stats::dist(x=t(Y))
hclust <- stats::hclust(d=d)
graphics::plot(hclust)

graphics::par(mfrow=c(2,4),mar=c(2,2,1,1))
for(i in seq_len(ncol(y))){
    pvalue <- apply(x,2,function(x) cor.test(x,y[,i],method="spearman")$p.value)
    graphics::hist(x=pvalue,main=colnames(y)[i])
}

#--- relationship between cor coef and cor p-value ---
coef <- stats::cor(y=y[,"CRP"],x=x,method="spearman")
pval <- apply(X=x,MARGIN=2,FUN=function(x) stats::cor.test(x=x,y=y[,"CRP"],method="spearman")$p.value)
graphics::plot(x=coef,y=sign(coef)*(-log10(pval))^0.5)
#--- conclusion: equivalent ---

#--- examine correlation between ridge coefficients ---
coef <- matrix(data=NA,nrow=ncol(x),ncol=ncol(y),dimnames=list(NULL,colnames(y)))
for(i in seq_len(ncol(y))){
  object <- glmnet::cv.glmnet(x=x,y=y[,i],family="gaussian",alpha=0)
  coef[,i] <- stats::coef(object=object,s="lambda.min")[-1]
}
round(stats::cor(coef,method="spearman"),digits=2)
#--- conclusion: some sets are strongly correlated ---

#cor(rowSums(scale(Y)),Y,method="spearman")

#- - - - - - - - - - - - - 
#- - cross-validation - -
#- - - - - - - - - - - - - 

alpha.init <- 0.95; alpha <- 1; type <- "exp"; trial <- FALSE
#alpha.init <- NA; alpha <- 1; type <- "exp"; trial <- TRUE
results <- list()
for(k in seq_along(problem)){
  results[[k]] <- list()
  for(i in seq_len(1)){ # 5 repetitions of 10-fold CV
    set.seed(i)
    method <- c("glm.empty","glm.separate","glm.mgaussian","group.devel") #"sparselink" "group.devel"
    results[[k]][[i]] <-  cv.multiple(y=y[,problem[[k]]],X=x,family="gaussian",method=method,nfolds=10,alpha=alpha,alpha.init=alpha.init,type=type,trial=trial)
  }
}

colMeans(results[[1]][[1]]$deviance/results[[1]][[1]]$deviance[,"glm.empty"])
names(results) <- names(problem)

colMeans(results$trial[[1]]$deviance/results$trial[[1]]$deviance[,"glm.empty"])

lapply(results,function(x) rowMeans(sapply(x,function(x) rowMeans(apply(x$deviance,1,rank)))))

object <- devel(y=y[,problem$trial],x=x,family="gaussian")

cm <- numeric()
for(k in seq_along(problem)){
  for(i in seq_len(1)){
    dev <- results[[k]][[i]]$deviance
    temp <- colMeans(dev/dev[,"glm.separate"])
    cm <- rbind(cm,temp)
    #cat(cm,"\n")
  }
}
colMeans(cm)

# separate, mgaussian, sparselink and devel have similar performance (i.e., MTL has no benefit with any of these three approaches), examine whether other approaches are better

# examine whether group lasso for TL/MTL performs better

# Consider using different candidate values, e.g., 0.01,0.5,1,1.5,2,10 also for sparselink. Removing 0 might be a good choice. (If 0 is not included, however, initial ridge regression might be better.)

#- - - - - - - - - - - -
#- - learning curve - -
#- - - - - - - - - - - -

size <- unique(c(seq(from=50,to=nrow(x),by=25),nrow(x)))
iter <- 10 # increase to 10
size <- nrow(x) # suppress learning curve

metric <- list()
setting <- expand.grid(problem=names(problem),size=size,iter=seq_len(iter))
for(i in seq_len(nrow(setting))){
  cat(progress=paste0(100*i/nrow(setting),"%"),"\n")
  cond <- sample(x=rep(x=c(FALSE,TRUE),times=c(nrow(x)-setting$size[i],setting$size[i])))
  vars <- problem[[setting$problem[i]]]
  metric[[i]] <- joinet:::cv.joinet(Y=y[cond,vars],X=x[cond,],family="gaussian",compare="mnorm")
  #method <- c("glm.empty","glm.separate","glm.mgaussian","sparselink")
  #metric[[i]] <-  cv.multiple(y=y[cond,vars],X=x[cond,],family="gaussian",method=method,nfolds=10,alpha.init=0.95,alpha=1,type="exp",trial=TRUE)
}
frac <- sapply(X=metric,FUN=function(x) colMeans(t(x)/x["none",]))

#frac <- sapply(X=metric,FUN=function(x) colMeans(x$deviance/x$deviance[,"glm.empty"]))

graphics::par(mfrow=c(1,length(problem)),mar=c(3,3,2,0.5))
for(i in seq_along(problem)){
  graphics::plot.new()
  graphics::plot.window(xlim=range(size),ylim=range(frac))
  graphics::box()
  graphics::title(main=names(problem)[i])
  graphics::axis(side=1)
  graphics::axis(side=2)
  graphics::abline(h=1,col="grey",lty=2)
  cond <- setting$problem==names(problem)[i]
  col <- c(base="red",mnorm="orange",meta="blue")
  #col <- c(glm.separate="red",sparselink="blue")
  for(j in names(col)){
    val <- tapply(X=frac[j,cond],INDEX=setting$size[cond],FUN=mean)
    graphics::points(x=setting$size[cond],y=frac[j,cond],col=col[j])
    graphics::lines(x=size,y=val,col=col[j],type="o",pch=16)
  }
}

#  JOINET is better than standard lasso when n>=100, performance is similar to spls and better than glmnet-mgaussian 

#--- --- --- --- --- ---
# binarisation ---
#--- --- --- --- --- ---


yy <- 1*cbind(y[,"CCP"]>20,y[,"CRP"]>10,y[,"DAS28"]>5)
metric <- list()
for(i in 1:10){
  metric[[i]] <- joinet:::cv.joinet(Y=yy,X=x,family="binomial")
}

#rowMeans(sapply(metric,function(x) rowMeans(x)))

#--- --- --- --- --- --- ---
#--- explore other datasets ---
#--- --- --- --- --- --- ---

#projects <- recount3::available_projects(organism="human")
#recount3::read_metadata()


#data <- xlsx::read.xlsx(paste0(path,"/results/PPMI_Curated_Data_Cut_Public_20250321.xlsx"),sheetIndex=1)
data <- read.csv(paste0(path,"/results/PPMI_Curated_Data_Cut_Public_20250321.csv"))

# baseline data (predictors)
x <- data[data$COHORT==1 & data$EVENT_ID=="BL",]
rownames(x) <- x$PATNO
prop.miss <- colMeans(is.na(x))
x <- x[,prop.miss<=0.5 & sapply(x,class) %in% c("numeric","integer")]
x <- x[,which(colnames(x)=="age"):ncol(x)]
x <- missRanger::missRanger(data=x,pmm.k=3,num.trees=100,verbose=0,seed=1)
x <- scale(x)

# follow-up data (outcomes)
visit <- c("V04","V06","V08")
y <- data[data$COHORT==1 & data$EVENT_ID %in% visit,]
colnames(y)[colnames(y)=="updrs_totscore"] <- "updrs"
vars <- c("moca","quip","updrs","gds","scopa","ess","bjlot","rem")
y <- y[,c("EVENT_ID","PATNO",vars)]
y <- reshape(data=y,idvar="PATNO",timevar="EVENT_ID",direction="wide")
rownames(y) <- y$PATNO; y$PATNO <- NULL

# overlap
names <- intersect(rownames(x),rownames(y))
x <- x[names,]
y <- y[names,]
y <- as.matrix(y)
#y <- scale(y)

metric <- list()
for(i in seq_along(vars)){
  cat(vars[i],"\n")
  cols <- paste0(vars[i],".",visit)
  cond <- rowSums(is.na(y[,cols]))==0 # temporary
  if(sum(cond)>250){cond[cumsum(cond)>250] <- FALSE} # temporary
  metric[[i]] <- joinet:::cv.joinet(Y=y[cond,cols],X=x[cond,],family="gaussian",compare="mnorm")
  #method <- c("glm.empty","glm.separate","glm.mgaussian","sparselink","devel") # ,"sparselink")
  #metric[[i]] <- cv.multiple(y=y[cond,cols],X=x[cond,],family="gaussian",method=method,nfolds=10,alpha.init=0.95,alpha=1,type="exp",trial=TRUE)
}

#rowMeans(sapply(metric,function(x) colMeans(t(x)/x["none",])))
rowMeans(sapply(metric,function(x) colMeans(x$deviance/x$deviance[,"glm.empty"])))

This chunk saves the session information for generating figures and tables.