Loading dataset files

setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #set to current directory

labels=read.csv("labels.csv")
features=read.csv("tpm_expression.csv")
covariates=read.csv("tcga_pancancer_lung_cov.csv")

Formatting

Adjusting rows and columns names and NAs in covariates:

#labels
labels[,2]=factor(labels[,2]) #LABELS FACTORIZED ALL TOGETHER AT BEGINNING
colnames(labels)=c("ID","Label")
#features
row.names(features)=features[,1]
features=features[,-1]
#covariates
row.names(covariates)=covariates[,1]
covariates=as.data.frame(t(covariates[,-1]))
covariates[covariates == "[Not Applicable]" | covariates == "[Not Available]"] <- NA

Preprocessing

Train/test split:

#train/test split
set.seed(123)
train_index <- sample(nrow(labels), nrow(labels) * 0.8)

Eventual data transformation to bring data towards normality/stabilising variance:

#leave to all false ---> no transformation
#or select TRUE (at most one) ----> perform selected transformation
log2transf=FALSE
log10transf=FALSE
deseqVST=FALSE

if (log2transf) {
  features=log2(1+features) #add +1 to avoid Inf 
}

if (log10transf) {
  features=log10(1+features)
}

if (deseqVST) {
  features <- DESeq2::DESeqDataSetFromMatrix(countData=features, 
               colData=data.frame(condition=rep(1,ncol(traindata$x))),design = ~ 1)
  features <- vst(dds_train, blind=TRUE)
  features <- as.matrix(assay(vst_train))
}

Organizing data in the format PAMR algorithms wants that is:

“A list with components: x- an expression genes in the rows, samples in the columns), and y- a vector of the class labels for each sample. Optional components- genenames, a vector of gene names, and geneid- a vector of gene identifiers.” (from ?pamr.train)

traindata=list()
traindata$y=labels[train_index,2]
traindata$x=as.matrix(features[,train_index])
traindata$geneid=rownames(features)
testdata=list()
testdata$y=labels[-train_index,2]
testdata$x=as.matrix(features[,-train_index])
testdata$geneid=rownames(features)
rm(features) #remove features to empty memory space
print(paste0("Nr. training obvs: ", length(traindata$y)))
## [1] "Nr. training obvs: 872"
print(paste0("Nr. test obvs: ", length(testdata$y)))
## [1] "Nr. test obvs: 219"

Fitting

library(pamr)
## Loading required package: cluster
## Loading required package: survival
pamr=pamr.train(traindata) #first fit
## 123456789101112131415161718192021222324252627282930
thres=pamr.adaptthresh(pamr) #find best scaling factor for each class
## Initial errors: 219.26667  20.83333   4.86667 Roc 450.5882 
## Update 1 
## 123456789101112131415161718192021222324252627282930
## Errors 231.90000  18.46667  11.93333 Roc 442.6114 
## Update 2 
## 123456789101112131415161718192021222324252627282930
## Errors 244.46667  15.63333  18.66667 Roc 431.2626 
## Update 3 
## 123456789101112131415161718192021222324252627282930
## Errors 245.30000  13.96667  24.00000 Roc 412.5635 
## Update 4 
## 123456789101112131415161718192021222324252627282930
## Errors 245.26667  13.06667  29.06667 Roc 390.527 
## Update 5 
## 123456789101112131415161718192021222324252627282930
## Errors 245.13333  12.36667  34.00000 Roc 366.911 
## Update 6 
## 123456789101112131415161718192021222324252627282930
## Errors 245.30000  12.03333  38.36667 Roc 353.8755 
## Update 7 
## 123456789101112131415161718192021222324252627282930
## Errors 245.43333  11.56667  41.66667 Roc 350.5562 
## Update 8 
## 123456789101112131415161718192021222324252627282930
## Errors 245.26667  11.36667  45.03333 Roc 346.8804 
## Update 9 
## 123456789101112131415161718192021222324252627282930
## Errors 244.56667  11.30000  49.06667 Roc 343.6389 
## Update 10 
## 123456789101112131415161718192021222324252627282930
## Errors 243.7667  11.2000  51.7000 Roc 338.56
pamr=pamr.train(traindata, threshold.scale = thres) #refit with adaptive threshold
## 123456789101112131415161718192021222324252627282930
pamrcv=pamr.cv(pamr, traindata) #show performance of models for different threshold in CV
## 123Fold 1 :123456789101112131415161718192021222324252627282930
## Fold 2 :123456789101112131415161718192021222324252627282930
## Fold 3 :123456789101112131415161718192021222324252627282930
## Fold 4 :123456789101112131415161718192021222324252627282930
## Fold 5 :123456789101112131415161718192021222324252627282930
## Fold 6 :123456789101112131415161718192021222324252627282930
## Fold 7 :123456789101112131415161718192021222324252627282930
## Fold 8 :123456789101112131415161718192021222324252627282930
## Fold 9 :123456789101112131415161718192021222324252627282930
## Fold 10 :123456789101112131415161718192021222324252627282930
pamrcv
## Call:
## pamr.cv(fit = pamr, data = traindata)
##    threshold nonzero errors
## 1   0.000    22830   60    
## 2   0.842    14931   65    
## 3   1.684    11450   62    
## 4   2.527     8811   58    
## 5   3.369     6760   57    
## 6   4.211     5258   56    
## 7   5.053     4145   54    
## 8   5.895     3230   51    
## 9   6.737     2456   52    
## 10  7.580     1897   48    
## 11  8.422     1446   46    
## 12  9.264     1119   46    
## 13 10.106      891   44    
## 14 10.948      644   43    
## 15 11.790      481   43    
## 16 12.633      343   40    
## 17 13.475      243   42    
## 18 14.317      180   40    
## 19 15.159      144   42    
## 20 16.001      104   42    
## 21 16.844       75   45    
## 22 17.686       45   52    
## 23 18.528       31   50    
## 24 19.370       24   56    
## 25 20.212       16   78    
## 26 21.054       12   118   
## 27 21.897        7   134   
## 28 22.739        5   137   
## 29 23.581        1   270   
## 30 24.423        0   452
pamr.plotcv(pamrcv) #plotting results

Two threshold values seems interesting to further evaluate. - One is around \(To\), where the lowest misclassification error rate seems to happen (with around 200 genes selected for classification) - The other one between \(Th\) and \(Th\) before the misclassification error rate start to rise drastically especially for the “NORMAL” samples.

Evaluating results through classmap graphical displays

Selection of two threshold value to evaluate

\(To\) - threshold “optimal” \(Th\) - threshold “higher” , suboptimal (but still errors do not explode), with fewer genes selected

To=14.3  #  Optimal threshold
Th=19.3  #   Higher threshold

In training set

library(classmap) 
## Warning: package 'classmap' was built under R version 4.2.3
library(classmapExt) #package that extends classmap functionalities

Using vcr.pamr.train function to produce all the quantities needed for the visualization of the two models with the two thresholds:

vcrtrainTo=vcr.pamr.train(data=traindata, pamrfit = pamr, threshold=To)
vcrtrainTh=vcr.pamr.train(data=traindata, pamrfit = pamr, threshold=Th)

Saving also the index of the retained genes for classification at the two thresholds (to run another classification algorithm that does not perform also feature selection):

selectedgenes=pamr.listgenes(pamr, traindata, threshold = vcrtrainTo$threshold)
##        id           LUAD-score LUSC-score NORM-score
##   [1,] SFTPC        0          0          2.244     
##   [2,] AGER         0          0          1.0194    
##   [3,] CLEC3B       0          0          0.5766    
##   [4,] C13orf15     0          0          0.5065    
##   [5,] TRIM29       -0.3787    0          0         
##   [6,] EMP2         0          0          0.3709    
##   [7,] PVRL1        -0.3467    0          0         
##   [8,] MUC1         0.3407     0          0         
##   [9,] PERP         -0.3328    0          0         
##  [10,] FHL1         0          0          0.3269    
##  [11,] KRT5         -0.3185    0          0         
##  [12,] NKX2-1       0.3123     0          0         
##  [13,] CD63         0.2996     0          0         
##  [14,] IRF6         -0.2612    0          0         
##  [15,] GOLM1        0.258      0          0         
##  [16,] RBM47        0.2564     0          0         
##  [17,] TP63         -0.2541    0          0         
##  [18,] TNNC1        0          0          0.2311    
##  [19,] ABCC3        0.23       0          0         
##  [20,] LOC440335    0.2239     0          0         
##  [21,] PKP1         -0.2223    0          0         
##  [22,] ERBB3        0.2215     0          0         
##  [23,] C17orf28     0.2203     0          0         
##  [24,] PRSS8        0.2131     0          0         
##  [25,] ATP1B3       -0.2084    0          0         
##  [26,] KRT6A        -0.2026    0          0         
##  [27,] DSC3         -0.2024    0          0         
##  [28,] ACSL5        0.202      0          0         
##  [29,] MVP          0.1989     0          0         
##  [30,] EPCAM        0.1952     0          0         
##  [31,] DDAH1        0.1822     0          0         
##  [32,] GJB5         -0.1738    0          0         
##  [33,] JAG1         -0.1726    0          0         
##  [34,] SERPINB5     -0.1724    0          0         
##  [35,] TMC5         0.1707     0          0         
##  [36,] ST6GALNAC2   -0.1694    0          0         
##  [37,] PRR15L       0.163      0          0         
##  [38,] CEACAM6      0.1545     0          0         
##  [39,] LLGL2        0.1497     0          0         
##  [40,] ZDHHC9       0.1456     0          0         
##  [41,] RORC         0.1436     0          0         
##  [42,] SH3BP1       -0.1433    0          0         
##  [43,] CLDN3        0.1401     0          0         
##  [44,] QSOX1        0.1392     0          0         
##  [45,] MLPH         0.1386     0          0         
##  [46,] DSG3         -0.1378    0          0         
##  [47,] SLC2A1       -0.1377    0          0         
##  [48,] PTP4A2       0.1337     0          0         
##  [49,] CNBP         -0.1311    0          0         
##  [50,] ICA1         0.1295     0          0         
##  [51,] CD9          -0.1281    0          0         
##  [52,] TMEM125      0.124      0          0         
##  [53,] ERGIC1       0.1232     0          0         
##  [54,] CD46         0.1228     0          0         
##  [55,] CD2AP        0.1222     0          0         
##  [56,] PON2         0.1195     0          0         
##  [57,] TMEM63A      0.1192     0          0         
##  [58,] MIR205HG     -0.1185    0          0         
##  [59,] SMPDL3B      0.1177     0          0         
##  [60,] XXYLT1       -0.1171    0          0         
##  [61,] SFTA3        0.117      0          0         
##  [62,] PTTG1IP      0.1146     0          0         
##  [63,] TMEM189      -0.1137    0          0         
##  [64,] NEK6         0.1131     0          0         
##  [65,] CYB561D2     0.112      0          0         
##  [66,] GALE         0.1115     0          0         
##  [67,] SFTA2        0.1114     0          0         
##  [68,] TMEM62       0.1111     0          0         
##  [69,] KCNK5        0.111      0          0         
##  [70,] ZNF385A      -0.1103    0          0         
##  [71,] SLC6A8       -0.1099    0          0         
##  [72,] EFNB1        -0.1062    0          0         
##  [73,] TMEM8A       0.1039     0          0         
##  [74,] SFN          -0.1039    0          0         
##  [75,] WFDC2        0.1029     0          0         
##  [76,] HDGFRP3      -0.1017    0          0         
##  [77,] SIAH2        -0.1008    0          0         
##  [78,] LGALS3BP     0.097      0          0         
##  [79,] CSDA         -0.0956    0          0         
##  [80,] PNKD         0.0954     0          0         
##  [81,] CALML3       -0.094     0          0         
##  [82,] MMP15        0.093      0          0         
##  [83,] LOC100129034 0.0912     0          0         
##  [84,] ATP11A       0.0911     0          0         
##  [85,] GPC1         -0.0897    0          0         
##  [86,] CGN          0.0878     0          0         
##  [87,] CYB561       0.0875     0          0         
##  [88,] RASSF7       0.0842     0          0         
##  [89,] PECAM1       0          0          0.0833    
##  [90,] VSNL1        -0.083     0          0         
##  [91,] TJP3         0.0824     0          0         
##  [92,] CAPN8        0.0823     0          0         
##  [93,] C1orf210     0.0815     0          0         
##  [94,] ST3GAL5      0.0811     0          0         
##  [95,] TMEM150A     0.077      0          0         
##  [96,] ARL6IP5      0.0743     0          0         
##  [97,] GALM         0.074      0          0         
##  [98,] FAM107B      0.0725     0          0         
##  [99,] FYTTD1       -0.0714    0          0         
## [100,] KRT17        -0.0713    0          0         
## [101,] SUCLG2       0.071      0          0         
## [102,] FZD6         -0.0704    0          0         
## [103,] SNAI2        -0.0702    0          0         
## [104,] FSCN1        -0.07      0          0         
## [105,] FAM162A      -0.0682    0          0         
## [106,] ARF4         0.0671     0          0         
## [107,] ARPC2        -0.067     0          0         
## [108,] CLDN7        0.066      0          0         
## [109,] ACTL6A       -0.0639    0          0         
## [110,] TCEA3        0.0609     0          0         
## [111,] MICALL1      -0.0589    0          0         
## [112,] OCIAD2       0.0585     0          0         
## [113,] PCYT1A       -0.0578    0          0         
## [114,] HNF1B        0.0566     0          0         
## [115,] NAPSA        0.055      0          0         
## [116,] C4orf34      0.0544     0          0         
## [117,] GPR87        -0.0535    0          0         
## [118,] ITGA6        -0.0526    0          0         
## [119,] BMP7         -0.0516    0          0         
## [120,] CLDND1       -0.0503    0          0         
## [121,] TUBA1C       -0.0503    0          0         
## [122,] TMEM59       0.0495     0          0         
## [123,] SLC22A31     0.0487     0          0         
## [124,] TMEM219      0.0477     0          0         
## [125,] PSMD2        -0.0459    0          0         
## [126,] PRNP         -0.0452    0          0         
## [127,] CLCA2        -0.0451    0          0         
## [128,] LPCAT1       0.0441     0          0         
## [129,] FUCA1        0.0432     0          0         
## [130,] TRA2B        -0.0425    0          0         
## [131,] GMPS         -0.0422    0          0         
## [132,] PDCD10       -0.0409    0          0         
## [133,] TUBB6        -0.0398    0          0         
## [134,] MRPL47       -0.0395    0          0         
## [135,] JUP          -0.0378    0          0         
## [136,] ANXA11       0.0365     0          0         
## [137,] TIMMDC1      -0.0364    0          0         
## [138,] RAB7A        -0.0363    0          0         
## [139,] TMEM45B      0.0361     0          0         
## [140,] CD151        0.0359     0          0         
## [141,] CORO1C       -0.0332    0          0         
## [142,] RGL3         0.0332     0          0         
## [143,] SOX15        -0.0329    0          0         
## [144,] TSPAN15      0.0324     0          0         
## [145,] EMB          0.0321     0          0         
## [146,] TMC4         0.0316     0          0         
## [147,] SPDEF        0.0307     0          0         
## [148,] DLG1         -0.0297    0          0         
## [149,] NAA50        -0.0277    0          0         
## [150,] C3orf37      -0.0276    0          0         
## [151,] GOLT1A       0.0263     0          0         
## [152,] BICD2        -0.0255    0          0         
## [153,] PTDSS1       -0.0242    0          0         
## [154,] MRPS22       -0.0236    0          0         
## [155,] UFC1         0.0235     0          0         
## [156,] TRIM8        0.0229     0          0         
## [157,] CCNJL        0.0208     0          0         
## [158,] ZDHHC16      0.019      0          0         
## [159,] FAM83B       -0.0184    0          0         
## [160,] HRAS         -0.0179    0          0         
## [161,] SLC9A3R1     -0.0175    0          0         
## [162,] KRTCAP2      0.0172     0          0         
## [163,] PBXIP1       0.0155     0          0         
## [164,] SIGIRR       0.0153     0          0         
## [165,] TRAM1        0.0142     0          0         
## [166,] EFCAB4A      0.0129     0          0         
## [167,] CD44         -0.0109    0          0         
## [168,] GNPTG        0.0096     0          0         
## [169,] LMAN2        0.0095     0          0         
## [170,] AP2M1        -0.0095    0          0         
## [171,] GPRC5C       0.0077     0          0         
## [172,] KRT7         0.0074     0          0         
## [173,] RFC4         -0.0044    0          0         
## [174,] BSPRY        0.0028     0          0         
## [175,] RAB17        0.0019     0          0         
## [176,] DPP4         0.0015     0          0         
## [177,] FRMD6        -0.0012    0          0         
## [178,] SLC34A2      8e-04      0          0         
## [179,] SLC43A3      5e-04      0          0         
## [180,] CLIC6        2e-04      0          0
selectedgenes=as.vector(selectedgenes[,1])
sgenes_indexTo <- rownames(traindata$x) %in% selectedgenes
selectedgenes=pamr.listgenes(pamr, traindata, threshold = vcrtrainTh$threshold)
##       id        LUAD-score LUSC-score NORM-score
##  [1,] SFTPC     0          0          0.6671    
##  [2,] TRIM29    -0.1894    0          0         
##  [3,] PVRL1     -0.1574    0          0         
##  [4,] MUC1      0.1514     0          0         
##  [5,] PERP      -0.1435    0          0         
##  [6,] KRT5      -0.1291    0          0         
##  [7,] NKX2-1    0.1229     0          0         
##  [8,] CD63      0.1103     0          0         
##  [9,] IRF6      -0.0719    0          0         
## [10,] GOLM1     0.0687     0          0         
## [11,] RBM47     0.067      0          0         
## [12,] TP63      -0.0647    0          0         
## [13,] ABCC3     0.0406     0          0         
## [14,] LOC440335 0.0346     0          0         
## [15,] PKP1      -0.033     0          0         
## [16,] ERBB3     0.0322     0          0         
## [17,] C17orf28  0.031      0          0         
## [18,] PRSS8     0.0237     0          0         
## [19,] ATP1B3    -0.0191    0          0         
## [20,] KRT6A     -0.0132    0          0         
## [21,] DSC3      -0.013     0          0         
## [22,] ACSL5     0.0127     0          0         
## [23,] MVP       0.0095     0          0         
## [24,] EPCAM     0.0058     0          0
selectedgenes=as.vector(selectedgenes[,1])
sgenes_indexTh<- rownames(traindata$x) %in% selectedgenes
sgenes_index=data.frame(sgenes_indexTo,sgenes_indexTh)
write.csv(sgenes_index, "sgenes_index.csv" , row.names = FALSE)

Silhouette plot (and confusion matrix)

cfm=caret::confusionMatrix(factor(vcrtrainTo$pred),factor(vcrtrainTo$y))
cfm$table
##           Reference
## Prediction LUAD LUSC NORM
##       LUAD  380   23    1
##       LUSC   10  377    0
##       NORM    2    3   76
silplot(vcrtrainTo, main=paste0("(Training) Silplot at threshold: ",To))
##  classNumber classLabel classSize classAveSi
##            1       LUAD       392       0.91
##            2       LUSC       403       0.87
##            3       NORM        77       0.98

cfm=caret::confusionMatrix(factor(vcrtrainTh$pred),factor(vcrtrainTh$y))
cfm$table
##           Reference
## Prediction LUAD LUSC NORM
##       LUAD  384   36   10
##       LUSC    5  366    0
##       NORM    3    1   67
silplot(vcrtrainTh, main=paste0("(Training) Silplot at threshold: ",Th))
##  classNumber classLabel classSize classAveSi
##            1       LUAD       392       0.50
##            2       LUSC       403       0.56
##            3       NORM        77       0.58

Classmap plot

par(mfrow=c(1,3))
classmap(vcrtrainTo, whichclass = 1)
classmap(vcrtrainTo, whichclass = 2)
classmap(vcrtrainTo, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",To), line=-5, side=3, outer=TRUE, cex=1)

par(mfrow=c(1,3))
classmap(vcrtrainTh, whichclass = 1)
classmap(vcrtrainTh, whichclass = 2)
classmap(vcrtrainTh, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",Th), line=-5, side=3, outer=TRUE, cex=1)

MDScolorscale plot

#add a way to print in markdown documentation of mdscolorscale??
mdscolorscale(vcrtrainTo, diss=vcrtrainTo$pwd, main=
                paste0("(Train) MDScolorscale at thresh: ",To))
mdscolorscale(vcrtrainTh, diss=vcrtrainTh$pwd, main=
                paste0("(Train) MDScolorscale at thresh: ",Th))

Quasi-residual plots

Just for threshold \(Th\)

On continuos covariates:

index_continus_covariates=c(1,4,5,14,18,26,27,28,44)

par(mfrow=c(3,3))

for (i in index_continus_covariates) {
  continuos_covariate=as.numeric(unlist(covariates[train_index,i], use.names = FALSE))
  qresplot(vcrtrainTh$PAC,continuos_covariate, plotErrorBars = TRUE, main = colnames(covariates)[i])
}

We can also plot some discrete covariates coding them to integers values:

index_discrete_covariates=c(2,22)

par(mfrow=c(1,2))

legend=data.frame(matrix(nrow = 20, ncol = 0))

for (i in index_discrete_covariates) {
  discrete_covariate=factor(unlist(covariates[train_index,i], use.names = FALSE)) 
  levels=levels(discrete_covariate)
  if (length(levels) < 20) {
    levels <- c(levels, rep(NA, 20 - length(levels)))
  }
  legend=cbind(legend,levels)
  if(i!=22) {name="Stage"} else {name="Histology code"}
  qresplot(vcrtrainTh$PAC,as.numeric(discrete_covariate), plotErrorBars = TRUE,
             main = name)
}

legend=cbind(c(1:20), legend)
colnames(legend)=c("Integer coded","Stage","Histology code")

knitr::kable(legend, caption = "Legend")
Legend
Integer coded Stage Histology code
1
2 STAGE I 8052/3
3 STAGE IA 8070/3
4 STAGE IB 8071/3
5 STAGE II 8072/3
6 STAGE IIA 8073/3
7 STAGE IIB 8083/3
8 STAGE III 8140/3
9 STAGE IIIA 8230/3
10 STAGE IIIB 8250/3
11 STAGE IV 8252/3
12 NA 8253/3
13 NA 8255/3
14 NA 8260/3
15 NA 8310/3
16 NA 8480/3
17 NA 8490/3
18 NA 8507/3
19 NA 8550/3
20 NA NA

On selected genes for the classification in dataset:

selectedgenes=which(sgenes_index$sgenes_indexTh)

#in all the genes
coeff=rep(0,nrow(traindata$x[selectedgenes,]))

#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(traindata$x[selectedgenes,], 1, function(x) {
    unique_count <- length(unique(x))
    zero_count <- sum(x == 0)
    return(unique_count >= 7 & zero_count / length(x) < 0.35)}))

for (i in index_nonsparse) {
  lm=lm(vcrtrainTh$PAC~traindata$x[i,])
  coeff[i]=lm$coefficients[2]
}

sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 11 18  8 24  3  4  2 20  5
sorted_coeff[1:9]
## [1] 0.220692155 0.113886093 0.063452879 0.021180450 0.013086051 0.011393740 0.003220045 0.002280854 0.001019688
par(mfrow=c(3,3))

for (i in sorted_indices[1:9]) {
  gene=traindata$x[i,]
  qresplot(vcrtrainTh$PAC,gene,plotErrorBars = TRUE, main = rownames(traindata$x)[i])
}

On all genes in dataset:

#in all the genes
coeff=rep(0,nrow(traindata$x))

#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
index_nonsparse <- which(apply(traindata$x, 1, function(x) length(unique(x)) >= 8)) 

#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(traindata$x, 1, function(x) {
    unique_count <- length(unique(x))
    zero_count <- sum(x == 0)
    return(unique_count >= 7 & zero_count / length(x) < 0.35)}))

for (i in index_nonsparse) {
  lm=lm(vcrtrainTh$PAC~traindata$x[i,])
  coeff[i]=lm$coefficients[2]
}

sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 11529 17404 13647  2943  1516 10195  4907 16301 10933
sorted_coeff[1:9]
## [1] 1.0471477 0.7730503 0.5717340 0.5355485 0.5255370 0.4841109 0.4669274 0.4653891 0.4572245
par(mfrow=c(3,3))

for (i in sorted_indices[1:9]) {
  gene=traindata$x[i,]
  qresplot(vcrtrainTh$PAC,gene,plotErrorBars = TRUE, main = rownames(traindata$x)[i])
}

In test set

Using vcr.pamr.newdata function to produce all the quantities needed for the visualization of the two models with the two thresholds:

vcrtestTo=vcr.pamr.newdata(newdata=testdata, vcr.pamr.train.out = vcrtrainTo)
vcrtestTh=vcr.pamr.newdata(newdata=testdata, vcr.pamr.train.out = vcrtrainTh)

Silhouette plot (and confusion matrix)

cfm=caret::confusionMatrix(factor(vcrtestTo$pred),factor(vcrtestTo$ynew))
cfm$table
##           Reference
## Prediction LUAD LUSC NORM
##       LUAD   95    6    0
##       LUSC   10   82    0
##       NORM    4    0   22
silplot(vcrtestTo, main=paste0("(Test) Silplot at threshold: ",To))
##  classNumber classLabel classSize classAveSi
##            1       LUAD       109       0.74
##            2       LUSC        88       0.87
##            3       NORM        22       1.00

cfm=caret::confusionMatrix(factor(vcrtestTh$pred),factor(vcrtestTh$ynew))
cfm$table
##           Reference
## Prediction LUAD LUSC NORM
##       LUAD  101    7    2
##       LUSC    6   81    0
##       NORM    2    0   20
silplot(vcrtrainTh, main=paste0("(Test) Silplot at threshold: ",Th))
##  classNumber classLabel classSize classAveSi
##            1       LUAD       392       0.50
##            2       LUSC       403       0.56
##            3       NORM        77       0.58

Classmap plot

par(mfrow=c(1,3))
classmap(vcrtestTo, whichclass = 1)
classmap(vcrtestTo, whichclass = 2)
classmap(vcrtestTo, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",To), line=-5, side=3, outer=TRUE, cex=1)

par(mfrow=c(1,3))
classmap(vcrtestTh, whichclass = 1)
classmap(vcrtestTh, whichclass = 2)
classmap(vcrtestTh, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",Th), line=-5, side=3, outer=TRUE, cex=1)

MDScolorscale plot

mdscolorscale(vcrtestTo, diss=vcrtestTo$pwd, main=
                paste0("(Test) MDScolorscale at thresh: ",To))
mdscolorscale(vcrtestTh, diss=vcrtestTh$pwd, main=
                paste0("(Test) MDScolorscale at thresh: ",Th))

Quasi-residual plots

Just for threshold \(Th\)

On continuos covariates:

index_continus_covariates=c(1,4,5,14,18,26,27,28,44)

par(mfrow=c(3,3))

for (i in index_continus_covariates) {
  continuos_covariate=as.numeric(unlist(covariates[-train_index,i], use.names = FALSE))
  qresplot(vcrtestTh$PAC,continuos_covariate, plotErrorBars = TRUE, main = colnames(covariates)[i])
}

We can also plot some discrete covariates coding them to integers values:

index_discrete_covariates=c(2,22)

par(mfrow=c(1,2))

legend=data.frame(matrix(nrow = 20, ncol = 0))

for (i in index_discrete_covariates) {
  discrete_covariate=factor(unlist(covariates[-train_index,i], use.names = FALSE)) 
  levels=levels(discrete_covariate)
  if (length(levels) < 20) {
    levels <- c(levels, rep(NA, 20 - length(levels)))
  }
  legend=cbind(legend,levels)
  if(i!=22) {name="Stage"} else {name="Histology code"}
  qresplot(vcrtestTh$PAC,as.numeric(discrete_covariate), plotErrorBars = TRUE,
             main = name)
}

legend=cbind(c(1:20), legend)
colnames(legend)=c("Integer coded","Stage","Histology code")

knitr::kable(legend, caption = "Legend")
Legend
Integer coded Stage Histology code
1 STAGE I 8052/3
2 STAGE IA 8070/3
3 STAGE IB 8071/3
4 STAGE II 8083/3
5 STAGE IIA 8140/3
6 STAGE IIB 8230/3
7 STAGE IIIA 8252/3
8 STAGE IIIB 8255/3
9 STAGE IV 8260/3
10 NA 8310/3
11 NA 8480/3
12 NA 8550/3
13 NA NA
14 NA NA
15 NA NA
16 NA NA
17 NA NA
18 NA NA
19 NA NA
20 NA NA

On selected genes for the classification in dataset:

selectedgenes=which(sgenes_index$sgenes_indexTh)

#in all the genes
coeff=rep(0,nrow(testdata$x[selectedgenes,]))

#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(testdata$x[selectedgenes,], 1, function(x) {
    unique_count <- length(unique(x))
    zero_count <- sum(x == 0)
    return(unique_count >= 7 & zero_count / length(x) < 0.35)}))

for (i in index_nonsparse) {
  lm=lm(vcrtestTh$PAC~testdata$x[i,])
  coeff[i]=lm$coefficients[2]
}

sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 18 24  4  8  3  5  2 14 21
sorted_coeff[1:9]
## [1] 0.0489389471 0.0387591959 0.0184546417 0.0148206676 0.0113137019 0.0099551486 0.0019554276 0.0008352847 0.0003943720
par(mfrow=c(3,3))

for (i in sorted_indices[1:9]) {
  gene=testdata$x[i,]
  qresplot(vcrtestTh$PAC,gene,plotErrorBars = TRUE, main = rownames(testdata$x)[i])
}

On all genes in dataset:

#in all the genes
coeff=rep(0,nrow(testdata$x))

#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
index_nonsparse <- which(apply(testdata$x, 1, function(x) length(unique(x)) >= 8)) 

#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(testdata$x, 1, function(x) {
    unique_count <- length(unique(x))
    zero_count <- sum(x == 0)
    return(unique_count >= 7 & zero_count / length(x) < 0.35)}))

for (i in index_nonsparse) {
  lm=lm(vcrtestTh$PAC~testdata$x[i,])
  coeff[i]=lm$coefficients[2]
}

sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 11529  2547 13647 15640 16151 18524  7158 10982  1934
sorted_coeff[1:9]
## [1] 1.4775282 1.3338545 0.9201371 0.6643222 0.5330686 0.5316600 0.4690691 0.4098608 0.3177889
par(mfrow=c(3,3))

for (i in sorted_indices[1:9]) {
  gene=testdata$x[i,]
  qresplot(vcrtestTh$PAC,gene,plotErrorBars = TRUE, main = rownames(testdata$x)[i])
}