setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #set to current directory
labels=read.csv("labels.csv")
features=read.csv("tpm_expression.csv")
covariates=read.csv("tcga_pancancer_lung_cov.csv")
Adjusting rows and columns names and NAs in covariates:
#labels
labels[,2]=factor(labels[,2]) #LABELS FACTORIZED ALL TOGETHER AT BEGINNING
colnames(labels)=c("ID","Label")
#features
row.names(features)=features[,1]
features=features[,-1]
#covariates
row.names(covariates)=covariates[,1]
covariates=as.data.frame(t(covariates[,-1]))
covariates[covariates == "[Not Applicable]" | covariates == "[Not Available]"] <- NA
Train/test split:
#train/test split
set.seed(123)
train_index <- sample(nrow(labels), nrow(labels) * 0.8)
Eventual data transformation to bring data towards normality/stabilising variance:
#leave to all false ---> no transformation
#or select TRUE (at most one) ----> perform selected transformation
log2transf=FALSE
log10transf=FALSE
deseqVST=FALSE
if (log2transf) {
features=log2(1+features) #add +1 to avoid Inf
}
if (log10transf) {
features=log10(1+features)
}
if (deseqVST) {
features <- DESeq2::DESeqDataSetFromMatrix(countData=features,
colData=data.frame(condition=rep(1,ncol(traindata$x))),design = ~ 1)
features <- vst(dds_train, blind=TRUE)
features <- as.matrix(assay(vst_train))
}
Organizing data in the format PAMR algorithms wants that is:
“A list with components: x- an expression genes in the rows, samples in the columns), and y- a vector of the class labels for each sample. Optional components- genenames, a vector of gene names, and geneid- a vector of gene identifiers.” (from ?pamr.train)
traindata=list()
traindata$y=labels[train_index,2]
traindata$x=as.matrix(features[,train_index])
traindata$geneid=rownames(features)
testdata=list()
testdata$y=labels[-train_index,2]
testdata$x=as.matrix(features[,-train_index])
testdata$geneid=rownames(features)
rm(features) #remove features to empty memory space
print(paste0("Nr. training obvs: ", length(traindata$y)))
## [1] "Nr. training obvs: 872"
print(paste0("Nr. test obvs: ", length(testdata$y)))
## [1] "Nr. test obvs: 219"
library(pamr)
## Loading required package: cluster
## Loading required package: survival
pamr=pamr.train(traindata) #first fit
## 123456789101112131415161718192021222324252627282930
thres=pamr.adaptthresh(pamr) #find best scaling factor for each class
## Initial errors: 219.26667 20.83333 4.86667 Roc 450.5882
## Update 1
## 123456789101112131415161718192021222324252627282930
## Errors 231.90000 18.46667 11.93333 Roc 442.6114
## Update 2
## 123456789101112131415161718192021222324252627282930
## Errors 244.46667 15.63333 18.66667 Roc 431.2626
## Update 3
## 123456789101112131415161718192021222324252627282930
## Errors 245.30000 13.96667 24.00000 Roc 412.5635
## Update 4
## 123456789101112131415161718192021222324252627282930
## Errors 245.26667 13.06667 29.06667 Roc 390.527
## Update 5
## 123456789101112131415161718192021222324252627282930
## Errors 245.13333 12.36667 34.00000 Roc 366.911
## Update 6
## 123456789101112131415161718192021222324252627282930
## Errors 245.30000 12.03333 38.36667 Roc 353.8755
## Update 7
## 123456789101112131415161718192021222324252627282930
## Errors 245.43333 11.56667 41.66667 Roc 350.5562
## Update 8
## 123456789101112131415161718192021222324252627282930
## Errors 245.26667 11.36667 45.03333 Roc 346.8804
## Update 9
## 123456789101112131415161718192021222324252627282930
## Errors 244.56667 11.30000 49.06667 Roc 343.6389
## Update 10
## 123456789101112131415161718192021222324252627282930
## Errors 243.7667 11.2000 51.7000 Roc 338.56
pamr=pamr.train(traindata, threshold.scale = thres) #refit with adaptive threshold
## 123456789101112131415161718192021222324252627282930
pamrcv=pamr.cv(pamr, traindata) #show performance of models for different threshold in CV
## 123Fold 1 :123456789101112131415161718192021222324252627282930
## Fold 2 :123456789101112131415161718192021222324252627282930
## Fold 3 :123456789101112131415161718192021222324252627282930
## Fold 4 :123456789101112131415161718192021222324252627282930
## Fold 5 :123456789101112131415161718192021222324252627282930
## Fold 6 :123456789101112131415161718192021222324252627282930
## Fold 7 :123456789101112131415161718192021222324252627282930
## Fold 8 :123456789101112131415161718192021222324252627282930
## Fold 9 :123456789101112131415161718192021222324252627282930
## Fold 10 :123456789101112131415161718192021222324252627282930
pamrcv
## Call:
## pamr.cv(fit = pamr, data = traindata)
## threshold nonzero errors
## 1 0.000 22830 60
## 2 0.842 14931 65
## 3 1.684 11450 62
## 4 2.527 8811 58
## 5 3.369 6760 57
## 6 4.211 5258 56
## 7 5.053 4145 54
## 8 5.895 3230 51
## 9 6.737 2456 52
## 10 7.580 1897 48
## 11 8.422 1446 46
## 12 9.264 1119 46
## 13 10.106 891 44
## 14 10.948 644 43
## 15 11.790 481 43
## 16 12.633 343 40
## 17 13.475 243 42
## 18 14.317 180 40
## 19 15.159 144 42
## 20 16.001 104 42
## 21 16.844 75 45
## 22 17.686 45 52
## 23 18.528 31 50
## 24 19.370 24 56
## 25 20.212 16 78
## 26 21.054 12 118
## 27 21.897 7 134
## 28 22.739 5 137
## 29 23.581 1 270
## 30 24.423 0 452
pamr.plotcv(pamrcv) #plotting results
Two threshold values seems interesting to further evaluate. - One is around \(To\), where the lowest misclassification error rate seems to happen (with around 200 genes selected for classification) - The other one between \(Th\) and \(Th\) before the misclassification error rate start to rise drastically especially for the “NORMAL” samples.
Selection of two threshold value to evaluate
\(To\) - threshold “optimal” \(Th\) - threshold “higher” , suboptimal (but still errors do not explode), with fewer genes selected
To=14.3 # Optimal threshold
Th=19.3 # Higher threshold
library(classmap)
## Warning: package 'classmap' was built under R version 4.2.3
library(classmapExt) #package that extends classmap functionalities
Using vcr.pamr.train function to produce all the quantities needed for the visualization of the two models with the two thresholds:
vcrtrainTo=vcr.pamr.train(data=traindata, pamrfit = pamr, threshold=To)
vcrtrainTh=vcr.pamr.train(data=traindata, pamrfit = pamr, threshold=Th)
Saving also the index of the retained genes for classification at the two thresholds (to run another classification algorithm that does not perform also feature selection):
selectedgenes=pamr.listgenes(pamr, traindata, threshold = vcrtrainTo$threshold)
## id LUAD-score LUSC-score NORM-score
## [1,] SFTPC 0 0 2.244
## [2,] AGER 0 0 1.0194
## [3,] CLEC3B 0 0 0.5766
## [4,] C13orf15 0 0 0.5065
## [5,] TRIM29 -0.3787 0 0
## [6,] EMP2 0 0 0.3709
## [7,] PVRL1 -0.3467 0 0
## [8,] MUC1 0.3407 0 0
## [9,] PERP -0.3328 0 0
## [10,] FHL1 0 0 0.3269
## [11,] KRT5 -0.3185 0 0
## [12,] NKX2-1 0.3123 0 0
## [13,] CD63 0.2996 0 0
## [14,] IRF6 -0.2612 0 0
## [15,] GOLM1 0.258 0 0
## [16,] RBM47 0.2564 0 0
## [17,] TP63 -0.2541 0 0
## [18,] TNNC1 0 0 0.2311
## [19,] ABCC3 0.23 0 0
## [20,] LOC440335 0.2239 0 0
## [21,] PKP1 -0.2223 0 0
## [22,] ERBB3 0.2215 0 0
## [23,] C17orf28 0.2203 0 0
## [24,] PRSS8 0.2131 0 0
## [25,] ATP1B3 -0.2084 0 0
## [26,] KRT6A -0.2026 0 0
## [27,] DSC3 -0.2024 0 0
## [28,] ACSL5 0.202 0 0
## [29,] MVP 0.1989 0 0
## [30,] EPCAM 0.1952 0 0
## [31,] DDAH1 0.1822 0 0
## [32,] GJB5 -0.1738 0 0
## [33,] JAG1 -0.1726 0 0
## [34,] SERPINB5 -0.1724 0 0
## [35,] TMC5 0.1707 0 0
## [36,] ST6GALNAC2 -0.1694 0 0
## [37,] PRR15L 0.163 0 0
## [38,] CEACAM6 0.1545 0 0
## [39,] LLGL2 0.1497 0 0
## [40,] ZDHHC9 0.1456 0 0
## [41,] RORC 0.1436 0 0
## [42,] SH3BP1 -0.1433 0 0
## [43,] CLDN3 0.1401 0 0
## [44,] QSOX1 0.1392 0 0
## [45,] MLPH 0.1386 0 0
## [46,] DSG3 -0.1378 0 0
## [47,] SLC2A1 -0.1377 0 0
## [48,] PTP4A2 0.1337 0 0
## [49,] CNBP -0.1311 0 0
## [50,] ICA1 0.1295 0 0
## [51,] CD9 -0.1281 0 0
## [52,] TMEM125 0.124 0 0
## [53,] ERGIC1 0.1232 0 0
## [54,] CD46 0.1228 0 0
## [55,] CD2AP 0.1222 0 0
## [56,] PON2 0.1195 0 0
## [57,] TMEM63A 0.1192 0 0
## [58,] MIR205HG -0.1185 0 0
## [59,] SMPDL3B 0.1177 0 0
## [60,] XXYLT1 -0.1171 0 0
## [61,] SFTA3 0.117 0 0
## [62,] PTTG1IP 0.1146 0 0
## [63,] TMEM189 -0.1137 0 0
## [64,] NEK6 0.1131 0 0
## [65,] CYB561D2 0.112 0 0
## [66,] GALE 0.1115 0 0
## [67,] SFTA2 0.1114 0 0
## [68,] TMEM62 0.1111 0 0
## [69,] KCNK5 0.111 0 0
## [70,] ZNF385A -0.1103 0 0
## [71,] SLC6A8 -0.1099 0 0
## [72,] EFNB1 -0.1062 0 0
## [73,] TMEM8A 0.1039 0 0
## [74,] SFN -0.1039 0 0
## [75,] WFDC2 0.1029 0 0
## [76,] HDGFRP3 -0.1017 0 0
## [77,] SIAH2 -0.1008 0 0
## [78,] LGALS3BP 0.097 0 0
## [79,] CSDA -0.0956 0 0
## [80,] PNKD 0.0954 0 0
## [81,] CALML3 -0.094 0 0
## [82,] MMP15 0.093 0 0
## [83,] LOC100129034 0.0912 0 0
## [84,] ATP11A 0.0911 0 0
## [85,] GPC1 -0.0897 0 0
## [86,] CGN 0.0878 0 0
## [87,] CYB561 0.0875 0 0
## [88,] RASSF7 0.0842 0 0
## [89,] PECAM1 0 0 0.0833
## [90,] VSNL1 -0.083 0 0
## [91,] TJP3 0.0824 0 0
## [92,] CAPN8 0.0823 0 0
## [93,] C1orf210 0.0815 0 0
## [94,] ST3GAL5 0.0811 0 0
## [95,] TMEM150A 0.077 0 0
## [96,] ARL6IP5 0.0743 0 0
## [97,] GALM 0.074 0 0
## [98,] FAM107B 0.0725 0 0
## [99,] FYTTD1 -0.0714 0 0
## [100,] KRT17 -0.0713 0 0
## [101,] SUCLG2 0.071 0 0
## [102,] FZD6 -0.0704 0 0
## [103,] SNAI2 -0.0702 0 0
## [104,] FSCN1 -0.07 0 0
## [105,] FAM162A -0.0682 0 0
## [106,] ARF4 0.0671 0 0
## [107,] ARPC2 -0.067 0 0
## [108,] CLDN7 0.066 0 0
## [109,] ACTL6A -0.0639 0 0
## [110,] TCEA3 0.0609 0 0
## [111,] MICALL1 -0.0589 0 0
## [112,] OCIAD2 0.0585 0 0
## [113,] PCYT1A -0.0578 0 0
## [114,] HNF1B 0.0566 0 0
## [115,] NAPSA 0.055 0 0
## [116,] C4orf34 0.0544 0 0
## [117,] GPR87 -0.0535 0 0
## [118,] ITGA6 -0.0526 0 0
## [119,] BMP7 -0.0516 0 0
## [120,] CLDND1 -0.0503 0 0
## [121,] TUBA1C -0.0503 0 0
## [122,] TMEM59 0.0495 0 0
## [123,] SLC22A31 0.0487 0 0
## [124,] TMEM219 0.0477 0 0
## [125,] PSMD2 -0.0459 0 0
## [126,] PRNP -0.0452 0 0
## [127,] CLCA2 -0.0451 0 0
## [128,] LPCAT1 0.0441 0 0
## [129,] FUCA1 0.0432 0 0
## [130,] TRA2B -0.0425 0 0
## [131,] GMPS -0.0422 0 0
## [132,] PDCD10 -0.0409 0 0
## [133,] TUBB6 -0.0398 0 0
## [134,] MRPL47 -0.0395 0 0
## [135,] JUP -0.0378 0 0
## [136,] ANXA11 0.0365 0 0
## [137,] TIMMDC1 -0.0364 0 0
## [138,] RAB7A -0.0363 0 0
## [139,] TMEM45B 0.0361 0 0
## [140,] CD151 0.0359 0 0
## [141,] CORO1C -0.0332 0 0
## [142,] RGL3 0.0332 0 0
## [143,] SOX15 -0.0329 0 0
## [144,] TSPAN15 0.0324 0 0
## [145,] EMB 0.0321 0 0
## [146,] TMC4 0.0316 0 0
## [147,] SPDEF 0.0307 0 0
## [148,] DLG1 -0.0297 0 0
## [149,] NAA50 -0.0277 0 0
## [150,] C3orf37 -0.0276 0 0
## [151,] GOLT1A 0.0263 0 0
## [152,] BICD2 -0.0255 0 0
## [153,] PTDSS1 -0.0242 0 0
## [154,] MRPS22 -0.0236 0 0
## [155,] UFC1 0.0235 0 0
## [156,] TRIM8 0.0229 0 0
## [157,] CCNJL 0.0208 0 0
## [158,] ZDHHC16 0.019 0 0
## [159,] FAM83B -0.0184 0 0
## [160,] HRAS -0.0179 0 0
## [161,] SLC9A3R1 -0.0175 0 0
## [162,] KRTCAP2 0.0172 0 0
## [163,] PBXIP1 0.0155 0 0
## [164,] SIGIRR 0.0153 0 0
## [165,] TRAM1 0.0142 0 0
## [166,] EFCAB4A 0.0129 0 0
## [167,] CD44 -0.0109 0 0
## [168,] GNPTG 0.0096 0 0
## [169,] LMAN2 0.0095 0 0
## [170,] AP2M1 -0.0095 0 0
## [171,] GPRC5C 0.0077 0 0
## [172,] KRT7 0.0074 0 0
## [173,] RFC4 -0.0044 0 0
## [174,] BSPRY 0.0028 0 0
## [175,] RAB17 0.0019 0 0
## [176,] DPP4 0.0015 0 0
## [177,] FRMD6 -0.0012 0 0
## [178,] SLC34A2 8e-04 0 0
## [179,] SLC43A3 5e-04 0 0
## [180,] CLIC6 2e-04 0 0
selectedgenes=as.vector(selectedgenes[,1])
sgenes_indexTo <- rownames(traindata$x) %in% selectedgenes
selectedgenes=pamr.listgenes(pamr, traindata, threshold = vcrtrainTh$threshold)
## id LUAD-score LUSC-score NORM-score
## [1,] SFTPC 0 0 0.6671
## [2,] TRIM29 -0.1894 0 0
## [3,] PVRL1 -0.1574 0 0
## [4,] MUC1 0.1514 0 0
## [5,] PERP -0.1435 0 0
## [6,] KRT5 -0.1291 0 0
## [7,] NKX2-1 0.1229 0 0
## [8,] CD63 0.1103 0 0
## [9,] IRF6 -0.0719 0 0
## [10,] GOLM1 0.0687 0 0
## [11,] RBM47 0.067 0 0
## [12,] TP63 -0.0647 0 0
## [13,] ABCC3 0.0406 0 0
## [14,] LOC440335 0.0346 0 0
## [15,] PKP1 -0.033 0 0
## [16,] ERBB3 0.0322 0 0
## [17,] C17orf28 0.031 0 0
## [18,] PRSS8 0.0237 0 0
## [19,] ATP1B3 -0.0191 0 0
## [20,] KRT6A -0.0132 0 0
## [21,] DSC3 -0.013 0 0
## [22,] ACSL5 0.0127 0 0
## [23,] MVP 0.0095 0 0
## [24,] EPCAM 0.0058 0 0
selectedgenes=as.vector(selectedgenes[,1])
sgenes_indexTh<- rownames(traindata$x) %in% selectedgenes
sgenes_index=data.frame(sgenes_indexTo,sgenes_indexTh)
write.csv(sgenes_index, "sgenes_index.csv" , row.names = FALSE)
cfm=caret::confusionMatrix(factor(vcrtrainTo$pred),factor(vcrtrainTo$y))
cfm$table
## Reference
## Prediction LUAD LUSC NORM
## LUAD 380 23 1
## LUSC 10 377 0
## NORM 2 3 76
silplot(vcrtrainTo, main=paste0("(Training) Silplot at threshold: ",To))
## classNumber classLabel classSize classAveSi
## 1 LUAD 392 0.91
## 2 LUSC 403 0.87
## 3 NORM 77 0.98
cfm=caret::confusionMatrix(factor(vcrtrainTh$pred),factor(vcrtrainTh$y))
cfm$table
## Reference
## Prediction LUAD LUSC NORM
## LUAD 384 36 10
## LUSC 5 366 0
## NORM 3 1 67
silplot(vcrtrainTh, main=paste0("(Training) Silplot at threshold: ",Th))
## classNumber classLabel classSize classAveSi
## 1 LUAD 392 0.50
## 2 LUSC 403 0.56
## 3 NORM 77 0.58
par(mfrow=c(1,3))
classmap(vcrtrainTo, whichclass = 1)
classmap(vcrtrainTo, whichclass = 2)
classmap(vcrtrainTo, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",To), line=-5, side=3, outer=TRUE, cex=1)
par(mfrow=c(1,3))
classmap(vcrtrainTh, whichclass = 1)
classmap(vcrtrainTh, whichclass = 2)
classmap(vcrtrainTh, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",Th), line=-5, side=3, outer=TRUE, cex=1)
#add a way to print in markdown documentation of mdscolorscale??
mdscolorscale(vcrtrainTo, diss=vcrtrainTo$pwd, main=
paste0("(Train) MDScolorscale at thresh: ",To))
mdscolorscale(vcrtrainTh, diss=vcrtrainTh$pwd, main=
paste0("(Train) MDScolorscale at thresh: ",Th))
Just for threshold \(Th\)
On continuos covariates:
index_continus_covariates=c(1,4,5,14,18,26,27,28,44)
par(mfrow=c(3,3))
for (i in index_continus_covariates) {
continuos_covariate=as.numeric(unlist(covariates[train_index,i], use.names = FALSE))
qresplot(vcrtrainTh$PAC,continuos_covariate, plotErrorBars = TRUE, main = colnames(covariates)[i])
}
We can also plot some discrete covariates coding them to integers values:
index_discrete_covariates=c(2,22)
par(mfrow=c(1,2))
legend=data.frame(matrix(nrow = 20, ncol = 0))
for (i in index_discrete_covariates) {
discrete_covariate=factor(unlist(covariates[train_index,i], use.names = FALSE))
levels=levels(discrete_covariate)
if (length(levels) < 20) {
levels <- c(levels, rep(NA, 20 - length(levels)))
}
legend=cbind(legend,levels)
if(i!=22) {name="Stage"} else {name="Histology code"}
qresplot(vcrtrainTh$PAC,as.numeric(discrete_covariate), plotErrorBars = TRUE,
main = name)
}
legend=cbind(c(1:20), legend)
colnames(legend)=c("Integer coded","Stage","Histology code")
knitr::kable(legend, caption = "Legend")
Integer coded | Stage | Histology code |
---|---|---|
1 | ||
2 | STAGE I | 8052/3 |
3 | STAGE IA | 8070/3 |
4 | STAGE IB | 8071/3 |
5 | STAGE II | 8072/3 |
6 | STAGE IIA | 8073/3 |
7 | STAGE IIB | 8083/3 |
8 | STAGE III | 8140/3 |
9 | STAGE IIIA | 8230/3 |
10 | STAGE IIIB | 8250/3 |
11 | STAGE IV | 8252/3 |
12 | NA | 8253/3 |
13 | NA | 8255/3 |
14 | NA | 8260/3 |
15 | NA | 8310/3 |
16 | NA | 8480/3 |
17 | NA | 8490/3 |
18 | NA | 8507/3 |
19 | NA | 8550/3 |
20 | NA | NA |
On selected genes for the classification in dataset:
selectedgenes=which(sgenes_index$sgenes_indexTh)
#in all the genes
coeff=rep(0,nrow(traindata$x[selectedgenes,]))
#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(traindata$x[selectedgenes,], 1, function(x) {
unique_count <- length(unique(x))
zero_count <- sum(x == 0)
return(unique_count >= 7 & zero_count / length(x) < 0.35)}))
for (i in index_nonsparse) {
lm=lm(vcrtrainTh$PAC~traindata$x[i,])
coeff[i]=lm$coefficients[2]
}
sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 11 18 8 24 3 4 2 20 5
sorted_coeff[1:9]
## [1] 0.220692155 0.113886093 0.063452879 0.021180450 0.013086051 0.011393740 0.003220045 0.002280854 0.001019688
par(mfrow=c(3,3))
for (i in sorted_indices[1:9]) {
gene=traindata$x[i,]
qresplot(vcrtrainTh$PAC,gene,plotErrorBars = TRUE, main = rownames(traindata$x)[i])
}
On all genes in dataset:
#in all the genes
coeff=rep(0,nrow(traindata$x))
#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
index_nonsparse <- which(apply(traindata$x, 1, function(x) length(unique(x)) >= 8))
#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(traindata$x, 1, function(x) {
unique_count <- length(unique(x))
zero_count <- sum(x == 0)
return(unique_count >= 7 & zero_count / length(x) < 0.35)}))
for (i in index_nonsparse) {
lm=lm(vcrtrainTh$PAC~traindata$x[i,])
coeff[i]=lm$coefficients[2]
}
sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 11529 17404 13647 2943 1516 10195 4907 16301 10933
sorted_coeff[1:9]
## [1] 1.0471477 0.7730503 0.5717340 0.5355485 0.5255370 0.4841109 0.4669274 0.4653891 0.4572245
par(mfrow=c(3,3))
for (i in sorted_indices[1:9]) {
gene=traindata$x[i,]
qresplot(vcrtrainTh$PAC,gene,plotErrorBars = TRUE, main = rownames(traindata$x)[i])
}
Using vcr.pamr.newdata function to produce all the quantities needed for the visualization of the two models with the two thresholds:
vcrtestTo=vcr.pamr.newdata(newdata=testdata, vcr.pamr.train.out = vcrtrainTo)
vcrtestTh=vcr.pamr.newdata(newdata=testdata, vcr.pamr.train.out = vcrtrainTh)
cfm=caret::confusionMatrix(factor(vcrtestTo$pred),factor(vcrtestTo$ynew))
cfm$table
## Reference
## Prediction LUAD LUSC NORM
## LUAD 95 6 0
## LUSC 10 82 0
## NORM 4 0 22
silplot(vcrtestTo, main=paste0("(Test) Silplot at threshold: ",To))
## classNumber classLabel classSize classAveSi
## 1 LUAD 109 0.74
## 2 LUSC 88 0.87
## 3 NORM 22 1.00
cfm=caret::confusionMatrix(factor(vcrtestTh$pred),factor(vcrtestTh$ynew))
cfm$table
## Reference
## Prediction LUAD LUSC NORM
## LUAD 101 7 2
## LUSC 6 81 0
## NORM 2 0 20
silplot(vcrtrainTh, main=paste0("(Test) Silplot at threshold: ",Th))
## classNumber classLabel classSize classAveSi
## 1 LUAD 392 0.50
## 2 LUSC 403 0.56
## 3 NORM 77 0.58
par(mfrow=c(1,3))
classmap(vcrtestTo, whichclass = 1)
classmap(vcrtestTo, whichclass = 2)
classmap(vcrtestTo, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",To), line=-5, side=3, outer=TRUE, cex=1)
par(mfrow=c(1,3))
classmap(vcrtestTh, whichclass = 1)
classmap(vcrtestTh, whichclass = 2)
classmap(vcrtestTh, whichclass = 3)
mtext(paste0("Classmaps at threshold: ",Th), line=-5, side=3, outer=TRUE, cex=1)
mdscolorscale(vcrtestTo, diss=vcrtestTo$pwd, main=
paste0("(Test) MDScolorscale at thresh: ",To))
mdscolorscale(vcrtestTh, diss=vcrtestTh$pwd, main=
paste0("(Test) MDScolorscale at thresh: ",Th))
Just for threshold \(Th\)
On continuos covariates:
index_continus_covariates=c(1,4,5,14,18,26,27,28,44)
par(mfrow=c(3,3))
for (i in index_continus_covariates) {
continuos_covariate=as.numeric(unlist(covariates[-train_index,i], use.names = FALSE))
qresplot(vcrtestTh$PAC,continuos_covariate, plotErrorBars = TRUE, main = colnames(covariates)[i])
}
We can also plot some discrete covariates coding them to integers values:
index_discrete_covariates=c(2,22)
par(mfrow=c(1,2))
legend=data.frame(matrix(nrow = 20, ncol = 0))
for (i in index_discrete_covariates) {
discrete_covariate=factor(unlist(covariates[-train_index,i], use.names = FALSE))
levels=levels(discrete_covariate)
if (length(levels) < 20) {
levels <- c(levels, rep(NA, 20 - length(levels)))
}
legend=cbind(legend,levels)
if(i!=22) {name="Stage"} else {name="Histology code"}
qresplot(vcrtestTh$PAC,as.numeric(discrete_covariate), plotErrorBars = TRUE,
main = name)
}
legend=cbind(c(1:20), legend)
colnames(legend)=c("Integer coded","Stage","Histology code")
knitr::kable(legend, caption = "Legend")
Integer coded | Stage | Histology code |
---|---|---|
1 | STAGE I | 8052/3 |
2 | STAGE IA | 8070/3 |
3 | STAGE IB | 8071/3 |
4 | STAGE II | 8083/3 |
5 | STAGE IIA | 8140/3 |
6 | STAGE IIB | 8230/3 |
7 | STAGE IIIA | 8252/3 |
8 | STAGE IIIB | 8255/3 |
9 | STAGE IV | 8260/3 |
10 | NA | 8310/3 |
11 | NA | 8480/3 |
12 | NA | 8550/3 |
13 | NA | NA |
14 | NA | NA |
15 | NA | NA |
16 | NA | NA |
17 | NA | NA |
18 | NA | NA |
19 | NA | NA |
20 | NA | NA |
On selected genes for the classification in dataset:
selectedgenes=which(sgenes_index$sgenes_indexTh)
#in all the genes
coeff=rep(0,nrow(testdata$x[selectedgenes,]))
#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(testdata$x[selectedgenes,], 1, function(x) {
unique_count <- length(unique(x))
zero_count <- sum(x == 0)
return(unique_count >= 7 & zero_count / length(x) < 0.35)}))
for (i in index_nonsparse) {
lm=lm(vcrtestTh$PAC~testdata$x[i,])
coeff[i]=lm$coefficients[2]
}
sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 18 24 4 8 3 5 2 14 21
sorted_coeff[1:9]
## [1] 0.0489389471 0.0387591959 0.0184546417 0.0148206676 0.0113137019 0.0099551486 0.0019554276 0.0008352847 0.0003943720
par(mfrow=c(3,3))
for (i in sorted_indices[1:9]) {
gene=testdata$x[i,]
qresplot(vcrtestTh$PAC,gene,plotErrorBars = TRUE, main = rownames(testdata$x)[i])
}
On all genes in dataset:
#in all the genes
coeff=rep(0,nrow(testdata$x))
#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
index_nonsparse <- which(apply(testdata$x, 1, function(x) length(unique(x)) >= 8))
#getting indexes only of genes that take on more that 7 unique values (<7 values it is rather discrete)
#also lefting out genes that are more than 35% observation that take 0 values
index_nonsparse <- which(apply(testdata$x, 1, function(x) {
unique_count <- length(unique(x))
zero_count <- sum(x == 0)
return(unique_count >= 7 & zero_count / length(x) < 0.35)}))
for (i in index_nonsparse) {
lm=lm(vcrtestTh$PAC~testdata$x[i,])
coeff[i]=lm$coefficients[2]
}
sorted_indices <- order(coeff, decreasing = TRUE)
sorted_coeff <- sort(coeff, decreasing = TRUE)
sorted_indices[1:9]
## [1] 11529 2547 13647 15640 16151 18524 7158 10982 1934
sorted_coeff[1:9]
## [1] 1.4775282 1.3338545 0.9201371 0.6643222 0.5330686 0.5316600 0.4690691 0.4098608 0.3177889
par(mfrow=c(3,3))
for (i in sorted_indices[1:9]) {
gene=testdata$x[i,]
qresplot(vcrtestTh$PAC,gene,plotErrorBars = TRUE, main = rownames(testdata$x)[i])
}