## Build the color palettes used in the plots below
fabcolors <- RColorBrewer::brewer.pal(n = 11, name = "RdGy")
col1 <- RColorBrewer::brewer.pal(n = 10, name = "PRGn")
col2 <- RColorBrewer::brewer.pal(n = 10, name = "Spectral")
col3 <- RColorBrewer::brewer.pal(n = 10, name = "BrBG")
col4 <- RColorBrewer::brewer.pal(n = 10, name = "PiYG")
col5 <- RColorBrewer::brewer.pal(n = 10, name = "PuOr")

# Pool all brewer palettes into one long color vector, wrapped in a list so
# it stays a single element when concatenated with the palettes below.
allcolors <- list(c(fabcolors, col1, col2, col3, col4, col5))

# Two discrete wesanderson palettes, each kept as its own list element
morecolors1 <- list(wes_palette("Darjeeling1", n = 4, type = "discrete"))
morecolors2 <- list(wes_palette("Moonrise2", n = 3, type = "discrete"))

# color_list[[1]] = pooled brewer colors, [[2]] = Darjeeling1, [[3]] = Moonrise2
color_list <- c(allcolors, morecolors1, morecolors2)

0.1 Introduction

This document describes training a random forest model using latent variables generated by transfer learning approaches as its input features. We are using the latent variables found in NF1 tumor data to train the forest to find the most important classifying LVs for the four tumor types in NF1. We chose to train a random forest classifier to identify NF tumor types using the LVs because of the following properties of the model:

  • robustness to high dimensionality data
  • ability to handle unbalanced classes
  • robustness to outliers and non-linear data
  • quick training /prediction speeds
  • low bias and moderate variance

Our goal is to find important latent variables that classify the various tumorTypes. We will then inspect the classifying features to find meaningful genesets that distinguish between two tumortypes.

Tumortypes represented in the data:

  • Plexiform Neurofibroma
  • MPNST
  • Cutaneous Neurofibroma
  • Neurofibroma

0.2 Partitioning the data into training and testing set:

The dataset was split into a 75% training set and a 25% testing set. The function createDataPartition is used to create balanced splits of the data. Since the tumorType argument to this function is a factor, the random sampling occurs within each class and should preserve the overall class distribution of the data.

# Make the test and training datasets: a stratified 75/25 split on tumorType
set.seed(998)
inTraining <- createDataPartition(as.factor(forest_data$tumorType), p = 0.75, list = FALSE)

# Rows selected by the partition form the training set; the rest are held out
training <- forest_data[inTraining, ]
testing <- forest_data[-inTraining, ]

0.3 Model training and Crossvalidation :

We first trained an initial model on the training dataset using 500 trees while iterating over candidate mtry values. That model was cross-validated using 10-fold cross-validation.

Then we tuned the model by iterating over 1 to 100 candidate features per split (mtry). We also increased the number of trees to 1000 to improve accuracy. To ensure an adequate sample size in each validation round, 5-fold cross-validation (repeated 5 times) was carried out. Below are the details of our tuned model.

## Load Fit data (The Model described in this document is stored on Synapse)
load(synGet("syn21201190")$path)

# Repeated 5-fold cross-validation control (5 folds x 5 repeats);
# matches the "Resampling" section of the printed Fit below
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated five times
                           repeats = 5)

# Tuning grid: try every mtry (features considered per split) from 1 to 100
tunegrid <- expand.grid(.mtry=c(1:100))

#Find the classes:
summary(training$tumorType)
##                  Cutaneous Neurofibroma 
##                                      25 
## Malignant Peripheral Nerve Sheath Tumor 
##                                       7 
##                            Neurofibroma 
##                                       9 
##                  Plexiform Neurofibroma 
##                                      15
## Construct the random forest model called Fit (the code is commented out to facilitate quick rendering of html file by loading the Fit from Synapse)

# set.seed(9998)
# Fit <- train(tumorType ~ .,
#              data = training[,c(1:ncol(training))],
#              method = "rf",
#              ntree= 1000,
#              tuneGrid = tunegrid,
#              #classwt =
#              proximity=TRUE,
#              importance = TRUE,
#              trControl = fitControl,
#              verbose = TRUE)

print(" Check the fit of the model")                 
## [1] " Check the fit of the model"
Fit
## Random Forest 
## 
##  56 samples
## 962 predictors
##   4 classes: 'Cutaneous Neurofibroma', 'Malignant Peripheral Nerve Sheath Tumor', 'Neurofibroma', 'Plexiform Neurofibroma' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 45, 45, 45, 45, 44, 45, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##     1   0.7618182  0.6447017
##     2   0.7750909  0.6674779
##     3   0.7618182  0.6477441
##     4   0.7684242  0.6594311
##     5   0.7784242  0.6737575
##     6   0.7684242  0.6600565
##     7   0.7624242  0.6506555
##     8   0.7651515  0.6548846
##     9   0.7717576  0.6644446
##    10   0.7757576  0.6714266
##    11   0.7790909  0.6749416
##    12   0.7581212  0.6452353
##    13   0.7754545  0.6699783
##    14   0.7647879  0.6552780
##    15   0.7578182  0.6438213
##    16   0.7681212  0.6608203
##    17   0.7611515  0.6483924
##    18   0.7690909  0.6621373
##    19   0.7753939  0.6717473
##    20   0.7647879  0.6555631
##    21   0.7687879  0.6619126
##    22   0.7681212  0.6603113
##    23   0.7711515  0.6649180
##    24   0.7650909  0.6554221
##    25   0.7578182  0.6451481
##    26   0.7614545  0.6504119
##    27   0.7721212  0.6664220
##    28   0.7611515  0.6497884
##    29   0.7684848  0.6609998
##    30   0.7611515  0.6494565
##    31   0.7611515  0.6501727
##    32   0.7541818  0.6388756
##    33   0.7644848  0.6545503
##    34   0.7761212  0.6712361
##    35   0.7605455  0.6474140
##    36   0.7578182  0.6444511
##    37   0.7724242  0.6656658
##    38   0.7718182  0.6654602
##    39   0.7581818  0.6452469
##    40   0.7651515  0.6553275
##    41   0.7761212  0.6719176
##    42   0.7611515  0.6503284
##    43   0.7764242  0.6722162
##    44   0.7727879  0.6671177
##    45   0.7867273  0.6882605
##    46   0.7661212  0.6573377
##    47   0.7724848  0.6648988
##    48   0.7684848  0.6589195
##    49   0.7687879  0.6607646
##    50   0.7694545  0.6610450
##    51   0.7618182  0.6497178
##    52   0.7684242  0.6603213
##    53   0.7690909  0.6610938
##    54   0.7648485  0.6522546
##    55   0.7547879  0.6397116
##    56   0.7690909  0.6592891
##    57   0.7618182  0.6508757
##    58   0.7651515  0.6552726
##    59   0.7548485  0.6380501
##    60   0.7790909  0.6747136
##    61   0.7730909  0.6663844
##    62   0.7721818  0.6653602
##    63   0.7754545  0.6702252
##    64   0.7761212  0.6701356
##    65   0.7797576  0.6772121
##    66   0.7761212  0.6706439
##    67   0.7690909  0.6608972
##    68   0.7475758  0.6280900
##    69   0.7506061  0.6315657
##    70   0.7585455  0.6447689
##    71   0.7621818  0.6507205
##    72   0.7615152  0.6480603
##    73   0.7724848  0.6660047
##    74   0.7621818  0.6498402
##    75   0.7681212  0.6597313
##    76   0.7757576  0.6717167
##    77   0.7757576  0.6710969
##    78   0.7615152  0.6490280
##    79   0.7581818  0.6421636
##    80   0.7581818  0.6441605
##    81   0.7651515  0.6535819
##    82   0.7545455  0.6375596
##    83   0.7621212  0.6497683
##    84   0.7545455  0.6378949
##    85   0.7618182  0.6489569
##    86   0.7442424  0.6216472
##    87   0.7648485  0.6532031
##    88   0.7581818  0.6417498
##    89   0.7651515  0.6534756
##    90   0.7684848  0.6577606
##    91   0.7654545  0.6531216
##    92   0.7654545  0.6550242
##    93   0.7618182  0.6471949
##    94   0.7687879  0.6587440
##    95   0.7581818  0.6425740
##    96   0.7757576  0.6710274
##    97   0.7548485  0.6373475
##    98   0.7655152  0.6541693
##    99   0.7618182  0.6475781
##   100   0.7615152  0.6473774
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 45.
#plot the model: cross-validated accuracy at each candidate mtry value
theme_update(text = element_text(size=20))
ggplot(Fit$results,  aes(x=mtry, y=Accuracy)) +
  # constant point size belongs outside aes(); mapping it inside aes()
  # would create a spurious "2" legend
  geom_point(size=2) +
  geom_line() +
  coord_cartesian(ylim = c(0,1)) +
  # labs() takes `title=` for the plot title; `main=` is a base-graphics
  # argument that labs() would silently ignore
  labs(title="The model", x="mtry :: Number of features for each split", y= "Accuracy of the model") 

print("Check the clustering of the samples according to the model")
## [1] "Check the clustering of the samples according to the model"
# MDS plot of the final forest's proximity matrix for the training samples
MDSplot(Fit$finalModel, 
        as.factor(training$tumorType), 
        k=2, palette=NULL, 
        pch=as.numeric(training$tumorType), 
        cex=2, 
        cex.axis= 1.5,
        cex.lab = 1.5,
        cex.main = 1.5,
        main= "MDS Plot of the classified training set")
# NOTE(review): with palette=NULL, MDSplot presumably falls back to "Set1"
# colors, so the first 4 "Set1" colors here should match the points — confirm
legend("topright",
       inset=0.01, 
       cex= 1.2,
       legend=levels(training$tumorType), 
       fill=brewer.pal(6, "Set1"))

#Order errors using OOB
#head(Fit$finalModel$err.rate[order(Fit$finalModel$err.rate[,1]),])

0.4 Predict the test set with our tuned model

#Use model to predict labels of test data.
# predict() only uses the columns the model was trained on, so pass `testing`
# directly — `testing[,c(1:length(colnames(testing)))]` was a no-op re-index.
pred <- predict(Fit, newdata = testing)

#store predicted labels in the test dataframe, next to the true labels
testing$pred <- pred

0.4.1 Check the quality of prediction

The confusion matrix of the prediction using the model is given below.

# Check the accuracy of the model
library(DT)

# Confusion matrix of predicted vs. true tumorType on the held-out test set;
# mode = "prec_recall" reports precision/recall/F1 per class
conf_matrix <- confusionMatrix(data = testing$pred, 
                              reference = as.factor(testing$tumorType), 
                              mode = "prec_recall")

conf_matrix
## Confusion Matrix and Statistics
## 
##                                          Reference
## Prediction                                Cutaneous Neurofibroma
##   Cutaneous Neurofibroma                                       8
##   Malignant Peripheral Nerve Sheath Tumor                      0
##   Neurofibroma                                                 0
##   Plexiform Neurofibroma                                       0
##                                          Reference
## Prediction                                Malignant Peripheral Nerve Sheath Tumor
##   Cutaneous Neurofibroma                                                        0
##   Malignant Peripheral Nerve Sheath Tumor                                       2
##   Neurofibroma                                                                  0
##   Plexiform Neurofibroma                                                        0
##                                          Reference
## Prediction                                Neurofibroma
##   Cutaneous Neurofibroma                             0
##   Malignant Peripheral Nerve Sheath Tumor            0
##   Neurofibroma                                       3
##   Plexiform Neurofibroma                             0
##                                          Reference
## Prediction                                Plexiform Neurofibroma
##   Cutaneous Neurofibroma                                       0
##   Malignant Peripheral Nerve Sheath Tumor                      0
##   Neurofibroma                                                 0
##   Plexiform Neurofibroma                                       4
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8049, 1)
##     No Information Rate : 0.4706     
##     P-Value [Acc > NIR] : 2.722e-06  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: Cutaneous Neurofibroma
## Precision                                   1.0000
## Recall                                      1.0000
## F1                                          1.0000
## Prevalence                                  0.4706
## Detection Rate                              0.4706
## Detection Prevalence                        0.4706
## Balanced Accuracy                           1.0000
##                      Class: Malignant Peripheral Nerve Sheath Tumor
## Precision                                                    1.0000
## Recall                                                       1.0000
## F1                                                           1.0000
## Prevalence                                                   0.1176
## Detection Rate                                               0.1176
## Detection Prevalence                                         0.1176
## Balanced Accuracy                                            1.0000
##                      Class: Neurofibroma Class: Plexiform Neurofibroma
## Precision                         1.0000                        1.0000
## Recall                            1.0000                        1.0000
## F1                                1.0000                        1.0000
## Prevalence                        0.1765                        0.2353
## Detection Rate                    0.1765                        0.2353
## Detection Prevalence              0.1765                        0.2353
## Balanced Accuracy                 1.0000                        1.0000
#conf_matrix$table

Even though our model accuracy on the training data seemed to stagnate around 77%, a close look at the confusion matrix of the test set shows high F1 scores for all our test classes. So we decided to go ahead with this model and inspect the features it deemed important for the classification.

0.5 Important variables for the model

Lets take a look at the important latent variables picked up by our model as classifiers for the different classes.

# estimate variable importance (per-class importance, unscaled)
importance <- varImp(Fit, scale=FALSE)

# Plot the importance of the variables
varImpPlot(Fit$finalModel,
           main = "Important variables in the forest",
           n.var = 20)

# Select top important features. Use a dedicated name instead of `list`,
# which shadowed the base R constructor of the same name.
imp_features <- as.data.frame(importance$importance)

## Transform to enable plotting: LV names come from the rownames; strip the
## backticks that caret adds around non-syntactic names
imp_features$LV <- rownames(imp_features)
imp_features$LV <- gsub("`", "", imp_features$LV)
## Function to extract the first (possibly signed/decimal) number from a string
library(stringr)
numextract <- function(string){ 
  str_extract(string, "\\-*\\d+\\.*\\d*")
} 
# Map each LV name to its column name in the gene loading matrix, e.g. "V12"
imp_features$LatentVar <- numextract(imp_features$LV)
imp_features$LatentVar <- glue('V{imp_features$LatentVar}')


# Find important vars for each class: sort descending by that class's importance
MPNST_list <- imp_features[order(-imp_features$`Malignant Peripheral Nerve Sheath Tumor`), ]
pNF_list <- imp_features[order(-imp_features$`Plexiform Neurofibroma`), ]
cNF_list <- imp_features[order(-imp_features$`Cutaneous Neurofibroma`), ]
NF_list <- imp_features[order(-imp_features$`Neurofibroma`), ]


# Plot top features for cNF

# Plot a heatmap of the important features deemed classifiers of one class.
#
# Args:
#   forest_data - dataframe used to fit the model: numeric feature columns,
#                 specimenID as rownames, plus a tumorType column
#   list        - importance dataframe for one class, ordered so that
#                 list$LV[1:numfeatures] are the features to plot
#   numfeatures - number of top features requested
#   list_title  - class name used in the heatmap title
#
# Returns: a pheatmap (samples x features)
featureHeatmap <- function(forest_data, list, numfeatures, list_title){
  keep <- colnames(forest_data) %in% list$LV[1:numfeatures]
  # Count actual matches: may be fewer than numfeatures if some requested LVs
  # are absent from forest_data. Indexing by 1:numfeatures in that case would
  # silently pull the tumorType/specimenID columns into the heatmap.
  nfeat <- sum(keep)
  plot_data <- forest_data[, keep, drop = FALSE]
  plot_data$tumorType <- forest_data$tumorType
  plot_data$specimenID <- rownames(plot_data)
  pheatmap(plot_data[, seq_len(nfeat)],
           show_rownames = FALSE,
           labels_col = colnames(plot_data)[seq_len(nfeat)],
           #fontsize_row = 0,
           clustering_method = 'ward.D2',
           cluster_rows = TRUE,
           scale = "column",
           #annotation_col = colnames(plot_data),
           #annotation_colors = c(color_list[[1]],color_list[[2]],color_list[[3]]),
           annotation_row = plot_data[, c("specimenID", "tumorType")],
           width = 8,
           height = 8,
           main = glue('Heatmap of features from {list_title}'))
}

featureHeatmap(forest_data, cNF_list, 50, "Cutaneous Neurofibroma")

featureHeatmap(forest_data, NF_list, 50, "Neurofibroma")

featureHeatmap(forest_data, MPNST_list, 50, "MPNST")

featureHeatmap(forest_data, pNF_list, 50, "Plexiform Neurofibroma")

0.6 Plot the gene loadings the top 5 important LVs for each of the tumortypes

0.6.1 Cutaneous Neurofibroma

## Plot the genes in the top LVs

# For each of the 5 most important latent variables in `list_var`, draw a bar
# chart of the 30 largest gene loadings for that LV.
#
# Args:
#   plot_data - gene-by-LV loading dataframe: gene names as rownames, LV
#               columns named like "V12" (matching list_var$LatentVar)
#   list_var  - importance dataframe already ordered by a class; its LatentVar
#               column supplies the LV column names to plot
#
# Side effect: prints one ggplot per LV.
plotGenes <- function(plot_data, list_var){
  
  # Take the first 5 distinct LVs; `i` is a column-name string such as "V12"
  for (i in (unique(list_var$LatentVar)[1:5])) {
 
  # Pull the single LV column (select(i) resolves the string to a column) and
  # turn the gene rownames into an explicit 'genes' column
  tidy <- plot_data %>%
    dplyr::select(i) %>% 
    tibble::rownames_to_column('genes')
  
  # get(i) looks the loading column up by name within the data; bars are
  # ordered by decreasing loading. ylim(0,8) drops loadings outside that
  # range — the source of the "Removed N rows" warnings in the output.
  p <- ggplot(tidy %>% top_n(30, get(i))) +
    aes(x=reorder(genes, -get(i)), y=get(i)) +
    geom_bar(stat="identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    ylim(c(0,8)) +
    xlab("Genes") +
    ylab("Loadings") +
    ggtitle(glue('Gene Loadings for interesting LV: {i}'))
  
  print(p)
  
  }

}

plotGenes(gene_by_lv, cNF_list)

0.6.2 Neurofibroma

## Plot the genes in the top LVs

plotGenes(gene_by_lv, NF_list)

0.6.3 Plexiform Neurofibroma

## Plot the genes in the top LVs

plotGenes(gene_by_lv, pNF_list)

## Warning: Removed 2 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (position_stack).

0.6.4 MPNST

## Plot the genes in the top LVs

plotGenes(gene_by_lv, MPNST_list)

## Warning: Removed 2 rows containing missing values (position_stack).

# Persist the fitted model and its input data for reuse (e.g. upload to Synapse)
save(Fit, forest_data, file = "RandomForest_LatentVar_Tumortypes.Rdata")
# Record package versions for reproducibility of this analysis
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] glue_1.3.1                      gridExtra_2.3                  
##  [3] AnnotationDbi_1.46.0            IRanges_2.18.1                 
##  [5] S4Vectors_0.22.0                Biobase_2.44.0                 
##  [7] BiocGenerics_0.30.0             pheatmap_1.0.12                
##  [9] AppliedPredictiveModeling_1.1-7 caret_6.0-84                   
## [11] lattice_0.20-38                 e1071_1.7-2                    
## [13] randomForest_4.6-14             wesanderson_0.3.6              
## [15] RColorBrewer_1.1-2              colorspace_1.4-1               
## [17] DT_0.8                          forcats_0.4.0                  
## [19] stringr_1.4.0                   dplyr_0.8.3                    
## [21] purrr_0.3.2                     readr_1.3.1                    
## [23] tidyr_0.8.3                     tibble_2.1.3                   
## [25] ggplot2_3.2.1                   tidyverse_1.2.1                
## [27] BiocManager_1.30.4              synapserutils_0.1.6            
## [29] synapser_0.6.58                
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-141          bit64_0.9-7           lubridate_1.7.4      
##  [4] httr_1.4.1            tools_3.6.0           backports_1.1.4      
##  [7] R6_2.4.0              rpart_4.1-15          DBI_1.0.0            
## [10] lazyeval_0.2.2        nnet_7.3-12           withr_2.1.2          
## [13] tidyselect_0.2.5      bit_1.1-14            compiler_3.6.0       
## [16] cli_1.1.0             rvest_0.3.4           xml2_1.2.2           
## [19] labeling_0.3          scales_1.0.0          digest_0.6.20        
## [22] rmarkdown_1.14        pkgconfig_2.0.2       htmltools_0.3.6      
## [25] htmlwidgets_1.3       rlang_0.4.0           readxl_1.3.1         
## [28] RSQLite_2.1.2         rstudioapi_0.10       generics_0.0.2       
## [31] jsonlite_1.6          ModelMetrics_1.2.2    CORElearn_1.53.1     
## [34] magrittr_1.5          Matrix_1.2-17         Rcpp_1.0.2           
## [37] munsell_0.5.0         stringi_1.4.3         yaml_2.2.0           
## [40] MASS_7.3-51.4         plyr_1.8.4            recipes_0.1.7        
## [43] blob_1.2.0            grid_3.6.0            crayon_1.3.4         
## [46] haven_2.1.1           splines_3.6.0         hms_0.5.0            
## [49] zeallot_0.1.0         knitr_1.24            pillar_1.4.2         
## [52] reshape2_1.4.3        codetools_0.2-16      PythonEmbedInR_0.3.37
## [55] evaluate_0.14         data.table_1.12.2     modelr_0.1.5         
## [58] vctrs_0.2.0           foreach_1.4.7         cellranger_1.1.0     
## [61] gtable_0.3.0          assertthat_0.2.1      pack_0.1-1           
## [64] xfun_0.8              gower_0.2.1           prodlim_2019.10.13   
## [67] broom_0.5.2           class_7.3-15          survival_2.44-1.1    
## [70] timeDate_3043.102     iterators_1.0.12      memoise_1.1.0        
## [73] ellipse_0.4.1         cluster_2.1.0         lava_1.6.6           
## [76] ipred_0.9-9