Get Data

This is nearly entirely based on the code in notebook 08.

We have two tables on Synapse that contain cNF patient data: the Patient Table and the Sample Table.

We also have latent variable expression analysis data: the Latent Variable Table.

Let’s see if there are any correlations between LV expression and tumor size and patient age.

# Patient metadata from Synapse table syn7342635; the ROW_ID and ROW_VERSION
# columns returned with the query result are dropped.
pat.tab <- select(
  synapser::synTableQuery(
    "SELECT Patient,Race,Gender,Age,Pain,Itching FROM syn7342635"
  )$asDataFrame(),
  -ROW_ID, -ROW_VERSION
)
## 
 [####################]100.00%   1/1   Done...    
Downloading  [####################]100.00%   616.0bytes/616.0bytes (257.6kB/s) Job-99655162243056606045677453.csv Done...
# Tumor sample measurements from Synapse table syn5556216; the ROW_ID and
# ROW_VERSION columns returned with the query result are dropped.
samp.tab <- select(
  synapser::synTableQuery(
    "SELECT Patient,TumorNumber,Length_in_mm FROM syn5556216"
  )$asDataFrame(),
  -ROW_ID, -ROW_VERSION
)
## 
 [####################]100.00%   1/1   Done...    
Downloading  [####################]100.00%   1.4kB/1.4kB (1.6MB/s) Job-99655172121841080595111621.csv Done...
# Latent variable (LV) values from Synapse table syn21046991, restricted to
# cutaneous neurofibroma samples. `synapser::` is spelled out, as in the other
# table queries, so this call does not depend on synapser being attached.
mp_res <- synapser::synTableQuery("SELECT * FROM syn21046991")$asDataFrame() %>%
  filter(tumorType == "Cutaneous Neurofibroma") %>%
  group_by(latent_var) %>%
  # per-LV standard deviation of `value`; stays in the result as column `sd`
  mutate(sd = sd(value)) %>%
  ungroup() %>%
  # somewhat arbitrarily picked, gets rid of the LVs with nearly no change
  filter(sd > 0.025)
## 
Building the CSV... [#######-------------]34.31%   54123/157768       
Create CSV FileHandle [##########----------]50.35%   79435/157768       
Create CSV FileHandle [####################]100.00%   157768/157768   Done...    
Downloading  [#-------------------]7.43%   2.0MB/26.9MB (3.6MB/s) Job-99655185539596059450051307.csv     
Downloading  [###-----------------]14.85%   4.0MB/26.9MB (5.6MB/s) Job-99655185539596059450051307.csv     
Downloading  [####----------------]22.28%   6.0MB/26.9MB (7.1MB/s) Job-99655185539596059450051307.csv     
Downloading  [######--------------]29.71%   8.0MB/26.9MB (8.0MB/s) Job-99655185539596059450051307.csv     
Downloading  [#######-------------]37.13%   10.0MB/26.9MB (8.4MB/s) Job-99655185539596059450051307.csv     
Downloading  [#########-----------]44.56%   12.0MB/26.9MB (8.9MB/s) Job-99655185539596059450051307.csv     
Downloading  [##########----------]51.99%   14.0MB/26.9MB (9.3MB/s) Job-99655185539596059450051307.csv     
Downloading  [############--------]59.41%   16.0MB/26.9MB (9.5MB/s) Job-99655185539596059450051307.csv     
Downloading  [#############-------]66.84%   18.0MB/26.9MB (9.7MB/s) Job-99655185539596059450051307.csv     
Downloading  [###############-----]74.27%   20.0MB/26.9MB (10.0MB/s) Job-99655185539596059450051307.csv     
Downloading  [################----]81.69%   22.0MB/26.9MB (10.2MB/s) Job-99655185539596059450051307.csv     
Downloading  [##################--]89.12%   24.0MB/26.9MB (10.3MB/s) Job-99655185539596059450051307.csv     
Downloading  [###################-]96.54%   26.0MB/26.9MB (10.5MB/s) Job-99655185539596059450051307.csv     
Downloading  [####################]100.00%   26.9MB/26.9MB (10.5MB/s) Job-99655185539596059450051307.csv Done...

Merge data together

For the purposes of this analysis we want to have just the age, sex, tumor size and a ‘reformed’ patient name.

# Attach patient metadata to every tumor sample, then build a specimenID of the
# form patient<Patient>tumor<TumorNumber>.
full.tab <- left_join(samp.tab, pat.tab, by = "Patient")
full.tab <- mutate(
  full.tab,
  specimenID = paste0("patient", Patient, "tumor", TumorNumber)
)
DT::datatable(full.tab)

Retrieve Latent Variable Data

Let’s retrieve the LV data and evaluate any correlations between scores and tumor size or patient age

# Keep only the specimens present in both the LV results and the clinical table.
data.with.var <- inner_join(mp_res, full.tab, by = "specimenID")

# Spearman correlation between LV value and tumor length, one value per latent
# variable, sorted by decreasing absolute correlation.
tum.cors <- data.with.var %>%
  group_by(latent_var) %>%
  mutate(corVal = cor(value, Length_in_mm, method = "spearman")) %>%
  arrange(desc(abs(corVal))) %>%
  distinct(latent_var, corVal)

DT::datatable(tum.cors)

Size-LV Correlation

Macrophages appear to be associated with size, as do LV 962 and integrin signaling. At the other end of the spectrum, the ERY3-related latent variable (comprising erythroid population 3 markers) is inversely correlated with size, as are other LVs - LV 689, LV 903, LV 94, RAC1 signaling, etc.

Expression of these pathways is low in larger cNFs. If (hypothetically) we could upregulate these pathways, could we prevent or reverse increases in cNF size?

Conversely, can we get rid of the macrophages in the large tumors? Would this improve outcomes? Here are those data plotted for the LVs where the absolute correlation value is > 0.35:

# Latent variables whose absolute correlation with tumor size exceeds 0.35,
# ordered from the most positive to the most negative correlation.
lvs <- tum.cors %>%
  ungroup() %>%
  filter(abs(corVal) > 0.35) %>%
  arrange(desc(corVal)) %>%
  pull(latent_var)

# Scatter plot of LV value against tumor length, one facet per selected LV.
# The factor levels follow `lvs`, so facets appear in correlation order.
# NOTE(review): `sex` is not among the columns selected into pat.tab (which has
# `Gender`); presumably it comes from the latent variable table - confirm.
data.with.var %>%
  filter(latent_var %in% lvs) %>%
  mutate(
    latent_var = factor(latent_var, levels = unique(lvs), ordered = TRUE)
  ) %>%
  ggplot() +
  geom_point(aes(x = value, y = Length_in_mm, col = sex)) +
  facet_wrap(latent_var ~ .)

Age-LV Correlation

We can also evaluate LV expression as it correlates to age of the patient. The correlation is much weaker. This is probably because we are comparing a patient variable to a tumor measurement. It would be better to compare tumor age to tumor LV expression, rather than patient age. Worth trying, though.

# Rebuild the merged LV + clinical table (inner join on specimenID).
data.with.var <- inner_join(mp_res, full.tab, by = "specimenID")

# Spearman correlation between LV value and patient age, one value per latent
# variable, sorted by decreasing absolute correlation.
# NOTE: this overwrites `tum.cors` (and `lvs` below) from the size analysis.
tum.cors <- data.with.var %>%
  group_by(latent_var) %>%
  mutate(corVal = cor(value, Age, method = "spearman")) %>%
  arrange(desc(abs(corVal))) %>%
  select(latent_var, corVal) %>%
  distinct()

DT::datatable(tum.cors)

# Latent variables whose absolute correlation with age exceeds 0.35, ordered
# from the most positive to the most negative correlation.
lvs <- tum.cors %>%
  ungroup() %>%
  filter(abs(corVal) > 0.35) %>%
  arrange(desc(corVal)) %>%
  pull(latent_var)

# Scatter plot of LV value against patient age, one facet per selected LV.
# NOTE(review): `sex` is not among the columns selected into pat.tab (which has
# `Gender`); presumably it comes from the latent variable table - confirm.
data.with.var %>%
  filter(latent_var %in% lvs) %>%
  mutate(
    latent_var = factor(latent_var, levels = unique(lvs), ordered = TRUE)
  ) %>%
  ggplot() +
  geom_point(aes(x = value, y = Age, col = sex)) +
  facet_wrap(latent_var ~ .)

LV with itching significance test

Do latent variables correlate with itching in the patient? Again, this is patient-level metadata, not tumor level metadata, so it may be more poorly correlated than we hope.

# Recompute the size correlation: `tum.cors` and `lvs` were overwritten by the
# age analysis, and this plot needs the LVs selected on tumor length.
tum.cors <- data.with.var %>%
  group_by(latent_var) %>%
  mutate(corVal = cor(value, Length_in_mm, method = "spearman")) %>%
  arrange(desc(abs(corVal))) %>%
  select(latent_var, corVal) %>%
  distinct()

# Latent variables whose absolute correlation with tumor size exceeds 0.35,
# ordered from the most positive to the most negative correlation.
lvs <- tum.cors %>%
  ungroup() %>%
  filter(abs(corVal) > 0.35) %>%
  arrange(desc(corVal)) %>%
  pull(latent_var)

# Same size-vs-LV scatter plot as before, now coloured by the Itching flag.
data.with.var %>%
  filter(latent_var %in% lvs) %>%
  mutate(
    latent_var = factor(latent_var, levels = unique(lvs), ordered = TRUE)
  ) %>%
  ggplot() +
  geom_point(aes(x = value, y = Length_in_mm, col = Itching)) +
  facet_wrap(latent_var ~ .)

Itching doesn’t seem to be strongly correlated with these LVs. But we can run a test to see whether LV expression differs between patients with and without itching.

# Per latent variable, compare LV values between Itching == TRUE and
# Itching == FALSE with a two-sample Wilcoxon test.
# spread() moves `value` into one column per Itching level (`TRUE` / `FALSE`),
# so each group can be passed to wilcox.test() as a separate vector.
sig.vals <- spread(data.with.var, key = Itching, value = value) %>%
  group_by(latent_var) %>%
  mutate(pval = wilcox.test(`TRUE`, `FALSE`)$p.value) %>%
  distinct(latent_var, pval) %>%
  ungroup()

# there are some?!
DT::datatable(subset(sig.vals, pval < 0.05))

# Box plots of LV value by Itching status for the LVs with p < 0.05, arranged
# by p-value from left to right (factor levels follow the sorted row order).
# The join key is given explicitly, so no "Joining, by" message is emitted.
data.with.var %>%
  left_join(sig.vals, by = "latent_var") %>%
  filter(pval < 0.05) %>%
  arrange(pval) %>%
  mutate(
    latent_var = factor(
      latent_var,
      levels = unique(latent_var),
      ordered = TRUE
    )
  ) %>%
  ggplot() +
  geom_boxplot(aes(x = latent_var, y = value, fill = Itching)) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )

Itching-Size correlation

Nope, not significantly correlated:

# it seems patients who have pain also have itching, so will only evaluate
# itching
# spread() moves tumor length into one column per Itching level, and the two
# columns are then compared with a two-sample Wilcoxon test.
dat <- spread(full.tab, key = Itching, value = Length_in_mm)

with(dat, wilcox.test(`TRUE`, `FALSE`))$p.value
## [1] 0.1800699