今天的文章是關(guān)于膠質(zhì)母細(xì)胞瘤，標(biāo)題是Mining TCGA database for genes of prognostic value in glioblastoma microenvironment，銜接我們上次的R包“estimate”。作者用分析出的”StromalScore“和”ImmuneScore“對TCGA數(shù)據(jù)進(jìn)行了重新分組，并進(jìn)行了一系列后續(xù)的下游分析。

P值0.05界限為什么不好(純R代碼復(fù)現(xiàn)一篇數(shù)據(jù)挖掘文章) （這個是R包“ESTIMATE”的教程）

首先還是下載表達(dá)數(shù)據(jù)、臨床信息以及突變矩陣，然后對數(shù)據(jù)進(jìn)行簡單的過濾。

if (!file.exists( './data/TCGA-GBM.Rdata' )) {
  gzfile <- './raw_data/TCGA-GBM.AffyU133a_log2.tsv.gz'
  download.file('https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/HT_HG-U133A.gz', 
                destfile = gzfile)
  library(R.utils)
  gunzip(gzfile, remove = F)
  library(data.table)
  raw_data <- fread( './raw_data/TCGA-GBM.AffyU133a_log2.tsv',
                     sep = ' ', header = T)
  raw_data <- as.data.frame( raw_data )
  raw_data[1:5, 1:6] 
  rownames( raw_data ) <- raw_data[, 1]
  raw_data <- raw_data[, -1]
  raw_data[1:5, 1:6]
  raw_data <- 2^raw_data - 1
  raw_data <- ceiling( raw_data )
  raw_data[1:5, 1:6]
  pick_row <- apply( raw_data, 1, function(x){
    sum(x == 0) < 10
  })
  raw_data <- raw_data[pick_row, ]
  dim(raw_data  )
  save( raw_data, file = './data/TCGA-GBM.Rdata' )
}else{
  load('./data/TCGA-GBM.Rdata')
}

# Step2 Grouping by special clinical information --------------------------

if (!file.exists( './data/TCGA-GBM_phenotype.Rdata' )) {
  gzfile <- './raw_data/TCGA-GBM_phenotype.tsv.gz'
  ## download.file('https://gdc.xenahubs.net/download/TCGA-BRCA/Xena_Matrices/TCGA-BRCA.GDC_phenotype.tsv.gz', 
  ##               destfile = gzfile)
  download.file('https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/GBM_clinicalMatrix.gz', 
                 destfile = gzfile)
  phenoData <- read.table( gzfile,
                           header = T,
                           sep = ' ',
                           quote = '' )
  name <- phenoData[ , 1]
  name <- gsub(pattern = '-', replacement = '.', name)
  phenoData <- phenoData[ , -1]
  rownames( phenoData ) <- name
  phenoData[1:5, 1:3]
  save( phenoData, file = './data/TCGA-GBM_phenotype.Rdata' )
}else{
  load('./data/TCGA-GBM_phenotype.Rdata')
}


## Category 1.2: GBM: GeneExp_Subtype
## Pick columns that contains 'GeneExp_Subtype'
typePheno <- phenoData[, 'GeneExp_Subtype']
save(typePheno, file = './data/gbm_geneExp_sample.Rdata')

## Category 3: mutation
if (!file.exists( './data/TCGA-GBM.snv.Rdata' )) {
  gzfile <- './raw_data/TCGA-GBM.snv.tsv.gz'
  ## download.file('https://gdc.xenahubs.net/download/TCGA-BRCA/Xena_Matrices/TCGA-BRCA.mutect2_snv.tsv.gz', 
  ##               destfile = gzfile)
  download.file('https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/mutation_broad.gz', 
                destfile = gzfile)
  library(R.utils)
  gunzip(gzfile, remove = F)
  mutype_file <- read.table( './raw_data/TCGA-GBM.snv.tsv',
                             header = T,
                             sep = ' ',
                             quote = '' )
  save( mutype_file, file = './data/TCGA-GBM.snv.Rdata' )
}else{
  load('./data/TCGA-GBM.snv.Rdata')
}

這里根據(jù)臨床信息中的，'GeneExp_Subtype'對數(shù)據(jù)進(jìn)行了分組。用”estimate“R包得到的結(jié)果，根據(jù)其中的'StromalScore' ，'ImmuneScore'可以得到兩個分類。并且文章還根據(jù)突變IDH進(jìn)行了分類。一共有四個分類。后續(xù)我們只用到了StromalScore' ，'ImmuneScore'的分類。

# Step1 Install packages --------------------------------------------------

library(utils)
rforge <- 'http://r-forge.r-project.org'
install.packages('estimate', repos = rforge, dependencies = TRUE)


# Step2 calculate tumor purity --------------------------------------------

library(estimate)

write.table(as.data.frame(raw_data), './data/varianCancerExpr.txt', 
            quote = FALSE, sep = '	')

filterCommonGenes(input.f = './data/varianCancerExpr.txt', 
                  output.f = './data/genes.gct', id = 'GeneSymbol')

estimateScore('./data/genes.gct', './data/estimate_score.gct',
              platform = 'affymetrix')

## read result
estimate_score <- read.table('./data/estimate_score.gct',
                             sep = '	', skip = 2,
                             header  = T)
rownames(estimate_score) <- estimate_score[, 1]
estimate_score <- as.data.frame(t(estimate_score[, -c(1,2)]))
estimate_score[1:5, 1:4]

## type1-gene_Exp
estimate_score <- estimate_score[rownames(estimate_score) %in% names(typePheno),]
typePheno <- typePheno[names(typePheno) %in% rownames(estimate_score)]
estimate_score <- estimate_score[names(typePheno), ]
estimate_score$type <- as.character(typePheno)
estimate_score <- estimate_score[estimate_score[,5] != '', ]

## type2-WHO-IDH1
name <- mutype_file[,1]
name <- gsub(pattern = '-', replacement = '.', name)
mutype_file[,1] <- name
mutation <- mutype_file[mutype_file[, 7] == 'IDH1', 1]
typeIDH <- ifelse(rownames(estimate_score) %in% mutype_file[,1], 
               ifelse(rownames(estimate_score) %in% mutation, 'IDH_mutat', 'IDH-wildtype'),
               'NOS')
estimate_score$type <- as.character(typeIDH)
estimate_score <- estimate_score[estimate_score[,5] != 'NOS', ]

## type3-ImmuneScore
estimate_score <- estimate_score[order(estimate_score[,2]),]
typeImmune <- c(rep('Low', floor(length(rownames(estimate_score))/2)),
                rep('High', ceiling(length(rownames(estimate_score))/2)))
names(typeImmune) <- rownames(estimate_score)

## type4-StromalScore
estimate_score <- estimate_score[order(estimate_score[,1]),]
typeStromal <- c(rep('Low', floor(length(rownames(estimate_score))/2)),
                rep('High', ceiling(length(rownames(estimate_score))/2)))
names(typeStromal) <- rownames(estimate_score)


ggboxplot(estimate_score, x = 'type', y = 'ImmuneScore',
          color = 'type', palette = c('#00AFBB', '#E7B800', '#FC4E07', '#c00000'),
          add = 'jitter', shape = 'type') + stat_compare_means(label.y = 3500) 

ggboxplot(estimate_score, x = 'type', y = 'StromalScore',
          color = 'type', palette = c('#00AFBB', '#E7B800', '#FC4E07', '#c00000'),
          add = 'jitter', shape = 'type') + stat_compare_means(label.y = -2000)

## OS
library(survival)  
os.model <- phenoData[, c('OS.time', 'OS')]

os.model <- os.model[rownames(os.model) %in% names(typeStromal),]
os.model <- os.model[names(typeStromal),]
os.model$type <- typeStromal

fit <- survfit(Surv(OS.time, OS) ~ type, data = os.model)

library(survminer)
ggsurvplot(fit,
           pval = TRUE, conf.int = TRUE,
           linetype = 'type',
           surv.median.line = 'hv',
           ggtheme = theme_bw(),
           palette = c('#E7B800', '#2E9FDF')
)

os.model <- os.model[rownames(os.model) %in% names(typeImmune),]
os.model <- os.model[names(typeImmune),]
os.model$type <- typeImmune

fit <- survfit(Surv(OS.time, OS) ~ type, data = os.model)

library(survminer)
ggsurvplot(fit,
           pval = TRUE, conf.int = TRUE,
           linetype = 'type',
           surv.median.line = 'hv',
           ggtheme = theme_bw(),
           palette = c('#E7B800', '#2E9FDF')
)

下面是StromalScore分類得到的生存分析結(jié)果

下面是ImmuneScore分類得到的生存分析結(jié)果

draw_heatmap <- function(nrDEG, type){
  library( 'pheatmap' )
  nrDEG_Z = nrDEG[ order( nrDEG$logFC ), ]
  nrDEG_F = nrDEG[ order( -nrDEG$logFC ), ]
  choose_gene = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:50] )
  choose_matrix = AssayData[ choose_gene, ]
  choose_matrix = t( scale( t( choose_matrix ) ) )

  choose_matrix[choose_matrix > 2] = 2
  choose_matrix[choose_matrix < -2] = -2

  annotation_col = data.frame( CellType = factor( group_list ) )
  rownames( annotation_col ) = colnames( AssayData )
  filename <- paste('./fig/', type, '_heatmap_top100_logFC.png',
                    sep = '', collapse = NULL)
  pheatmap( fontsize = 6, choose_matrix, annotation_col = annotation_col, 
            show_rownames = T, show_colnames = F,
            annotation_legend = T, cluster_cols = F, 
            filename = filename)
}

draw_volcano <- function(nrDEG, type){
  library( 'ggplot2' )
  logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )

  nrDEG$change = as.factor( ifelse( 
    nrDEG$P.Value < 0.01 & abs(nrDEG$logFC) > logFC_cutoff,
    ifelse( nrDEG$logFC > logFC_cutoff, 'UP', 'DOWN' ), 'NOT' ) )
  nrDEGfile <- paste('./data/', type, '_nrDEG_by_logFC.Rdata',
                    sep = '', collapse = NULL)
  save( nrDEG, file = nrDEGfile )

  this_tile <- paste0( 
    'Cutoff for logFC is ', round( logFC_cutoff, 3 ),
    '
The number of up gene is ', nrow(nrDEG[ nrDEG$change == 'UP', ] ),
    '
The number of down gene is ', nrow(nrDEG[ nrDEG$change == 'DOWN', ] ) )

  volcano = ggplot(data = nrDEG, 
                   aes( x = logFC, y = -log10(P.Value), color = change)) +
    geom_point( alpha = 0.4, size = 1.75 ) +
    theme_set( theme_set( theme_bw( base_size = 15 ) ) ) +
    xlab( 'log2 fold change' ) + ylab( '-log10 p-value' ) +
    ggtitle( this_tile ) + 
    theme( plot.title = element_text( size = 15, hjust = 0.5 )) +
    scale_colour_manual( values = c('blue','black','red') )
  print( volcano )
  filename <- paste('./fig/', type, '_volcano_logFC.png',
                    sep = '', collapse = NULL)
  ggsave( volcano, filename = filename )
  dev.off()
}

AssayData <- raw_data
names(typeImmune) <- gsub(pattern = '\.', replacement = '-', names(typeImmune))
AssayData <- AssayData[, colnames(AssayData) %in% names(typeImmune)]
AssayData <- AssayData[, names(typeImmune)]
group_list <- typeImmune

AssayData <- raw_data
names(typeStromal) <- gsub(pattern = '\.', replacement = '-', names(typeStromal))
AssayData <- AssayData[, colnames(AssayData) %in% names(typeStromal)]
AssayData <- AssayData[, names(typeStromal)]
group_list <- typeStromal


# Step3 Lastly run voom from limma ----------------------------------------

library(limma)
## A list-based S4 class for storing read counts and associated information 
## from digital gene expression or sequencing technologies.
DGElist <- DGEList( counts = AssayData, group = factor(group_list) )
## Counts per Million or Reads per Kilobase per Million
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2
table(keep_gene)
DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]
## Calculate Normalization Factors to Align Columns of a Count Matrix
DGElist <- calcNormFactors( DGElist )
DGElist$samples

design <- model.matrix( ~0 + factor(group_list) )
rownames(design) <- colnames(DGElist)
colnames(design) <- levels(factor(group_list))

## Transform RNA-Seq Data Ready for Linear Modelling
v <- voom(DGElist, design, plot = TRUE, normalize = 'quantile')
## Fit linear model for each gene given a series of arrays
fit <- lmFit(v, design)

## Construct the contrast matrix corresponding to specified contrasts of a set 
## of parameters.
cont.matrix <- makeContrasts(contrasts = c('Low-High'), levels = design)
## Given a linear model fit to microarray data, compute estimated coefficients 
## and standard errors for a given set of contrasts.
fit2 <- contrasts.fit(fit, cont.matrix)
## Empirical Bayes Statistics for Differential Expression
fit2 <- eBayes(fit2)

nrDEG_limma_voom = topTable(fit2, coef = 'Low-High', n = Inf)
nrDEG_limma_voom = na.omit(nrDEG_limma_voom)
head(nrDEG_limma_voom)
draw_heatmap(nrDEG = nrDEG_limma_voom, type = 'limma_voom')
draw_volcano(nrDEG = nrDEG_limma_voom, type = 'limma_voom')


# Step4 Compare  ---------------------------------------------

library('VennDiagram')

nrDEG_Z = nrDEG_limma_voom_immune[ order( nrDEG_limma_voom_immune$logFC ), ]
nrDEG_F = nrDEG_limma_voom_immune[ order( -nrDEG_limma_voom_immune$logFC ), ]
choose_gene_A = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:350] )

nrDEG_Z = nrDEG_limma_voom_stromal[ order( nrDEG_limma_voom_stromal$logFC ), ]
nrDEG_F = nrDEG_limma_voom_stromal[ order( -nrDEG_limma_voom_stromal$logFC ), ]
choose_gene_B = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:350] )
## Venn Diagram
venn.plot <- venn.diagram(x = list(A = choose_gene_A, B = choose_gene_B), 
                          filename = 'DIFF.png', height = 450, width = 450,
                          resolution = 300, imagetype = 'png', col = 'transparent',
                          fill = c('cornflowerblue', 'darkorchid1'),
                          alpha = 0.50, cex = 0.45, cat.cex = 0.45)

先是stromal_Score的分組繪制的熱圖和火山圖

之后是immnue_Score的分組繪制的熱圖和火山圖

比較一下共有的基因情況

## function of KEGG pathway
kegg_plot <- function(type) {
  kk.up <- enrichKEGG(   gene          =  gene_up    ,
                         organism      =  'hsa'      ,
                         universe      =  gene_all   ,
                         pvalueCutoff  =  0.8        ,
                         qvalueCutoff  =  0.8        )
  kk.down <- enrichKEGG( gene          =  gene_down  ,
                         organism      =  'hsa'      ,
                         universe      =  gene_all   ,
                         pvalueCutoff  =  0.8        ,
                         qvalueCutoff  =  0.8        )
  library( 'ggplot2' )
  kegg_down_dt <- as.data.frame( kk.down )
  kegg_up_dt <- as.data.frame( kk.up )
  down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ]
  down_kegg$group <- -1
  up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ]
  up_kegg$group <- 1
  dat = rbind( up_kegg, down_kegg )
  dat$pvalue = -log10( dat$pvalue )
  dat$pvalue = dat$pvalue * dat$group
  dat = dat[ order( dat$pvalue, decreasing = F ), ]
  g_kegg <- ggplot( dat, 
                    aes(x = reorder( Description, order( pvalue, decreasing = F )), 
                        y = pvalue, fill = group)) + 
    geom_bar( stat = 'identity' ) + 
    scale_fill_gradient( low = 'blue', high = 'red', guide = FALSE ) + 
    scale_x_discrete( name = 'Pathway names' ) +
    scale_y_continuous( name = 'log10P-value' ) +
    coord_flip() + theme_bw() + 
    theme( plot.title = element_text( hjust = 0.5 ), 
           axis.text.x = element_text(size = 10),
           axis.text.y = element_text(size = 7)) +
    ggtitle( 'Pathway Enrichment' ) 
  print( g_kegg )
  filename <- paste('./fig/kegg_up_down_', type, '.png', sep = '', collapse = NULL)
  ggsave( g_kegg, filename = filename )
}

## function of GO pathway
go_plot <- function(type) {
  go_enrich_results <- lapply( g_list, function(gene) {
    lapply( c( 'BP', 'MF', 'CC' ) , function(ont) {
      cat( paste( 'Now process', ont ) )
      ego <- enrichGO( gene          =  gene,
                       universe      =  gene_all,
                       OrgDb         =  org.Hs.eg.db,
                       ont           =  ont ,
                       pAdjustMethod =  'BH',
                       pvalueCutoff  =  0.99,
                       qvalueCutoff  =  0.99,
                       readable      =  TRUE)
      print( head( ego ) )
      return( ego )
    })
  })
  gofilename <- paste('./data/go_enrich_result', type, '.Rdata', 
                      sep = '', collapse = NULL)
  save( go_enrich_results, file = gofilename )

  n1 = c( 'gene_up', 'gene_down', 'gene_diff' )
  n2 = c( 'BP', 'MF', 'CC' ) 
  for (i in 1:3) {
    for (j in 1:3) {
      fn = paste0( './fig/dotplot_', n1[i], '_', n2[j], '_', type, '.png' )
      cat( paste0( fn, '
' ) )
      png( fn, res = 150, width = 1080 )
      print( dotplot( go_enrich_results[[i]][[j]] ) )
      dev.off()
    }
  }
}



# Step1 annotation --------------------------------------------------------

library( 'clusterProfiler' )
library( 'org.Hs.eg.db' )
keytypes(org.Hs.eg.db)
library('stringr')

nrDEG <- nrDEG_limma_voom_immune
nrDEG <- nrDEG_limma_voom_stromal
logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )

nrDEG$change = as.factor( ifelse( 
  nrDEG$P.Value < 0.01 & abs(nrDEG$logFC) > logFC_cutoff,
  ifelse( nrDEG$logFC > logFC_cutoff, 'UP', 'DOWN' ), 'NOT' ) )
nrDEG_limma_voom_immune <- nrDEG
nrDEG_limma_voom_stromal <- nrDEG


## tans1: ENSEMBL2ENTREZID
table( nrDEG$change )
rownames( nrDEG ) <- str_sub(rownames( nrDEG ), start = 1, end = 15)
nrDEG$SYMBOL <- rownames( nrDEG )
df <- bitr( rownames( nrDEG ), fromType = 'SYMBOL', toType = c( 'ENTREZID' ), 
            OrgDb = org.Hs.eg.db )
head( df )
nrDEG = merge( nrDEG, df, by = 'SYMBOL' )
head( nrDEG )

gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ] 
gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]
gene_diff = c( gene_up, gene_down )
gene_all = as.character(nrDEG[ ,'ENTREZID'] )
g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff)


# Step2 pathway analysis ------------------------------------------

kegg_plot('limma_voom')
go_plot('limma_voom')

下面是”StromalScore“分類后差異分析的通路結(jié)果

下面是”immnue_Score“分類后差異分析的通路結(jié)果

代碼由生信技能樹學(xué)徒獨(dú)立完成，我并沒有進(jìn)行任何干預(yù)！

如果你看不懂上面的圖和R代碼，也不會制作，那么你可能需要下面的學(xué)習(xí)班：

本站僅提供存儲服務(wù)，所有內(nèi)容均由用戶發(fā)布，如發(fā)現(xiàn)有害或侵權(quán)內(nèi)容，請點(diǎn)擊舉報。

免费视频淫片aa毛片_日韩高清在线亚洲专区vr_日韩大片免费观看视频播放_亚洲欧美国产精品完整版