2017-08-25 27 views
0

我正嘗試使用phyper函數從R執行富集分析。我寫的代碼給了我準確的結果,但當矩陣的大小增加時,它會一直持續下去。以下是621 * 1860矩陣的可重現示例。但是,當矩陣的大小增加到6210 X 24000時,即使我在多核上運行,也需要將近一天才能完成。我想知道是否有一種方法來優化相同。執行富集分析的優化方法

請從評論中的可共享鏈接下載三個RObjects

## Main Functions 
GetEnrichedAnnotations <- function(DrugDiseaseName, 
           DrugDiseaseGeneMatrix, 
           DrugDiseaseFeatureMatrix, 
           DFDrgDis){ ## Function Begins 
    TotalGenesCount = nrow(DrugDiseaseGeneMatrix) 
    ## Get the assosciated Genes for each Drug or Disease 
    DrugDiseaseGenes = GetGeneList(DrugDiseaseName,DFDrgDis) 
    ## Get the only annotations that Genes from the Drug or Disease List 
    UPDNAnnotations = DrugDiseaseFeatureMatrix[DrugDiseaseName,] 
    UPDNAnnotations = UPDNAnnotations[UPDNAnnotations > 0] 
    ## First value to the HyperGeometricFunction phyper 
    GenesFromInput = DrugDiseaseFeatureMatrix[DrugDiseaseName,names(UPDNAnnotations)] 
    ## Second value to the HyperGeometricFunction phyper 
    GenesinAnnotation = DrugDiseaseGeneMatrix[,names(UPDNAnnotations)] 
    GenesinAnnotation = apply(GenesinAnnotation,2,sum) 
    ## Third Value to the HyperGeometricFunction phyper 
    TotalGenes = rep(TotalGenesCount,length(GenesFromInput)) 
    RemainingGenes = TotalGenes - GenesinAnnotation 
    ## Fourth value to the HyperGeometricFunction phyper 
    NumberOfGenesInDrug = rep(length(DrugDiseaseGenes),length(GenesFromInput)) 
    names(NumberOfGenesInDrug) = names(GenesFromInput) 
    ## Apply Enrichment ANalysis 
    PValues = phyper(GenesFromInput-1,GenesinAnnotation,RemainingGenes,NumberOfGenesInDrug,lower.tail = FALSE) 
    AdjustedPvalues = p.adjust(PValues,method = "BH") 
    EnrichedAnnotations = AdjustedPvalues[AdjustedPvalues <= 0.05] 
    ### When P value is zero, replacing zeros with the minimum value 
    EnrichedAnnotations[EnrichedAnnotations == 0] = 2.2e-16 
    EnrichedAnnotations = EnrichedAnnotations[EnrichedAnnotations <= 0.05] 
    ## Get the log value for the adjusted Pvalues 
    EnrichedAnnotations = -log(EnrichedAnnotations,2) 
    ## This vector consists of all the annotations including Enriched Annotations 
    TotalAnnotaionsVector = rep(0,ncol(DrugDiseaseGeneMatrix)) 
    names(TotalAnnotaionsVector) = colnames(DrugDiseaseGeneMatrix) 
    TotalAnnotaionsVector[names(EnrichedAnnotations)] = EnrichedAnnotations 
    return(TotalAnnotaionsVector) 
    }##Function Ends 


## Get GeneList for a given Diseases 
GetGeneList = function(DiseaseName,DFDrgDis){ 
    GeneList = DFDrgDis[DFDrgDis$DrugName == DiseaseName,"Symbol"] 
    GeneList = unlist(strsplit(GeneList,",")) 
    GeneList = trimws(GeneList) 
    GeneList = unique(GeneList) 
    return(GeneList) 
} 


## Parraleize the Code 
numberofCores = parallel::detectCores() - 1 
### Closing all the Existing Connections 
closeAllConnections() 
### Making a Cluster with 8 of 8 available cores 
Cluster <- parallel::makeCluster(numberofCores) 
### Register the Cluster 
doParallel::registerDoParallel(Cluster) 





## Please download the RObject from below link 
## Please download the three objects for reproducible example 
## https://drive.google.com/drive/folders/0Bz9Y4BgZAF7oS2dtVVEwN0Z1Tnc?usp=sharing 

GeneAnnotations = readRDS("Desktop/StackOverFlow/GeneAnnotations.rds") 
DiseaseAnnotations = readRDS("Desktop/StackOverFlow/DiseaseAnnotations.rds") 
Diseases = readRDS("Desktop/StackOverFlow/Diseases.rds") 
## Get the Unique Names of Disease List 
DisNames = row.names(DiseaseAnnotations) 

## Below Function runs the code on parallel to get the Enriched Annotations for Multiple Drugs or Diseases 
## Get the Enriched Annotaions for all the Diseases UP Regulated EnricR Genes 
library(foreach) 
EnrichedAnnotations <- foreach(i=1:length(DisNames), .export= c('GetGeneList'),.packages = "Matrix") %dopar% { 
    GetEnrichedAnnotations(DisNames[i],GeneAnnotations,DiseaseAnnotations,Diseases) 
} 

## Convert to Matrix 
EnrichedAnnotations <- do.call("cbind",EnrichedAnnotations) 
EnrichedAnnotations = t(EnrichedAnnotations) 

colnames(EnrichedAnnotations) = colnames(EnrichedAnnotations) 
rownames(EnrichedAnnotations) = DisNames 


## Stop the Cluster 
parallel::stopCluster(Cluster) 
+0

使用TopGO在這種情況下不走? –

+0

TopGO比上面的腳本需要更多時間來生成豐富的矩陣。 –

+0

我的感覺是,如果修改自定義函數以便將數據保存在內存中並遍歷要測試的疾病,則可以加快速度(一點)。簡而言之,不是一次傳遞1個DrugDiseaseName,而是傳遞一個向量,函數迭代......稍後我會研究它,因爲我認爲這是一個非常有趣的問題。 –

回答

0

如果您快速配置您的串行版本,可以看到幾乎所有的時間採取以下兩行:

GenesinAnnotation = DrugDiseaseGeneMatrix[,names(UPDNAnnotations)] 
GenesinAnnotation = apply(GenesinAnnotation,2,sum) 

您可以預先計算colSums(這樣你就不能計算多次同樣的事情),你實際上不需要對數據進行子集化(你可以對結果進行子集化)。

所以,你只需要通過更換它們:

GenesinAnnotation <- GenesinAnnotation0[names(UPDNAnnotations)] 

而預先計算

GenesinAnnotation0 <- colSums(DrugDiseaseGeneMatrix)