我們有一組txt格式的5000個KYC文件。我需要對他們執行NER,以閃亮的應用程序中的表格形式總結報告,這可以在我們的網站中使用。然而,解析文件花了很多時間超過30分鐘:(需要完成優化,任何機構都可以提供我需要實現的一些方法,文本文件是這種格式的文件R代碼中的性能問題
名稱 - XYZ 父親姓名 - ABC 地址 - 商城路,西姆拉,印度 婚姻狀況 - 結婚 年收入 - 盧比750000 就業 - 是 Guaranter - 先生高清 信用分析師點評 - XYZ沒有信用記錄可能是一個NPA在將來............. 和其他詳細信息
涉及的步驟: 1.預處理文件名(刪除數字,空格,因爲它以pdf格式上傳,然後轉換爲文本)
創建所有列的索引姓名,父親姓名,地址,婚姻狀況,年收入,年齡,信用分析師評論)
解析每個文件的函數。使用命名實體識別和其他技術來獲取關鍵字並忽略其他詞並將其映射到相應的列。該函數的名稱是parseAKYC(文件)。
在另一個函數parseallKYC中調用此函數。
函數parseallKYC(files_path)花費太多時間來完成,當有一大堆文件時。有六個文件,它給了我幾秒鐘的結果。我想使用包並行。誰能幫我嗎 ?所顯示的大多數例子都是爲了安慰,樂於助人。我們可以使用包並行來並行地實現我定義的函數parseAllKYC嗎?
下面是最終函數parseallKYC的代碼,如下所示。
`
#code for parallel parsing library(foreach) library(iterators) library(doParallel) fileloc <- "location of 5000 KYC files" filelist <- list.files(path=fileloc,pattern = 'txt') files <- "" for (j in (1:length(filelist))) { files[j] <- paste0(fileloc,'/',filelist[j]) } no_cores <- detectCores() - 1 cl <- makeCluster(no_cores) registerDoParallel(cl) KYCTable <- foreach(i=iter(files),.combine=rbind) %dopar% { resume <- parseAKYC(i) } stopCluster(cl) #code for parseAKYC function require("NLP") require("openNLPmodels.en") require("openNLP") library(tm) library(DT) preprocessFile <- function(file) { file <- file[!duplicated(file)] file <- gsub("\\f", "", file) file <- gsub('""', "", file) file <- gsub("Page\\d+", "", file) file <- gsub("-+", "", file) file <- file[file != ""] return (file) } extract_People_Location_Org <- function(file) { file <- lapply(file, removePunctuation) file <- unlist(file) s <- as.String(file) sent_token_annotator <- Maxent_Sent_Token_Annotator() gc() word_token_annotator <- Maxent_Word_Token_Annotator() a2 <- annotate(s, list(sent_token_annotator, word_token_annotator)) ## Entity recognition for pepple's names. entity_annotator_people <- Maxent_Entity_Annotator() annotate(s, entity_annotator_people, a2) if (length(entity_annotator_people(s, a2)) == 0) { people_name <- "" } else { people_name <- s[entity_annotator_people(s, a2)] } if (length(people_name) > 1) { people_name <- people_name[!duplicated(people_name)] } result1 <- paste(people_name, collapse = ", ") ## Entity recognition for Location entity_annotator_location <- Maxent_Entity_Annotator(kind = "location") annotate(s, entity_annotator_location, a2) ## Directly: if (length(entity_annotator_location(s, a2)) == 0) { location <- "" } else { location <- s[entity_annotator_location(s, a2)] } if (length(location) > 1) { location <- location[!duplicated(location)] } result2 <- paste(location, collapse = ", ") ## Entity recognition for Organization entity_annotator_org <- Maxent_Entity_Annotator(kind = "organization") annotate(s, entity_annotator_org, a2) if (length(entity_annotator_org(s, a2)) == 0) { org <- "" } else { org <- s[entity_annotator_org(s, a2)] } if (length(org) > 1) { org <- org[!duplicated(org)] } result3 <- paste(org, collapse = ", ") return (c(result1, result2, result3)) } extractCreditAnalystComments <- function(file) { index <- makeIndex(file) CreditAnalystComments <- paste(if (length(which(index == 6)) > 0) file[(which(index == 6)[1] + 1) : (tail(which(index == 6), 1))], collapse = ", ") return (paste(CreditAnalystComments, collapse = ", ")) } makeIndex <- function(file) { # create a blank vector to store index of respective field # CODE: 1-Name 2-Job 3-Email 4-Language 5-Education 6-CreditAnalystCommentss (CreditAnalystCommentss & Expertise) # 7-Experience (Experience, Volunteer Experience, Certifications) # 8-Summary 9-Interests 10-Certifications index <- rep(0, length(file)) index[which(file == "Name")] <- 1 index[which(file == "Address")] <- 2 # index[which(grepl("@", file) == T)] <- 3 index[which(file == "Marital Status")] <- 4 index[which(file == "Annual Income")] <- 5 index[which(file == "Employed")] <- 6 index[which(file == "Guaranter")] <- 7 index[which(file == "CreditAnalystComments")] <- 8 index[which(file == "Interests")] <- 9 index[which(file == "Credit History")] <- 10 for (i in 1:(length(index)-1)) { if (index[i+1] == 0) { index[i+1] <- index[i] } } return (index) } parseAKYC <- function(file_name) { # input: a KYC in format *.txt # read file text file <- readLines(file_name, warn = F) # preprocessing file file <- preprocessFile(file) KYC <- as.list(c("Name" = character(), "CreditAnalystComments" = character(), "Employed" = character(), "Address" = character(), "Annual Income" = character(), "Guaranter" = character())) KYC$Name <- file[1] KYC$CreditAnalystComments <- extractCreditAnalystComments(file) x <- extract_People_Location_Org(file) # ------------------------------------------------------------- CreditAnalystComments.split <- unlist(strsplit(KYC$CreditAnalystComments, split = ",")) CreditAnalystComments.split <- gsub("^\\s+", "", CreditAnalystComments.split) Employed.split <- unlist(strsplit(x[3], split = ",")) Employed.split <- gsub("^\\s+", "", Employed.split) Employed_not_in_credit <- Employed.split[-which(Employed.split %in% CreditAnalystComments.split)] Employed<- paste0(Employed_not_in_CreditAnalystComments, collapse = ", ") # ------------------------------------------------------------- # ------------------------------------------------------------- Guaranter.split <- unlist(strsplit(x[1], split = ",")) Guaranter.split <- gsub("^\\s+", "", Guaranter.split) Guaranter_not_in_CreditAnalystComments <- Guaranter.split[-which(Guaranter.split %in% CreditAnalystComments.split)] Guaranter <- paste0(Guaranter_not_in_CreditAnalystComments, collapse = ", ") # ------------------------------------------------------------- KYC$Employed <- Employed # remember to change Java heap size memory to at leats 2GB KYC$Address <- x[2] #KYC$Designation <- file[2] KYC$Guaranter <- Guaranter return (as.data.frame(KYC, stringsAsFactors = F)) } parseAllKYC <- function(files_path) { KYC .df <- data.frame(Name = character(), FatherName = character(), Address = character(), maritalstatus = character(), Annualincome = character(), CreditAnalystComments= character(),stringsAsFactors=FALSE) for (i in files_path) { KYC <- parseAKYC(i) KYC.df <- rbind(KYC.df, KYC) } return (KYC.df) } #ui.R fluidPage(fluidRow(column(12,DT::dataTableOutput('tbl')))) #server.R library(shiny) library(DT) source("getKYCTable.R") function(input, output, session) { output$tbl =DT::renderDataTable(KYCTable ,filter ='top',options = list(lengthChange = FALSE) ) } `
你想運行在多個內核的代碼? – TUSHAr
是的。出錯。這裏是使用的代碼。 –
你能確保你正確地縮進你的代碼(parseAllKYC)。另外,我在parseAllKYC函數中看不到parseAResume。 – TUSHAr