名稱 - XYZ 父親姓名 - ABC 地址 - 商城路,西姆拉,印度 婚姻狀況 - 已婚 年收入 - 750000 盧比 就業 - 是 Guaranter - 先生高清 信用分析師點評 - XYZ 沒有信用記錄,將來可能成爲 NPA(不良資產)............. 以及其他詳細信息
涉及的步驟: 1.預處理文件名(刪除數字,空格,因爲它以pdf格式上傳,然後轉換爲文本)
當文件數量很多時,函數 parseAllKYC(files_path) 需要花費太多時間才能完成。只有六個文件時,它幾秒鐘就能給出結果。我想使用 parallel 包。誰能幫我嗎?我看到的大多數例子都是針對 sapply、lapply 的。我們可以使用 parallel 包來並行地執行我定義的函數 parseAllKYC 嗎?
# ---- getKYCTable.R: packages ----
library(foreach)
library(iterators)
library(doParallel)
library(NLP)
library(openNLPmodels.en)
library(openNLP)
library(tm)
library(DT)

# ---- getKYCTable.R: parsing helpers ----

# Clean the raw lines of one KYC text file.
#
# file: character vector, one element per line of the file.
# Returns the lines with duplicates dropped, form feeds, doubled double
# quotes, "Page<digits>" markers and runs of dashes stripped, and lines
# that end up empty removed.
preprocessFile <- function(file) {
  file <- file[!duplicated(file)]
  file <- gsub("\\f", "", file)
  file <- gsub('""', "", file)
  file <- gsub("Page\\d+", "", file)
  file <- gsub("-+", "", file)
  file[file != ""]
}

# Run openNLP named-entity recognition over the preprocessed lines.
#
# file: character vector of preprocessed lines.
# Returns a character vector of length 3: the distinct person, location and
# organization entities found, each collapsed into one ", "-separated string
# ("" when nothing of that kind was found).
extract_People_Location_Org <- function(file) {
  # Nothing to annotate: skip loading the models altogether.
  if (length(file) == 0L) {
    return(c("", "", ""))
  }

  s <- as.String(removePunctuation(file))

  sent_token_annotator <- Maxent_Sent_Token_Annotator()
  # Kept from the original code; presumably relieves memory pressure between
  # model loads -- TODO confirm it is still needed.
  gc()
  word_token_annotator <- Maxent_Word_Token_Annotator()
  a2 <- annotate(s, list(sent_token_annotator, word_token_annotator))

  # Each entity annotator is run exactly once per kind; running it is the
  # expensive step, so its result is reused rather than recomputed.
  collect_entities <- function(kind) {
    entity_annotator <- Maxent_Entity_Annotator(kind = kind)
    found <- entity_annotator(s, a2)
    if (length(found) == 0L) {
      return("")
    }
    paste(unique(s[found]), collapse = ", ")
  }

  c(
    collect_entities("person"),
    collect_entities("location"),
    collect_entities("organization")
  )
}

# Tag every line with the numeric code of the section it belongs to.
#
# file: character vector of preprocessed lines.
# A line that is exactly one of the known headers gets that header's code
# (Name 1, Address 2, Marital Status 4, Annual Income 5, Employed 6,
# Guaranter 7, CreditAnalystComments 8, Interests 9, Credit History 10).
# Every other line inherits the code of the line before it; lines before the
# first header keep code 0.
# Returns a numeric vector the same length as `file`.
makeIndex <- function(file) {
  section_codes <- c(
    "Name" = 1,
    "Address" = 2,
    "Marital Status" = 4,
    "Annual Income" = 5,
    "Employed" = 6,
    "Guaranter" = 7,
    "CreditAnalystComments" = 8,
    "Interests" = 9,
    "Credit History" = 10
  )

  index <- unname(section_codes[match(file, names(section_codes))])
  index[is.na(index)] <- 0

  # Forward-fill; seq_along(...)[-1] is empty for inputs of length 0 or 1,
  # so short inputs no longer index past the end of the vector.
  for (i in seq_along(index)[-1]) {
    if (index[i] == 0) {
      index[i] <- index[i - 1]
    }
  }
  index
}

# Pull the credit analyst's comments out of the preprocessed lines.
#
# file: character vector of preprocessed lines.
# Returns one string: the lines from just after the first line tagged as
# "CreditAnalystComments" up to the last line carrying that tag, joined with
# ", ". Returns "" when the section is missing or contains only its header.
extractCreditAnalystComments <- function(file) {
  # Code that makeIndex() assigns to the "CreditAnalystComments" header.
  comments_code <- 8
  section <- which(makeIndex(file) == comments_code)

  # With fewer than two tagged lines there is no content after the header;
  # building the range anyway would run backwards and return the header.
  if (length(section) < 2L) {
    return("")
  }
  paste(file[(section[1] + 1L):section[length(section)]], collapse = ", ")
}

# Parse a single KYC text file into a one-row data frame.
#
# file_name: path to a KYC file in *.txt format.
# Returns a data frame with character columns Name, CreditAnalystComments,
# Employed, Address and Guaranter.
# Remember to raise the Java heap size to at least 2GB before parsing.
parseAKYC <- function(file_name) {
  file <- preprocessFile(readLines(file_name, warn = FALSE))

  comments <- extractCreditAnalystComments(file)
  entities <- extract_People_Location_Org(file)

  # Split a ", "-joined string into items with leading whitespace removed.
  split_items <- function(x) {
    gsub("^\\s+", "", unlist(strsplit(x, split = ",")))
  }
  comment_items <- split_items(comments)

  # Keep only the entities that do not also appear among the comment items.
  # A logical mask is used because x[-which(cond)] would drop everything
  # whenever no element matches.
  drop_comment_items <- function(x) {
    items <- split_items(x)
    paste0(items[!(items %in% comment_items)], collapse = ", ")
  }

  data.frame(
    Name = file[1],
    CreditAnalystComments = comments,
    Employed = drop_comment_items(entities[3]),
    Address = entities[2],
    Guaranter = drop_comment_items(entities[1]),
    stringsAsFactors = FALSE
  )
}

# Parse several KYC files sequentially.
#
# files_path: character vector of file paths.
# Returns a data frame with one row per file and the columns produced by
# parseAKYC(); a zero-row data frame with those columns when no paths are
# given.
parseAllKYC <- function(files_path) {
  parsed <- lapply(files_path, parseAKYC)
  if (length(parsed) == 0L) {
    return(data.frame(
      Name = character(),
      CreditAnalystComments = character(),
      Employed = character(),
      Address = character(),
      Guaranter = character(),
      stringsAsFactors = FALSE
    ))
  }
  # Bind once at the end instead of growing a data frame inside a loop.
  do.call(rbind, parsed)
}

# ---- getKYCTable.R: parallel parsing driver ----
fileloc <- "location of 5000 KYC files"
files <- list.files(path = fileloc, pattern = "\\.txt$", full.names = TRUE)

# Leave one core free, but never ask for fewer than one worker.
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(no_cores)
registerDoParallel(cl)

# parseAKYC is referenced in the loop body, so it is not listed in .export;
# the helpers it calls and the packages they use are named explicitly so the
# workers receive them. The cluster is stopped even if parsing fails.
KYCTable <- tryCatch(
  foreach(
    i = iter(files),
    .combine = rbind,
    .packages = c("NLP", "openNLP", "openNLPmodels.en", "tm"),
    .export = c(
      "preprocessFile",
      "extract_People_Location_Org",
      "extractCreditAnalystComments",
      "makeIndex"
    )
  ) %dopar% {
    parseAKYC(i)
  },
  finally = stopCluster(cl)
)

# ---- ui.R ----
fluidPage(fluidRow(column(12, DT::dataTableOutput("tbl"))))

# ---- server.R ----
library(shiny)
library(DT)
source("getKYCTable.R")

function(input, output, session) {
  output$tbl <- DT::renderDataTable(
    KYCTable,
    filter = "top",
    options = list(lengthChange = FALSE)
  )
}
