2017-04-04 54 views
0

我嘗試使用R校正拼寫錯誤的單詞或文檔的拼寫檢查器。strsplit(word,NULL)中的錯誤:使用拼寫檢查器的非字符參數

我嘗試用這個R代碼裏面做了一個字,它工作得很好的修正:

> Correct("speling", dtm = counts) 
$l4 
[1] "spelling" 

但是當我嘗試做一個文件的修正,我得到這個錯誤:

> CorrectDocument("the quick bruwn fowx jumpt ovre tha lasy dog", dtm = counts) 

Error in strsplit(word, NULL) : non-character argument 
# This is a text processing function, which I 
# borrowed from a CMU Data mining course professor. 
strip.text <- function(txt) { 
    # remove apostrophes (so "don't" -> "dont", "Jane's" -> "Janes", etc.) 
    txt <- gsub("'","",txt) 
    # convert to lowercase 
    txt <- tolower(txt) 
    # change other non-alphanumeric characters to spaces 
    txt <- gsub("[^a-z0-9]"," ",txt) 
    # change digits to # 
    txt <- gsub("[0-9]+"," ",txt) 
    # split and make one vector 
    txt <- unlist(strsplit(txt," ")) 
    # remove empty words 
    txt <- txt[txt != ""] 
    return(txt) 
} 

# Words within 1 transposition. 
Transpositions <- function(word = FALSE) { 
    N <- nchar(word) 
    if (N > 2) { 
    out <- rep(word, N - 1) 
    word <- unlist(strsplit(word, NULL)) 
    # Permutations of the letters 
    perms <- matrix(c(1:(N - 1), 2:N), ncol = 2) 
    reversed <- perms[, 2:1] 
    trans.words <- matrix(rep(word, N - 1), byrow = TRUE, nrow = N - 1) 
    for(i in 1:(N - 1)) { 
     trans.words[i, perms[i, ]] <- trans.words[i, reversed[i, ]] 
     out[i] <- paste(trans.words[i, ], collapse = "") 
    } 
    } 
    else if (N == 2) { 
    out <- paste(word[2:1], collapse = "") 
    } 
    else { 
    out <- paste(word, collapse = "") 
    } 
    return(out) 
} 

# Single letter deletions. 
Deletes <- function(word = FALSE) { 
    N <- nchar(word) 
    word <- unlist(strsplit(word, NULL)) 
    out <- list() 
    for(i in 1:N) { 
    out[i] <- paste(word[-i], collapse = "") 
    } 
    return(out) 
} 

# Single-letter insertions. 
Insertions <- function(word = FALSE) { 
    N <- nchar(word) 
    out <- list() 
    for (letter in letters) { 
    out[[letter]] <- rep(word, N + 1) 
    for (i in 1:(N + 1)) { 
     out[[letter]][i] <- paste(substr(word, i - N, i - 1), letter, 
           substr(word, i, N), sep = "") 
    } 
    } 
    out <- unlist(out) 
    return(out) 
} 

# Single-letter replacements. 
Replaces <- function(word = FALSE) { 
    N <- nchar(word) 
    out <- list() 
    for (letter in letters) { 
    out[[letter]] <- rep(word, N) 
    for (i in 1:N) { 
     out[[letter]][i] <- paste(substr(word, i - N, i - 1), letter, 
           substr(word, i + 1, N + 1), sep = "") 
    } 
    } 
    out <- unlist(out) 
    return(out) 
} 
# All Neighbors with distance "1" 
Neighbors <- function(word) { 
    neighbors <- c(word, Replaces(word), Deletes(word), 
       Insertions(word), Transpositions(word)) 
    return(neighbors) 
} 

# Probability as determined by our corpus. 
Probability <- function(word, dtm) { 
    # Number of words, total 
    N <- length(dtm) 
    word.number <- which(names(dtm) == word) 
    count <- dtm[word.number] 
    pval <- count/N 
    return(pval) 
} 

# Correct a single word. 
Correct <- function(word, dtm) { 
    neighbors <- Neighbors(word) 
    # If it is a word, just return it. 
    if (word %in% names(dtm)) { 
    out <- word 
    } 
    # Otherwise, check for neighbors. 
    else { 
    # Which of the neighbors are known words? 
    known <- which(neighbors %in% names(dtm)) 
    N.known <- length(known) 
    # If there are no known neighbors, including the word, 
    # look farther away. 
    if (N.known == 0) { 
     print(paste("Having a hard time matching '", word, "'...", sep = "")) 
     neighbors <- unlist(lapply(neighbors, Neighbors)) 
    } 
    # Then out non-words. 
    neighbors <- neighbors[which(neighbors %in% names(dtm))] 
    N <- length(neighbors) 
    # If we found some neighbors, find the one with the highest 
    # p-value. 
    if (N > 1) { 
     P <- 0*(1:N) 
     for (i in 1:N) { 
     P[i] <- Probability(neighbors[i], dtm) 
     } 
     out <- neighbors[which.max(P)] 
    } 
    # If no neighbors still, return the word. 
    else { 
     out <- word 
    } 
    } 
    return(out) 
} 

# Correct an entire document. 
CorrectDocument <- function(document, dtm) { 
    by.word <- unlist(strsplit(document, " ")) 
    N <- length(by.word) 
    for (i in 1:N) { 
    by.word[i] <- Correct(by.word[i], dtm = dtm) 
    } 
    corrected <- paste(by.word, collapse = " ") 
    return(corrected) 
} 

words <- scan("http://norvig.com/big.txt", what = character()) 
words <- strip.text(words) 
counts <- table(words) 
Correct("speling", dtm = counts) 

#---correct a document 
CorrectDocument("the quick bruwn fowx jumpt ovre tha lasy dog", dtm = counts) 

任何想法嗎?

謝謝

回答

1

功能Correct有一個bug,你應該添加一個unlist,即行:

Correct <- function(word, dtm) { 
    neighbors <- Neighbors(word) 

應更改爲:

Correct <- function(word, dtm) { 
    neighbors <- unlist(Neighbors(word)) 

編輯:

下面是該糾正文檔文件的線功能(覆蓋它):

CorrectDocumentFile <- function(file,dtm){ 
    # read the file lines 
    textLines <- unlist(readLines(file)) 

    # for each line not empty or blank, correct the text 
    for(i in which(!grepl("^\\s*$",textLines))){ 
    line <- textLines[[i]] 
    textLines[i] <- CorrectDocument(line,dtm) 
    } 

    # overwrite the file with the correction 
    writeLines(textLines, file) 
} 

用法:

CorrectDocumentFile(file="fileToBeCorrected.txt", dtm=counts) 
+0

謝謝它的作品。 – Datackatlon

+0

最後一個問題,在這一行:'CorrectDocument(「快速bruow fowx jumpt ovre tha lasy dog」,dtm = counts)',我該如何修改它以使輸入文件名包含文本?謝謝 – Datackatlon

+0

你想通過字典文件文本或文件與文本糾正? – digEmAll