Flesch-Kincaid可讀性測試

5

不是開源的，但你使用the ReadabilityStatistic interface可以委派到Word。即使您的文檔不是以Word開頭，您也可以打開Word（對用戶無形），將文本轉儲到Word中，然後使用ReadabilityStatistic來計算統計信息。

來源

2012-02-05 17:29:49

0

我很驚訝這裏沒有圖書館，但你真的需要它嗎？

如果你能得到你的原始文本，計算是相當微不足道的。

查看這個（PHP）實現的源代碼可以發現，計數音節的做法和計數句子類似，都是使用正則表達式進行分割：句子按「.!?」分割，音節則按所有元音字母 aeiouy 分割。

來源

2012-06-29 07:06:28

+1

這是一個非常粗略的假設，見http://stackoverflow.com/a/1076924/1226839有關爲什麼的更多信息（或者看看這個句子中的多元音字:-) – Nathan 2014-09-15 13:00:42

0

在Java中有一個開源的解決方案 - 它不是.Net，但它是相對清晰的代碼，你可能可以翻譯：https://github.com/ipeirotis/ReadabilityMetrics（Java），它依次基於http://search.cpan.org/author/GREGFAST/Lingua-EN-Syllable-0.251/（在Perl中）。

來源

2014-09-15 13:05:02 Nathan

1

正如 Flesch–Kincaid 年級水平（Grade Level）公式所描述的：

https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

你需要計算單詞，句子和音節。音節也許是最棘手的，儘管句子也需要一些思考。

下面是我把別人的代碼翻譯成 F# 的兩個音節計數實現（F# 同樣是 .NET 語言，你可以在 Visual Studio 中創建一個 F# 項目，然後從你的 C# 項目引用該項目）。我做過基本的測試，但並不全面。

我發現 Ipeirotis 的實現（在我加入問題單詞列表之後）在我的部分測試用例上比 Child 的實現結果更好。我的測試單詞是：

// Sample words used to compare the two syllable counters.
let testWords =
    [| "abalone"
       "gracious"
       "atheism"
       "unaware"
       "seaside"
       "underwater"
       "wonderwoman"
       "biology" |]

Child 的代碼在列表末尾的幾個單詞上問題尤其明顯。把正則表達式中的詞綴按從最長到最短重新排列，似乎也沒有解決這個問題。

我的翻譯：

module Readability 

open System.Text.RegularExpressions 
// Irregular words whose syllable counts are looked up instead of computed.
// Used by both SyllableCount2 and SyllableCount, so it is defined first:
// F# resolves top-level bindings in file order.
// Entries marked "ap" are plural forms added by the translator ("andrew plural").
let problemWordMap = 
    dict [
        ("abalone", 4)
        ("abare", 3)
        ("abed", 2)
        ("abruzzese", 4)
        ("abbruzzese", 4)
        ("aborigine", 5)
        ("aborigines", 5) //ap
        ("acreage", 3)
        ("acreages", 3) //ap (was a duplicate "acreage" key)
        ("adame", 3)
        ("adieu", 2)
        ("adobe", 3)
        ("anemone", 4)
        ("anemones", 4) //ap
        ("apache", 3)
        ("apaches", 3) //ap
        ("aphrodite", 4)
        ("apostrophe", 4)
        ("apostrophes", 4) //ap
        ("ariadne", 4)
        ("cafe", 2)
        ("cafes", 2) //ap
        ("calliope", 4)
        ("catastrophe", 4)
        ("catastrophes", 4) //ap
        ("chile", 2)
        ("chiles", 2) //ap
        ("chloe", 2)
        ("circe", 2)
        ("coyote", 3)
        ("coyotes", 3) //ap
        ("epitome", 4)
        ("forever", 3)
        ("gethsemane", 4)
        ("guacamole", 4)
        ("guacamoles", 4) //ap
        ("hyperbole", 4)
        ("hyperboles", 4) //ap
        ("jesse", 2)
        ("jukebox", 2)
        ("jukeboxes", 3) //ap (juke-box-es; the plural adds a syllable)
        ("karate", 3)
        ("karates", 3) //ap
        ("machete", 3)
        ("maybe", 2)
        ("people", 2)
        ("recipe", 3)
        ("sesame", 3)
        ("shoreline", 2)
        ("simile", 3)
        ("machetes", 3) //ap
        ("maybes", 2) //ap
        ("peoples", 2) //ap
        ("recipes", 3) //ap
        ("sesames", 3) //ap
        ("shorelines", 2) //ap
        ("similes", 3) //ap
        ("syncope", 3)
        ("tamale", 3)
        ("tamales", 3) //ap
        ("yosemite", 4)
        ("daphne", 2)
        ("eurydice", 4)
        ("euterpe", 3)
        ("hermione", 4)
        ("penelope", 4)
        ("persephone", 4)
        ("phoebe", 2)
        ("zoe", 2)
    ]

//for syllables 
//simpler: 
//https://github.com/ipeirotis/ReadabilityMetrics/blob/master/src/main/java/com/ipeirotis/readability/engine/Syllabify.java 

/// <summary>
/// Estimates the number of syllables in a single word: counts the vowel groups of the
/// word (after dropping one trailing "e"), then adds or subtracts one for every
/// AddSyl / SubSyl pattern found in it. Words listed in problemWordMap bypass the heuristic.
/// </summary>
/// <param name="word">The word to measure; matching is case-insensitive.</param>
/// <returns>The estimated syllable count, never less than 1 (the empty string also yields 1).</returns>
let SyllableCount2 (word:string) = 
    // Each pattern found in the word removes one syllable from the vowel-group count.
    let SubSyl = [| "cial"; "tia"; "cius"; "cious"; "giu"; "ion"; "iou"; "sia$"; ".ely$" |] 
    // Each pattern found in the word adds one syllable to the vowel-group count.
    let AddSyl = [| "ia"; "riet"; "dien"; "iu"; "io"; "ii"; "[aeiouym]bl$"; "[aeiou]{3}"; "^mc"; "ism$"; "[^aeiouy][^aeiouy]l$"; "[^l]lien"; "^coa[dglx]."; "[^gq]ua[^auieo]"; "dnt$" |] 

    // Invariant lower-casing keeps the ASCII-only patterns and table keys matching
    // regardless of the current culture.
    let lowered = word.ToLowerInvariant()

    // Look up the lower-cased form so "Abalone" and "abalone" agree.
    match problemWordMap.TryGetValue(lowered) with
    | true, known -> known
    | _ ->
        let cleaned = lowered.Replace("'", " ")
        if cleaned = "i" || cleaned = "a" then
            1
        else
            // A single trailing "e" is dropped before counting.
            let stem =
                if cleaned.EndsWith("e") then cleaned.Substring(0, cleaned.Length - 1)
                else cleaned

            let countMatching (patterns: string[]) =
                patterns
                |> Array.filter (fun pattern -> Regex.IsMatch(stem, pattern))
                |> Array.length

            // Every maximal run of vowels counts as one syllable.
            let vowelGroups =
                Regex.Split(stem, "[^aeiouy]+")
                |> Array.filter (fun group -> group.Length > 0)
                |> Array.length

            let singleLetterBonus = if stem.Length = 1 then 1 else 0

            let total = vowelGroups + countMatching AddSyl - countMatching SubSyl + singleLetterBonus

            // Clamp so that zero or negative totals still report one syllable.
            max total 1

//https://github.com/DaveChild/Text-Statistics/blob/master/src/DaveChild/TextStatistics/Syllables.php 

// Letter sequences that the vowel-group count treats as two syllables although they
// are pronounced as one. All alternatives are joined with "|" into a single Regex;
// SyllableCount subtracts the number of matches.
let oneSyllableCorrection = 
    let patterns =
        [
            "cia(l|$)" // glacial, acacia
            "tia"
            "cius"
            "cious"
            "[^aeiou]giu"
            "[aeiouy][^aeiouy]ion"
            "iou"
            "sia$"
            "eous$"
            "[oa]gue$"
            ".[^aeiuoycgltdb]{2,}ed$"
            ".ely$"
            // Disabled: "[cg]h?ed?$"
            // Disabled: "rved?$"
            // Disabled: "[aeiouy][dt]es?$"
            // Disabled: "^[dr]e[aeiou][^aeiou]+$" (sorts out deal, deign etc.)
            // Disabled: "[aeiouy]rse$" (purse, hearse)
            "^jua"
            // Disabled: "nne[ds]?$" (canadienne)
            "uai" // acquainted
            "eau" // champeau
            // Disabled: "pagne[ds]?$" (champagne)
            // Disabled: "[aeiouy][^aeiuoytdbcgrnzs]h?e[rsd]?$"

            // Soft "e" ending. Don't touch this rule unless you absolutely have to!
            // Words for testing a new version (add "r", "s" and "d" where possible
            // to test fully): absolve, acquiesce, audience, ache, acquire, brunelle,
            // byrne, canadienne, coughed, curved, champagne, designate, force, lace,
            // late, lathe, make, relayed, scrounge, side, sideline, some, wide, taste.
            "[aeiouy](b|c|ch|d|dg|f|g|gh|gn|k|l|ll|lv|m|mm|n|nc|ng|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y|z)e$"

            // Soft "e" ending followed by "d". Test words: crunched, forced, hated,
            // sided, sidelined, unexploded, unexplored, scrounged, squelched.
            "[aeiouy](b|c|ch|dg|f|g|gh|gn|k|l|lch|ll|lv|m|mm|n|nc|ng|nch|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|th|v|y|z)ed$"

            // Soft "e" ending followed by "s". Test words: absences, accomplices,
            // acknowledges, advantages, byrnes, crunches, forces, scrounges, squelches.
            "[aeiouy](b|ch|d|f|gh|gn|k|l|lch|ll|lv|m|mm|n|nch|nn|p|r|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y)es$"

            "^busi$"
        ]
    Regex(String.concat "|" patterns)


// Letter sequences that the vowel-group count treats as one syllable although they
// are pronounced as two. All alternatives are joined with "|" into a single Regex;
// SyllableCount adds the number of matches.
let twoSyllableCorrection = 
    let patterns =
        [
            "([^s]|^)ia"
            "riet"
            "dien" // audience
            "iu"
            "io"
            "eo($|[b-df-hj-np-tv-z])"
            "ii"
            "[ou]a$"
            "[aeiouym]bl$"
            "[aeiou]{3}"
            "[aeiou]y[aeiou]"
            "^mc"
            "ism$"
            "asm$"
            "thm$"
            "([^aeiouy])\1l$"
            "[^l]lien"
            "^coa[dglx]."
            "[^gq]ua[^auieo]"
            "dnt$"
            "uity$"
            "[^aeiouy]ie(r|st|t)$"
            "eings?$"
            "[aeiouy]sh?e[rsd]$"
            "iell"
            "dea$"
            "real" // real, cereal
            "[^aeiou]y[ae]" // bryan, byerley
            "gean$" // aegean
            "uen" // influence, affluence
        ]
    Regex(String.concat "|" patterns)

// Prefixes ("^...") and suffixes ("...$") that each stand for exactly one syllable,
// combined into a single alternation Regex.
let oneSyllableAffix = 
    let prefixes =
        [ "^un"; "^fore"; "^ware"; "^none?"; "^out"; "^post"; "^sub"; "^pre"; "^pro"; "^dis"; "^side" ]
    let suffixes =
        [ "ly$"; "less$"; "some$"; "ful$"; "ers?$"; "ness$"; "cians?$"; "ments?$"; "ettes?$"
          "villes?$"; "ships?$"; "sides?$"; "ports?$"; "shires?$"; "tion(ed)?$" ]
    // Prefixes stay ahead of suffixes so the alternation order is unchanged.
    Regex(String.concat "|" (prefixes @ suffixes))

// Prefixes ("^...") and suffixes ("...$") that each stand for two syllables,
// combined into a single alternation Regex.
let twoSyllableAffix = 
    let prefixes =
        [ "^above"; "^ant[ie]"; "^counter"; "^hyper"; "^afore"; "^agri"; "^in[ft]ra"; "^inter"
          "^over"; "^semi"; "^ultra"; "^under"; "^extra"; "^dia"; "^micro"; "^mega"; "^kilo"
          "^pico"; "^nano"; "^macro" ]
    let suffixes = [ "berry$"; "woman$"; "women$" ]
    // Prefixes stay ahead of suffixes so the alternation order is unchanged.
    Regex(String.concat "|" (prefixes @ suffixes))

// Suffixes that each stand for three syllables, combined into a single alternation Regex.
let threeSyllableAffix = 
    Regex(String.concat "|" [ "ology$"; "ologist$"; "onomy$"; "onomist$" ])

/// <summary> 
/// Removes every match of the regex from the input word and reports how many
/// matches were removed.
/// </summary> 
/// <param name="regex">Pattern whose matches are stripped from the word.</param> 
/// <param name="word">Word to strip.</param> 
/// <returns>A tuple of the stripped word and the number of matches.</returns>
let RegexReplace (regex:Regex) (word:string) = 
    let matchCount = regex.Matches(word).Count
    // Regex.Replace removes text only at the matched positions. Replacing the matched
    // *value* with String.Replace would also delete identical text elsewhere in the
    // word (e.g. every "er" in "underwater" instead of just the trailing one).
    regex.Replace(word, ""), matchCount

/// <summary>
/// Returns how many times the regex matches within the word.
/// </summary>
let CountMatches (regex:Regex) word = 
    regex.Matches(word).Count

/// <summary> 
/// Counts syllables in word. Assumes word has already been "cleaned" 
/// (the lookup table and all patterns are lower-case).
/// Known affixes are stripped and credited with their fixed syllable counts, the
/// vowel groups of the remaining stem are counted, and the result is adjusted by
/// the one-/two-syllable correction patterns.
/// </summary> 
/// <param name="word">The word to measure.</param> 
/// <returns>The estimated syllable count, never less than 1.</returns>
let SyllableCount(word : string) = 
    match problemWordMap.TryGetValue(word) with
    | true, known -> known
    | _ ->
        // Remove and count affixes, shortest syllable value first.
        let minusOneAffix, oneAffixCount = RegexReplace oneSyllableAffix word 
        let minusTwoAffix, twoAffixCount = RegexReplace twoSyllableAffix minusOneAffix 
        let stem, threeAffixCount = RegexReplace threeSyllableAffix minusTwoAffix 

        // Every non-empty run of vowels left in the stem counts as one syllable.
        let wordPartCount =
            Regex.Split(stem, "[^aeiouy]")
            |> Array.filter (fun part -> part.Length > 0)
            |> Array.length

        // Each removed affix contributes as many syllables as its table promises:
        // "woman$" is worth two, "ology$" three.
        let affixSyllables = oneAffixCount + 2 * twoAffixCount + 3 * threeAffixCount

        // Corrections apply to the stripped stem: the removed affixes are already fully
        // accounted for, and patterns such as "^busi$" can only match after stripping.
        let countedTwiceCount = CountMatches oneSyllableCorrection stem // counted two, is one: subtract
        let countedOnceCount = CountMatches twoSyllableCorrection stem // counted one, is two: add

        let total = affixSyllables + wordPartCount - countedTwiceCount + countedOnceCount

        // We always have at least 1 syllable in a word.
        max total 1

要處理的句子計數，我用的NuGet包斯坦福解析器和創建這個包裝：

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Threading.Tasks; 
using edu.stanford.nlp.process; 
using edu.stanford.nlp.util; 

namespace StanfordWrapper 
{ 
    /// <summary>
    /// .NET-facing wrapper that splits text into sentences using the Stanford
    /// DocumentPreprocessor.
    /// </summary>
    public class SentenceTokenizer 
    { 
        /// <summary>
        /// Tokenizer factory shared by every call to <see cref="Go"/>.
        /// NOTE(review): the option string presumably keeps brackets un-normalized and
        /// retains original whitespace — confirm against the PTBTokenizer documentation.
        /// </summary>
        public static readonly TokenizerFactory TokenizerFactory = PTBTokenizer.factory(
            new CoreLabelTokenFactory(), 
            "normalizeParentheses=false,normalizeOtherBrackets=false,invertible=true"); 

        /// <summary>
        /// Splits <paramref name="input"/> into sentences.
        /// </summary>
        /// <param name="input">Text to split.</param>
        /// <returns>One string per sentence yielded by the preprocessor, in order.</returns>
        public static List<string> Go(string input) 
        { 
            var preprocessor = new DocumentPreprocessor(new java.io.StringReader(input)); 
            preprocessor.setTokenizerFactory(TokenizerFactory); 

            var sentences = new List<string>(); 
            foreach (java.util.List tokens in preprocessor) 
            { 
                // Each item is the token list of one sentence; join it back into text.
                sentences.Add(StringUtils.joinWithOriginalWhiteSpace(tokens)); 
            } 

            return sentences; 
        } 
    } 
}

由於解析器是用 Java 編寫的，加一層包裝會很有幫助。該 NuGet 包使用 IKVMC 使其可以被 .NET 調用。

最後，爲了統計單詞數，我使用了一些代碼來清理文本並進行分詞（tokenize）：

module TextNormalizer 

open System; 
open System.Collections.Generic; 
open System.Linq; 
open System.Text.RegularExpressions; 

// Runs of whitespace; Normalize collapses each run to a single space.
let spaceRegex = Regex(@"\s+")
// Anything that is neither a word character nor whitespace, plus digit runs and underscores.
let normalizeTextRegexStrict = Regex(String.Join("|", [| @"[^\w\s]"; @"[0-9]+"; "_" |]), RegexOptions.Compiled)
// Same as the strict pattern, except that apostrophes are left untouched.
let normalizeTextRegexApostrophe = Regex(String.Join("|", [| @"[^'\w\s]"; @"[0-9]+"; "_" |]), RegexOptions.Compiled)

/// <summary> 
/// Replaces all punctuation, digits and underscores with whitespace (apostrophes
/// optionally kept). Returns the original text with those characters removed,
/// lowercased, and with words separated by single spaces.
/// </summary> 
/// <param name="normedLine">Text to normalize.</param> 
/// <param name="removeApostrophe">True to strip apostrophes as well; false to keep them.</param> 
/// <returns>The normalized text; empty when nothing but stripped characters remain.</returns>
let Normalize(normedLine : string) (removeApostrophe : bool) = 
    let punctuation =
        if removeApostrophe then normalizeTextRegexStrict
        else normalizeTextRegexApostrophe

    // Swap every stripped character for a space so adjacent words stay separated.
    let stripped = punctuation.Replace(normedLine, " ")

    spaceRegex.Replace(stripped, " ") // reduce contiguous whitespace to a single space
        .Trim()                       // drop whitespace at either end
        .ToLowerInvariant()           // culture-independent, so "I" never becomes a dotless i

有了所有這些東西是微不足道的計算FK：

/// <summary>
/// Computes the Flesch-Kincaid grade level of a text:
/// 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59.
/// Sentences come from StanfordWrapper.SentenceTokenizer, words from
/// TextNormalizer.Normalize (apostrophes kept), syllables from SyllableCount2.
/// </summary>
/// <param name="text">The text to score.</param>
/// <returns>The grade level, or NaN when the text contains no sentences or no words.</returns>
let FleshKincaidGradeLevel(text) = 
    let sentences = StanfordWrapper.SentenceTokenizer.Go(text) |> Seq.toArray 

    let words =
        sentences
        |> Array.collect (fun sentence -> (TextNormalizer.Normalize sentence false).Split(' '))
        // A sentence that normalizes to "" splits into one empty token; drop those so
        // they are not counted as one-syllable words.
        |> Array.filter (fun token -> token.Length > 0)

    if sentences.Length = 0 || words.Length = 0 then
        // The ratios are undefined without sentences or words; make the NaN explicit.
        nan
    else
        let syllableCount = words |> Array.sumBy SyllableCount2 

        //FKGL formula: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests 
        (0.39 * (float words.Length)/(float sentences.Length)) + (11.8 * (float syllableCount)/(float words.Length)) - 15.59

來源

2015-11-12 20:28:46

Flesch-Kincaid可讀性測試

回答

相關問題