2013-03-11 46 views

回答

1

我找到了一種方法來獲取每個字符的邊界框，並使用該信息來查看每個字符屬於哪個區域。

// Load the .box file output by Tesseract-OCR, and match each bounding box 
// to a character from the text. This is not trivial, because whitespaces 
// are present in the text from the .txt, but are missing from the .box 
// file. 
std::vector<RECT> loadBoxFile(const wchar_t *fileName, const std::wstring &text, int imageHeight) 
{ 
    // Open file. 
    std::ifstream st(fileName); 
    if (!st.is_open()) 
     throw std::runtime_error("Could not open .box file."); 

    std::string line; 
    std::string symbolAscii; 
    std::wstring symbol; 
    RECT r = { -1, -1, -1, -1 }; 
    std::vector<RECT> ret; 
    ret.resize(text.size(), r); 
    size_t textPos = 0; 
    while (std::getline(st, line)) { 
     // Parse a line. 
     std::istringstream iss(line); 
     if (!(iss >> symbolAscii >> r.left >> r.top >> r.right >> r.bottom)) 
      throw std::runtime_error("Could not parse .box file line."); 
     symbol = utf8to16(symbolAscii.c_str()); 

     // We don't try to get the bounding box for '~', because sometimes 
     // there is a '~' in .box file that is not there in .txt file. It's 
     // a bug in Tesseract-OCR as far as I know. This is a workaround 
     // for that case. 
     if (L"~" == symbol) 
      continue; 

     // Now match the symbol for that line to a symbol in the .txt file. 
     textPos = text.find(symbol, textPos); 

     // If we couldn't match it then fail. 
     if (text.npos == textPos) 
      throw std::runtime_error(std::string() + "Could not match symbol \"" + symbolAscii + "\" from .box file to .txt file."); 

     // Write the bounding box in the array, at index matching the symbol 
     // in the .txt file. 
     r.bottom = imageHeight - r.bottom; 
     r.top = imageHeight - r.top; 
     for (int ii = 0; ii < symbol.size(); ii++) 
      ret[textPos + ii] = r; 

     // Now increment textPos() so we start searching after the last 
     // symbol of the currently found symbol. 
     textPos += symbol.size(); 
    } 
    return ret; 
}