2015-09-07 53 views
1

我有一個導入.xlsx文件的類。我把它從這個例子中拿出來,並根據我的需要進行了修改: https://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java 現在應用程序處理一些文件就好了,其他的完全沒有。如果我在其中一個不工作的文件中更改單個字段或字符,並再次保存它們,則整個內容都會正確處理。有沒有人有一個想法可能是什麼原因(恕我直言,它位於原始Excel文件中的某處)。Apache POI Streaming API無法識別Excel(xlsx)內容

敬啓者有所幫助,這裏是我的代碼:

package com.goodgamestudios.icosphere.service.fileReader; 

    import com.goodgamestudios.icosphere.datamodel.DataSet; 
    import com.goodgamestudios.icosphere.datamodel.Tuple; 
    import java.io.File; 
    import java.io.IOException; 
    import java.io.InputStream; 
    import org.apache.poi.openxml4j.exceptions.OpenXML4JException; 
    import org.apache.poi.openxml4j.opc.OPCPackage; 
    import org.apache.poi.ss.usermodel.BuiltinFormats; 
    import org.apache.poi.ss.usermodel.DataFormatter; 
    import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; 
    import org.apache.poi.xssf.eventusermodel.XSSFReader; 
    import org.apache.poi.xssf.model.SharedStringsTable; 
    import org.apache.poi.xssf.model.StylesTable; 
    import org.apache.poi.xssf.usermodel.XSSFCellStyle; 
    import org.apache.poi.xssf.usermodel.XSSFRichTextString; 
    import org.slf4j.Logger; 
    import org.slf4j.LoggerFactory; 
    import org.xml.sax.Attributes; 
    import org.xml.sax.InputSource; 
    import org.xml.sax.SAXException; 
    import org.xml.sax.XMLReader; 
    import org.xml.sax.helpers.DefaultHandler; 
    import org.xml.sax.helpers.XMLReaderFactory; 


    public class ExcelFileReader implements FileReader { 

    static final Logger LOG = LoggerFactory.getLogger(ExcelFileReader.class); 
    private SheetHandler handler; 

    @Override 
    public DataSet getDataFromFile(File file) throws IOException { 

     LOG.info("Start ingesting file {}"); 
     try { 
      OPCPackage pkg = OPCPackage.open(file); 
      XSSFReader reader = new XSSFReader(pkg); 
      StylesTable styles = reader.getStylesTable(); 
      ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg); 

      SharedStringsTable sst = reader.getSharedStringsTable(); 
      XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); 
      handler = new SheetHandler(styles, strings, 24); 
      parser.setContentHandler(handler); 

      // rId2 found by processing the Workbook 
      // Seems to either be rId# or rSheet# 
      System.out.println("yooooo 1"); 
      InputStream sheet2 = reader.getSheet("rId2"); 
      System.out.println("yooooo 2"); 
      InputSource sheetSource = new InputSource(sheet2); 
      System.out.println("yooooo 3"); 
      parser.parse(sheetSource); 
      LOG.debug("{} rows parsed", handler.getData().getRows().size() + 1); 
      sheet2.close(); 
      return handler.getData(); 

     } catch (OpenXML4JException | SAXException ex) { 
      LOG.warn("Unable to parse file {}", file.getName()); 
      LOG.warn("Exception: {}: ", ex); 
     } 

     return null; 
    } 

    /** 
    * See org.xml.sax.helpers.DefaultHandler javadocs 
    * 
    * Derived from http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api 
    * <p/> 
    * Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at 
    * http://www.ecma-international.org/publications/standards/Ecma-376.htm 
    * <p/> 
    * A web-friendly version is http://openiso.org/Ecma/376/Part4 
    */ 
    private static class SheetHandler extends DefaultHandler { 

     boolean isFirstRow = true; 
     private int quantityOfColumns; 
     private int currentColumnNumber = 1; 
     int currentRowNumber = 1; 
     private int rowNumberOfLastCell = 1; 
     private DataSet data = new DataSet(); 
     private Tuple tuple; 

     /** 
     * Table with styles 
     */ 
     private StylesTable stylesTable; 

     /** 
     * Table with unique strings 
     */ 
     private ReadOnlySharedStringsTable sharedStringsTable; 

     /** 
     * Number of columns to read starting with leftmost 
     */ 
     private final int minColumnCount; 

     // Set when V start element is seen 
     private boolean vIsOpen; 

     // Set when cell start element is seen; 
     // used when cell close element is seen. 
     private xssfDataType nextDataType; 

     // Used to format numeric cell values. 
     private short formatIndex; 
     private String formatString; 
     private final DataFormatter formatter; 

     // The last column printed to the output stream 
     private int lastColumnNumber = -1; 

     // Gathers characters as they are seen. 
     private StringBuffer value; 

     static final Logger LOG = LoggerFactory.getLogger(SheetHandler.class); 

     private SheetHandler(StylesTable styles, 
       ReadOnlySharedStringsTable strings, 
       int cols) { 
      this.stylesTable = styles; 
      this.sharedStringsTable = strings; 
      this.minColumnCount = cols; 
      this.value = new StringBuffer(); 
      this.nextDataType = xssfDataType.NUMBER; 
      this.formatter = new DataFormatter(); 
      LOG.debug("Sheethandler created"); 
     } 

     /* 
     * (non-Javadoc) 
     * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) 
     */ 
     public void startElement(String uri, String localName, String name, 
       Attributes attributes) throws SAXException { 
      System.out.println("yooooooooooo start:uri:" + uri + " localname: " + localName + " name: " + name); 
      if ("inlineStr".equals(name) || "v".equals(name)) { 
       vIsOpen = true; 
       // Clear contents cache 
       value.setLength(0); 
      } // c => cell 
      else if ("c".equals(name)) { 
       // Get the cell reference 
       String r = attributes.getValue("r"); 
       int firstDigit = -1; 
       for (int c = 0; c < r.length(); ++c) { 
        if (Character.isDigit(r.charAt(c))) { 
         firstDigit = c; 
         break; 
        } 
       } 
       currentColumnNumber = nameToColumn(r.substring(0, firstDigit)); 
       System.out.println("colu mn " + currentColumnNumber); 

       // Set up defaults. 
       this.nextDataType = xssfDataType.NUMBER; 
       this.formatIndex = -1; 
       this.formatString = null; 
       String cellType = attributes.getValue("t"); 
       String cellStyleStr = attributes.getValue("s"); 
       if ("b".equals(cellType)) { 
        nextDataType = xssfDataType.BOOL; 
       } else if ("e".equals(cellType)) { 
        nextDataType = xssfDataType.ERROR; 
       } else if ("inlineStr".equals(cellType)) { 
        nextDataType = xssfDataType.INLINESTR; 
       } else if ("s".equals(cellType)) { 
        nextDataType = xssfDataType.SSTINDEX; 
       } else if ("str".equals(cellType)) { 
        nextDataType = xssfDataType.FORMULA; 
       } else if (cellStyleStr != null) { 
        // It's a number, but almost certainly one 
        // with a special style or format 
        XSSFCellStyle style = null; 
        if (cellStyleStr != null) { 
         int styleIndex = Integer.parseInt(cellStyleStr); 
         style = stylesTable.getStyleAt(styleIndex); 
        } else if (stylesTable.getNumCellStyles() > 0) { 
         style = stylesTable.getStyleAt(0); 
        } 
        if (style != null) { 
         this.formatIndex = style.getDataFormat(); 
         this.formatString = style.getDataFormatString(); 
         if (this.formatString == null) { 
          this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex); 
         } 
        } 
       } 
      } 

     } 

     /* 
     * (non-Javadoc) 
     * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String) 
     */ 
     public void endElement(String uri, String localName, String name) 
       throws SAXException { 

      String thisStr = null; 

      // v => contents of a cell 
      if ("v".equals(name)) { 
       // Process the value contents as required. 
       // Do now, as characters() may be called more than once 
       switch (nextDataType) { 

        case BOOL: 
         char first = value.charAt(0); 
         thisStr = first == '0' ? "FALSE" : "TRUE"; 
         break; 

        case ERROR: 
         thisStr = "\"ERROR:" + value.toString() + '"'; 
         break; 

        case FORMULA: 
         // A formula could result in a string value, 
         // so always add double-quote characters. 
         thisStr = '"' + value.toString() + '"'; 
         break; 

        case INLINESTR: 
         // TODO: have seen an example of this, so it's untested. 
         XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); 
         thisStr = '"' + rtsi.toString() + '"'; 
         break; 

        case SSTINDEX: 
         String sstIndex = value.toString(); 
         try { 
          int idx = Integer.parseInt(sstIndex); 
          XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)); 
          thisStr = rtss.toString(); 
         } catch (NumberFormatException ex) { 
          System.out.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString()); 
         } 
         break; 

        case NUMBER: 
         String n = value.toString(); 
         if (this.formatString != null && n.length() > 0) { 
          thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString); 
         } else { 
          thisStr = n; 
         } 
         break; 

        default: 
         thisStr = "(TODO: Unexpected type: " + nextDataType + ")"; 
         break; 
       } 

       // Output after we've seen the string contents 
       // Emit commas for any fields that were missing on this row 
       if (lastColumnNumber == -1) { 
        lastColumnNumber = 0; 
       } 
       for (int i = lastColumnNumber; i < currentColumnNumber; ++i) { 
       } 

       // Might be the empty string. 
       System.out.println(thisStr); 
       if (isFirstRow) { 
        data.getHeaders().add(thisStr); 
       } else { 
        tuple.getRowEntries()[currentColumnNumber] = thisStr; 
       } 
       // Update column 
       if (currentColumnNumber > -1) { 
        lastColumnNumber = currentColumnNumber; 
       } 

      } else if ("row".equals(name)) { 

       // We're onto a new row 
       System.out.println("nextrow"); 
       lastColumnNumber = -1; 
       System.out.println("yoooooo tuple:" + tuple); 
       if (isFirstRow) { 
        isFirstRow = false; 
        quantityOfColumns = data.getHeaders().size(); 
        tuple = new Tuple(quantityOfColumns); 

       } else if (!tuple.isEmpty()) { 
        data.addRow(tuple); 
        tuple = new Tuple(quantityOfColumns); 
       } 
      } 

     } 

     /** 
     * Captures characters only if a suitable element is open. Originally 
     * was just "v"; extended for inlineStr also. 
     */ 
     public void characters(char[] ch, int start, int length) 
       throws SAXException { 
      if (vIsOpen) { 
       value.append(ch, start, length); 
      } 
     } 

     /** 
     * Converts an Excel column name like "C" to a zero-based index. 
     * 
     * @param name 
     * @return Index corresponding to the specified name 
     */ 
     private int nameToColumn(String name) { 
      int column = -1; 
      for (int i = 0; i < name.length(); ++i) { 
       int c = name.charAt(i); 
       column = (column + 1) * 26 + c - 'A'; 
      } 
      return column; 
     } 

     public DataSet getData() { 
      return data; 
     } 
    } 

    /** 
    * The type of the data value is indicated by an attribute on the cell. The 
    * value is usually in a "v" element within the cell. 
    */ 
    enum xssfDataType { 

     BOOL, 
     ERROR, 
     FORMULA, 
     INLINESTR, 
     SSTINDEX, 
     NUMBER, 
    } 
} 

這裏是工作的XML例子,一個沒有工作的工作表:

http://www.file-upload.net/download-10909789/not_working.xml.html http://www.file-upload.net/download-10909790/working.xml.html

這裏的xlsx-files:

http://www.file-upload.net/download-10909802/not_working.xlsx.html http://www.file-upload.net/download-10909803/working.xlsx.html

謝謝!

+0

您能更具體地瞭解哪些文件不起作用嗎? – Gagravarr

+0

請證明2個示例xlsx文件,一個工作,另一個不是 –

+0

@Garavarr:恕我直言,應用程序由於某種原因不能識別行或其內容。我讓它打印startElement-Method的參數,它應該經過行,列,單元格等等: http://www.file-upload.net/download-10897984/output.txt.html – olkoza

回答

2

問題是,LibreOffice Calc將第一張工作表保存在「rId2」下,而MSOffice在「rId1」下這樣做。所以,現在我正在瀏覽sheetIds,直到解析了包含內容的工作表或找不到更多工作表。適用於這兩個文件:

private void parseFirstWorksheetWithContent(XSSFReader reader) throws IOException, InvalidFormatException, SAXException { 
    //Sheet-ID seems to differ, seems to be "rId2" for files saved by MS Excel and "rId1" for those saved by LibreOffice Calc 
    try { 
     for (int i = 1; handler.getData().isEmpty(); i++) { 
      parseSheet(reader, "rId" + i); 
     } 
    } catch (IllegalArgumentException e) { 
     //No more sheets, file empty 
    } 
} 

private void parseSheet(XSSFReader reader, String sheetId) throws InvalidFormatException, SAXException, IOException { 
     XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); 
     parser.setContentHandler(handler); 
     InputStream sheetStream = reader.getSheet(sheetId); 
     InputSource sheetSource = new InputSource(sheetStream); 
     parser.parse(sheetSource); 
     sheetStream.close(); 
    } 
+0

爲什麼不實際處理工作簿XML,並正確地查找工作表名稱或索引的ID,而不是僅僅猜測ID? – Gagravarr

+0

毫無疑問,這將是更好的解決方案。既然上面已經足夠了,但我會保持原樣(現在通過重複ID來改進)。感謝您的建議。 – olkoza

相關問題