2012-07-06 29 views
1

我正在處理XML文檔以提交給英國的HMRC。這些文件需要被處理並且爲它們生成一個哈希碼。Java XML Dom內存密集

我寫了下面的代碼,適用於小文檔。但是,如果我處理一個60MB的XML文件,它使用大約1.2GB的內存。

我已經看了一下,看看有沒有反正我可以使它更有效率,但看不到任何東西。它需要能夠刪除IRMark元素,如果它已經存在。

任何想法非常讚賞。謝謝。

/* 
** Generates the HMRC IRMARK as required. 
*/ 
import java.io.File; 
import java.io.FileNotFoundException; 
import java.io.FileOutputStream; 
import java.io.IOException; 
import java.io.PrintWriter; 
import java.io.StringWriter; 
import java.io.UnsupportedEncodingException; 
import java.security.MessageDigest; 
import java.security.NoSuchAlgorithmException; 

import javax.xml.parsers.DocumentBuilder; 
import javax.xml.parsers.DocumentBuilderFactory; 
import javax.xml.parsers.ParserConfigurationException; 
import javax.xml.transform.Transformer; 
import javax.xml.transform.TransformerFactory; 
import javax.xml.transform.dom.DOMSource; 
import javax.xml.transform.stream.StreamResult; 
import javax.xml.xpath.*; 

import org.apache.xml.security.Init; 
import org.apache.xml.security.c14n.CanonicalizationException; 
import org.apache.xml.security.c14n.Canonicalizer; 
import org.apache.xml.security.c14n.InvalidCanonicalizerException; 
import org.bouncycastle.util.encoders.Base64; 
import org.w3c.dom.Document; 
import org.w3c.dom.Element; 
import org.w3c.dom.Node; 
import org.xml.sax.SAXException; 


public class IRMarkDOS 
{ 
    /** 
    * @param args 
    */ 
    public static void main(String[] args) 
    { 

     // Initialise Apache XML tools 
     Init.init(); 

     // Start tracking execution time 
     long start = System.currentTimeMillis(); 

     try 
     { 

      // Validate/parse the command line 
      if (args.length != 3) 
      { 
       System.out.println("INCORRECT PARAMETERS SPECIFIED" + System.getProperty("line.separator") + " Specify IRMark.exe <InputFile> <OutputFile> " + "<TaxNamespace>"); 
       return; 
      } 

      // Set input/output variables 
      String sInput = args[0]; 
      String sOutput = args[1]; 
      String sTaxNamespace = args[2]; 


      // Read the XML Document 
      //Document xmlDoc = IRMarkDOS.processXML(sInput, sTaxNamespace); 
      String xml = IRMarkDOS.processXML(sInput, sTaxNamespace); 
      System.gc(); 


      // Generate the IRMark 
      String strIRMark = IRMarkDOS.generateIRMark(xml); 

      // Write to file 
      PrintWriter out = new PrintWriter(new FileOutputStream(sOutput)); 
      out.println(strIRMark); 
      out.close(); 
      System.out.println("IRmark64: " + strIRMark); 

      // Output execution time 
      long end = System.currentTimeMillis();   
      System.out.println("Execution Time " + ((end-start)/1000) + " seconds, " + (end-start) + " milliseconds"); 


     } 
     catch (RuntimeException ex) 
     { 
      System.out.println(ex.getMessage()); 
      System.exit(1); 
     } 
     catch (OutOfMemoryError ex) 
     { 
      System.out.println(ex.getMessage()); 
      System.exit(1);   
     } catch (FileNotFoundException e) { 
      // TODO Auto-generated catch block 
      e.printStackTrace(); 
     } 

    } 

    /** 
    * 
    * Processes the specified XML document 
    * 
    * @param sInput    - XML Document 
    * @param sTaxNamespace  - TaxNamespace 
    * @param sEnvelopeNamespace - EnvelopeNamespace 
    * @return 
    */ 
    private static String processXML (String sInput, String sTaxNamespace) 
    { 

     try 
     { 
      // Read XML 
      File xmlDocument=new File(sInput); 
      DocumentBuilderFactory xmlDomFactory = DocumentBuilderFactory.newInstance(); 
      DocumentBuilder xmlBuilder = xmlDomFactory.newDocumentBuilder(); 
      Document xmlDoc = xmlBuilder.parse(xmlDocument);      

      // Setup XPath   
      XPathFactory factory=XPathFactory.newInstance(); 
      XPath xPath=factory.newXPath();  

      Node body = (Node) xPath.evaluate("/GovTalkMessage/Body", xmlDoc, XPathConstants.NODE); 

      //Get IRMark and remove if exists 
      Node irmark = (Node) xPath.evaluate("/GovTalkMessage/Body/IRenvelope/IRheader/IRmark", xmlDoc, XPathConstants.NODE); 

      if (irmark != null) 
      { 
       System.out.println("Original IRMark: " + irmark.getTextContent()); 
       irmark.getParentNode().removeChild(irmark);    
      } 
      irmark = null; 


      // Create new doc for body and add envelope namespace to body as required 
      xmlDoc = null; 
      xmlDoc = xmlBuilder.newDocument(); 

      Node tmp = xmlDoc.importNode(body, true); 
      xmlDoc.appendChild(tmp); 
      tmp = null; 

      // Add namespace to body element   
      xmlDoc.getDocumentElement().setAttributeNS("http://www.w3.org/2000/xmlns/", "xmlns", "http://www.govtalk.gov.uk/CM/envelope"); 

      return IRMarkDOS.getOuterXml(xmlDoc); 

     } 
     catch (RuntimeException ex) 
     { 
      System.out.println(ex.getMessage()); 
      System.exit(1); 

     } catch (ParserConfigurationException e) { 
      e.printStackTrace(); 
     } catch (SAXException e) { 
      e.printStackTrace(); 
     } catch (IOException e) { 
      e.printStackTrace(); 
     } catch (XPathExpressionException e) { 
      e.printStackTrace(); 
     } 

     return null; 

    } 

    /** 
    * Generates the IRMark for the specified XML Document 
    * 
    * @param xmlDoc - XML Document to generate the IRMark for 
    * @return - The generated IRMark 
    */ 
    private static String generateIRMark(String bodyText1) 
    { 
     // Get XML string 
     //String bodyText1 = IRMarkDOS.getOuterXml(xmlDoc); 

     // Final Data Tweaks 
     bodyText1 = bodyText1.toString(); 
     bodyText1 = bodyText1.replace("&#xD;", ""); 
     bodyText1 = bodyText1.replace("\r\n", "\n"); 
     bodyText1 = bodyText1.replace("\r", "\n"); 

     try 
     { 
      // Convert the final document back into a byte array encoded as UTF8 
      byte[] bodyBytes = bodyText1.getBytes("UTF8"); 

      // Canonicalisation to C14n   
      Canonicalizer c14n = Canonicalizer.getInstance("http://www.w3.org/TR/2001/REC-xml-c14n-20010315"); 
      byte[] bodyCanonical = c14n.canonicalize(bodyBytes); 

      // Generate SHA 1 and convert to Base64 
      MessageDigest md1 = MessageDigest.getInstance("SHA");   
      md1.update(bodyCanonical); //bodyBytes 
      byte[] digest1 = md1.digest();   

      String strIRmark = new String(Base64.encode(digest1)); 
      return strIRmark; 
     } 
     catch (RuntimeException ex) 
     { 
      System.out.println(ex.getMessage()); 
      System.exit(1); 

     } catch (UnsupportedEncodingException e) { 
      e.printStackTrace(); 
     } catch (InvalidCanonicalizerException e) { 
      e.printStackTrace(); 
     } catch (CanonicalizationException e) { 
      e.printStackTrace(); 
     } catch (ParserConfigurationException e) { 
      e.printStackTrace(); 
     } catch (IOException e) { 
      e.printStackTrace(); 
     } catch (SAXException e) { 
      e.printStackTrace(); 
     } catch (NoSuchAlgorithmException e) { 
      e.printStackTrace(); 
     }  
     return ""; 
    } 


    /** 
    * Converts a XML Node to a string representation 
    * 
    * @param node - XML Node to convert to String 
    * @return - A string representation of the XML Node 
    */ 
    private static String getOuterXml(Node node) 
    { 
     try 
     { 
      Transformer transformer = TransformerFactory.newInstance().newTransformer(); 
      transformer.setOutputProperty("omit-xml-declaration", "yes"); 

      StringWriter writer = new StringWriter(); 
      transformer.transform(new DOMSource(node), new StreamResult(writer)); 
      return writer.toString(); 
     } 
     catch (Exception e) 
     { 
      e.printStackTrace(); 
     } 

     return ""; 

    } 

} 

回答

3

您可以嘗試使用SAX解析器並響應特定的元素打開和關閉事件。也許你可以完成你所需要做的事情,而無需一次以這種方式在內存中保存整個DOM。

+2

+1,這是正確的,但它讓我想起傑拉爾德溫伯格的報價解釋了「正義」如何轉化爲「去了很多麻煩」 – 2012-07-06 19:57:17

+0

不幸的是,Canonicalizer似乎是一個限制因素,因爲它似乎接受整個文檔(byte []或DOM,並且字節[]最有可能在內部轉換爲DOM)。 – jtahlborn 2012-07-06 20:24:05

0

我也建議你的processXML()方法如下修改:

private static String processXML (String sInput, String sTaxNamespace){ 
    Document xmlDoc = null; 
    try{ 
     /** 
     * Your processing logic 
     */ 
    }catch(Exception exe){ 
     //Catch your individual exceptions 
    }finally{ 
     if(xmlDoc != null){ 
      xmlDoc = null; 
      System.runFinalization(); 
      System.gc(); 
     } 
    } 
} 

這樣,您就可以確保您的文件可能會被垃圾收集完成後處理後同樣。

+0

這不適用於非常大的XML文件,不適合內存:) – GETah 2012-07-06 19:52:00

+0

你是對的@GETah - 我沒有考慮到。感謝您指出了這一點! – Sujay 2012-07-06 19:52:43

+0

當然沒問題:) – GETah 2012-07-06 19:53:51

0

DOM XML樹在處理小文件時很好,這是因爲當您使用DOM API加載XML文件時,會創建一棵樹來將整個文檔結構保存在內存中。處理大量XML數據時,應該使用XML streams。 XML流式傳輸使用最少量的內存,並能以一種簡單時尚的方式爲您提供讀取和處理XML數據的靈活性。