2014-01-06 75 views
2

我怎樣才能用 JavaScript 手動把 PDF 文檔解析成一個單詞數組?我不關心圖像、數字、表格——只需要單詞,這樣我就可以把它當作 JavaScript 對象來使用。該怎麼做?有沒有把 PDF 解析成 JavaScript 數組的解析器?

+0

要那樣做服務器端或客戶端?如果客戶端,你如何收到文件?沒有跨域問題? –

+0

是,客戶端) –

+0

@RuzelDavletyarov你已經在你的網站上的PDF文件,還是用戶上傳嗎? – Cilan

回答

0

假設你已經有 PDF 格式的內容,你可以下載 pdftotext,然後使用以下代碼:

/**
 * PDF-to-text worker meant to run inside a frame.
 *
 * On construction it registers a "message" listener on `window`, logs
 * "Ready" and posts the string "ready" to the parent. When the parent posts
 * an ArrayBuffer, every page is rendered with the legacy pdf.js API
 * (`PDFJS.PDFDoc`, `page.startRendering`) and the text found in the
 * generated `.textLayer` elements is posted back to the parent as a string.
 */
function App() {
    var self = this;

    // Number of pages whose rendering callback has fired in the current run.
    this.complete = 0;

    /**
     * Renders all pages of a PDF and posts the extracted text to the parent.
     *
     * @param {ArrayBuffer} data - Raw PDF bytes handed to `PDFJS.PDFDoc`.
     */
    this.pdfToText = function (data) {
        var pdf = new PDFJS.PDFDoc(data);
        var total = pdf.numPages;

        // The original snippet appended canvases to a `div` it never defined.
        // Use a page-provided global `div` if there is one, otherwise <body>
        // (the same parent the text layers are appended to).
        var container = typeof div !== 'undefined' ? div : document.body;

        // Start every run from zero; otherwise a second PDF would never
        // satisfy the "all pages rendered" condition below.
        self.complete = 0;

        // Shared completion callback: acts only once the last page is done.
        function onPageRendered() {
            if (++self.complete !== total) {
                return;
            }
            self.setMessage("Finished rendering. Extracting text...");

            // NOTE(review): the 1s delay presumably gives pdf.js time to fill
            // the text layers after rendering — confirm before changing it.
            window.setTimeout(function () {
                var layers = [];
                var nodes = document.querySelectorAll(".textLayer > div");
                for (var j = 0; j < nodes.length; j++) {
                    layers.push(nodes[j].textContent + "\n");
                }
                self.sendOutput(layers.join("\n"));

                self.setMessage("Done!");
            }, 1000);
        }

        // `var i` keeps the loop index local instead of leaking a global.
        for (var i = 1; i <= total; i++) {
            var page = pdf.getPage(i);

            var canvas = document.createElement('canvas');
            canvas.id = 'page' + i;
            canvas.mozOpaque = true;
            container.appendChild(canvas);

            canvas.width = page.width;
            canvas.height = page.height;

            // Paint a white background before pdf.js draws the page.
            var context = canvas.getContext('2d');
            context.save();
            context.fillStyle = 'rgb(255, 255, 255)';
            context.fillRect(0, 0, canvas.width, canvas.height);
            context.restore();

            self.setMessage("Rendering...");

            // One text layer per page; its child <div>s are collected later.
            var textLayer = document.createElement('div');
            textLayer.className = 'textLayer';
            document.body.appendChild(textLayer);

            page.startRendering(context, onPageRendered, textLayer);
        }
    };

    /**
     * "message" event handler: accepts only ArrayBuffer-like data sent by
     * the parent window and starts the extraction.
     *
     * @param {MessageEvent} event - Event delivered by `postMessage`.
     */
    this.receiveInput = function (event) {
        if (event.source !== parent) {
            return;
        }
        // Also guards against null/undefined data, which used to throw.
        if (!event.data || !event.data.byteLength) {
            return alert("The PDF data needs to be an ArrayBuffer");
        }
        self.setMessage("Received data");
        self.pdfToText(event.data);
    };

    /**
     * Posts a string to the parent window (or to `parent.document` when only
     * that object exposes `postMessage`).
     *
     * @param {string} text - Payload to send.
     */
    this.sendOutput = function (text) {
        var recipient = parent.postMessage ? parent : (parent.document.postMessage ? parent.document : undefined);
        if (!recipient) {
            console.warn("No postMessage recipient available; output dropped");
            return;
        }
        // NOTE(review): target origin "*" delivers the text to any origin;
        // restrict it if the extracted content is sensitive.
        recipient.postMessage(text, "*");
    };

    /**
     * Logs a status message to the console as an array of its
     * space-separated words.
     *
     * @param {string} text - Status message.
     */
    this.setMessage = function (text) {
        console.log(text.split(' '));
    };

    window.addEventListener("message", self.receiveInput, true);
    self.setMessage("Ready");
    self.sendOutput("ready");
}

然後,將你的輸入框(input)的 onchange 設爲 App()。

DEMO (doesn't work in some browsers) 這將從PDF中記錄單詞數組(含標點符號)。

+0

不知爲何,我沒有取到文件中的文字 :(

0

使用pdf.js,我這樣做:

// Collect the text of page 1 of 'document.pdf' as an array of strings.
// NOTE(review): the code calls the `PDFJS` global, not the `pdf` binding
// returned by require(); presumably loading the module defines that global —
// confirm for the pdf.js build in use.
var pdf = require("pdf.js");
PDFJS.getDocument('document.pdf').then(function(doc){
    doc.getPage(1).then(function(page){
    page.getTextContent().then(function(txt){
     // The text items live on the resolved object (`txt.items`); the bare
     // `items` used before was an undefined name and threw a ReferenceError.
     var arrayOfText = txt.items.map(function(item){
     return item.str;
     });
    });
    });
});