2
我看到像 Okular 和 Evince 這樣的 PDF 閱讀器能夠很好地顯示 PDF 文檔(書)的索引,並鏈接到每個段落。它們是怎麼做到的?它們使用 poppler 庫,那麼我該怎麼用 poppler(或者其他一般的方法)提取那個索引?如何使用 poppler 提取 PDF 的索引/目錄?
我看到像 Okular 和 Evince 這樣的 PDF 閱讀器能夠很好地顯示 PDF 文檔(書)的索引,並鏈接到每個段落。它們是怎麼做到的?它們使用 poppler 庫,那麼我該怎麼用 poppler(或者其他一般的方法)提取那個索引?如何使用 poppler 提取 PDF 的索引/目錄?
下面這段代碼只會遍歷第一層(要處理更深的層級需要遞歸):
toc=document->toc();
QDomElement docElem = toc->documentElement();
QDomNode n = docElem.firstChild();
while(!n.isNull()) {
QDomElement e = n.toElement(); // try to convert the node to an element.
if(!e.isNull()) {
qDebug("elem %s\n",qPrintable(e.tagName())); // the node really is an element.
}
n = n.nextSibling();
}
以下示範如何在 Python 中使用 poppler 做到這一點:
import poppler
def walk_index(iterp, doc):
    """Print the title and page label of every outline entry, depth-first.

    Starts with the entry ``iterp`` currently points to, then visits each
    following sibling; the children of an entry are printed right after it.

    Args:
        iterp: Index iterator positioned on the first entry of a level.
        doc: Document used to resolve named destinations and to fetch pages.
    """
    while True:
        link = iterp.get_action()
        dest = link.dest
        # A named destination has to be looked up in the document; any other
        # destination is used as-is.
        if dest.named_dest:
            dest = doc.find_dest(dest.named_dest)
        # page_num counts from 1 while get_page() indexes from 0.
        page = doc.get_page(dest.page_num - 1)
        # Single formatted string: same output under Python 2 and Python 3.
        print("%s   %s" % (link.title, page.get_label()))
        child = iterp.get_child()
        if child:
            walk_index(child, doc)
        # Advance only after the current entry has been handled, so the first
        # entry of each level (and its subtree) is not skipped.
        if not iterp.next():
            break


def main():
    """Open the PDF and print its whole outline; returns 0."""
    # NOTE: path_to_pdf is not defined in this snippet; it must be supplied
    # before main() is called.
    uri = ("file:///"+path_to_pdf)
    doc = poppler.document_new_from_file(uri, None)
    iterp = poppler.IndexIter(doc)
    # walk_index() handles the first entry itself, so nothing is printed here.
    walk_index(iterp, doc)
    return 0


if __name__ == '__main__':
    main()
python-poppler 庫已過時,以下是改用 GObject(gi.repository)的做法:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Walk the table of contents and print titles and pages
import sys
from gi.repository import Poppler
def walk_index(iterp, doc):
    """Print title, page number and page label of every outline entry.

    Starts with the entry ``iterp`` currently points to, then visits each
    following sibling; the children of an entry are printed right after it.

    Args:
        iterp: Index iterator positioned on the first entry of a level.
        doc: Document used to resolve named destinations and to fetch pages.
    """
    while True:
        target = iterp.get_action().goto_dest
        dest = target.dest
        # A named destination has to be looked up in the document; any other
        # destination is used as-is.
        if dest.named_dest:
            dest = doc.find_dest(dest.named_dest)
        # page_num counts from 1 while get_page() indexes from 0.
        page = doc.get_page(dest.page_num - 1)
        # Single formatted string: same output under Python 2 and Python 3.
        print("%s %s %s" % (target.title, dest.page_num, page.get_label()))
        child = iterp.get_child()
        if child:
            walk_index(child, doc)
        # Advance only after the current entry has been handled, so the first
        # entry of each level (and its subtree) is not skipped.
        if not iterp.next():
            break


def main():
    """Print the outline of the PDF named on the command line.

    Returns:
        0 on success (including a PDF without an outline), 1 when no file
        argument was given.
    """
    import os

    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s FILE.pdf\n" % sys.argv[0])
        return 1
    # Build the URI from an absolute path so relative arguments work too.
    uri = "file://" + os.path.abspath(sys.argv[1])
    doc = Poppler.Document.new_from_file(uri, None)
    iterp = Poppler.IndexIter.new(doc)
    # No iterator means the document has no outline: nothing to print.
    if iterp is None:
        return 0
    # walk_index() handles the first entry itself, so nothing is printed here.
    walk_index(iterp, doc)
    return 0


if __name__ == '__main__':
    sys.exit(main())