2017-06-16 95 views
1

我想把一個RTF字符串轉換爲純文本,而且不使用任何第三方庫,如何用正則表達式實現?

# Sample RTF document. It must be a raw string literal: in a normal literal
# Python would interpret sequences such as \r, \f, \a, \b and \v as control
# characters, and "\uc1\p..." is rejected as a truncated \uXXXX escape.
rtfstring = r'{\rtf1\ansi\ansicpg1252\deff0\deflang1033{\fonttbl{\f0 Arial;}}{\colortbl;\red255\green0\blue0;\red1\green1\blue1;}\viewkind4\uc1\pard\f0\fs18 {\b Amount/complexity of data to be reviewed:\b0}\par{- Review and summarization of old records}\par}'

純文本將是

# Plain text the asker wants to obtain from `rtfstring` once the RTF markup is removed.
Plaintext = "Amount/complexity of data to be reviewed:- Review and summarization of old records" 
+0

還有一點:我不想先把它保存爲 .rtf 文件再轉換爲純文本,因爲我的數據庫中還有很多以字符串形式存儲的此類數據。 –

回答

2

來源:在 Python 中用正則表達式將 RTF 字符串轉換爲純文本 - StackOverflow 問題

要運行就叫 -

>>> text = "Whatever your rtf text goes here" 
>>> striprtf(text) 

碼 -

# -*- coding: utf-8 -*- 

""" 
Extract text in RTF Files. Refactored to use with Python 3.x 
Source: 
    http://stackoverflow.com/a/188877 
Code created by Markus Jarderot: http://mizardx.blogspot.com 
""" 

import re 


# Tokenizer for RTF input. Alternatives, in order: control word with optional
# numeric argument and one delimiting space; \'xx hex escape; control symbol;
# group brace; raw line breaks (no capture group, so they are dropped); any
# other single character.
_RTF_TOKEN = re.compile(
    r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
    re.I,
)

# Control words which specify a "destination"; the text of the group they
# appear in is not part of the document body and is skipped.
_RTF_DESTINATIONS = frozenset((
    'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', 'atndate', 'atnicn', 'atnid',
    'atnparent', 'atnref', 'atntime', 'atrfend', 'atrfstart', 'author', 'background',
    'bkmkend', 'bkmkstart', 'blipuid', 'buptim', 'category', 'colorschememapping',
    'colortbl', 'comment', 'company', 'creatim', 'datafield', 'datastore', 'defchp', 'defpap',
    'do', 'doccomm', 'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'falt',
    'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'ffformat', 'ffhelptext', 'ffl',
    'ffname', 'ffstattext', 'field', 'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype',
    'fname', 'fontemb', 'fontfile', 'fonttbl', 'footer', 'footerf', 'footerl', 'footerr',
    'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'generator', 'gridtbl',
    'header', 'headerf', 'headerl', 'headerr', 'hl', 'hlfr', 'hlinkbase', 'hlloc', 'hlsrc',
    'hsv', 'htmltag', 'info', 'keycode', 'keywords', 'latentstyles', 'lchars', 'levelnumbers',
    'leveltext', 'lfolevel', 'linkval', 'list', 'listlevel', 'listname', 'listoverride',
    'listoverridetable', 'listpicture', 'liststylename', 'listtable', 'listtext',
    'lsdlockedexcept', 'macc', 'maccPr', 'mailmerge', 'maln', 'malnScr', 'manager', 'margPr',
    'mbar', 'mbarPr', 'mbaseJc', 'mbegChr', 'mborderBox', 'mborderBoxPr', 'mbox', 'mboxPr',
    'mchr', 'mcount', 'mctrlPr', 'md', 'mdeg', 'mdegHide', 'mden', 'mdiff', 'mdPr', 'me',
    'mendChr', 'meqArr', 'meqArrPr', 'mf', 'mfName', 'mfPr', 'mfunc', 'mfuncPr', 'mgroupChr',
    'mgroupChrPr', 'mgrow', 'mhideBot', 'mhideLeft', 'mhideRight', 'mhideTop', 'mhtmltag',
    'mlim', 'mlimloc', 'mlimlow', 'mlimlowPr', 'mlimupp', 'mlimuppPr', 'mm', 'mmaddfieldname',
    'mmath', 'mmathPict', 'mmathPr', 'mmaxdist', 'mmc', 'mmcJc', 'mmconnectstr',
    'mmconnectstrdata', 'mmcPr', 'mmcs', 'mmdatasource', 'mmheadersource', 'mmmailsubject',
    'mmodso', 'mmodsofilter', 'mmodsofldmpdata', 'mmodsomappedname', 'mmodsoname',
    'mmodsorecipdata', 'mmodsosort', 'mmodsosrc', 'mmodsotable', 'mmodsoudl',
    'mmodsoudldata', 'mmodsouniquetag', 'mmPr', 'mmquery', 'mmr', 'mnary', 'mnaryPr',
    'mnoBreak', 'mnum', 'mobjDist', 'moMath', 'moMathPara', 'moMathParaPr', 'mopEmu',
    'mphant', 'mphantPr', 'mplcHide', 'mpos', 'mr', 'mrad', 'mradPr', 'mrPr', 'msepChr',
    'mshow', 'mshp', 'msPre', 'msPrePr', 'msSub', 'msSubPr', 'msSubSup', 'msSubSupPr', 'msSup',
    'msSupPr', 'mstrikeBLTR', 'mstrikeH', 'mstrikeTLBR', 'mstrikeV', 'msub', 'msubHide',
    'msup', 'msupHide', 'mtransp', 'mtype', 'mvertJc', 'mvfmf', 'mvfml', 'mvtof', 'mvtol',
    'mzeroAsc', 'mzeroDesc', 'mzeroWid', 'nesttableprops', 'nextfile', 'nonesttables',
    'objalias', 'objclass', 'objdata', 'object', 'objname', 'objsect', 'objtime', 'oldcprops',
    'oldpprops', 'oldsprops', 'oldtprops', 'oleclsid', 'operator', 'panose', 'password',
    'passwordhash', 'pgp', 'pgptbl', 'picprop', 'pict', 'pn', 'pnseclvl', 'pntext', 'pntxta',
    'pntxtb', 'printim', 'private', 'propname', 'protend', 'protstart', 'protusertbl', 'pxe',
    'result', 'revtbl', 'revtim', 'rsidtbl', 'rxe', 'shp', 'shpgrp', 'shpinst',
    'shppict', 'shprslt', 'shptxt', 'sn', 'sp', 'staticval', 'stylesheet', 'subject', 'sv',
    'svb', 'tc', 'template', 'themedata', 'title', 'txe', 'ud', 'upr', 'userprops',
    'wgrffmtfilter', 'windowcaption', 'writereservation', 'writereservhash', 'xe', 'xform',
    'xmlattrname', 'xmlattrvalue', 'xmlclose', 'xmlname', 'xmlnstbl',
    'xmlopen',
))

# Control words that translate directly into a piece of output text.
_RTF_SPECIAL_CHARS = {
    'par': '\n',
    'sect': '\n\n',
    'page': '\n\n',
    'line': '\n',
    'tab': '\t',
    'emdash': '\u2014',
    'endash': '\u2013',
    'emspace': '\u2003',
    'enspace': '\u2002',
    'qmspace': '\u2005',
    'bullet': '\u2022',
    'lquote': '\u2018',
    'rquote': '\u2019',
    'ldblquote': '\u201C',
    'rdblquote': '\u201D',
}


def striprtf(text):
    """Return the plain text contained in an RTF document.

    The input is tokenized with a single regular expression and walked once.
    Text inside destination groups (font table, color table, pictures, ...)
    and inside ``\\*`` groups is dropped, formatting control words are
    ignored, and a small set of control words (``\\par``, ``\\tab``,
    ``\\emdash``, ...) is translated to the corresponding characters.

    Args:
        text: The RTF markup as ``str``. ``bytes`` and ``bytearray`` are also
            accepted and are decoded with the default codec (UTF-8) first.

    Returns:
        The extracted text as ``str``. ``\\par`` and ``\\line`` become
        ``"\\n"``; ``\\sect`` and ``\\page`` become ``"\\n\\n"``.

    Raises:
        UnicodeDecodeError: If ``text`` is bytes that are not valid UTF-8.
    """
    if isinstance(text, (bytes, bytearray)):
        text = text.decode()

    stack = []         # Saved (ucskip, ignorable) pairs, one per open group.
    ignorable = False  # Whether this group (and all inside it) is ignorable.
    ucskip = 1         # Fallback characters to skip after a \uN escape.
    curskip = 0        # Fallback characters still left to skip.
    out = []           # Output buffer.

    for match in _RTF_TOKEN.finditer(text):
        word, arg, hexcode, char, brace, tchar = match.groups()

        if brace:
            curskip = 0
            if brace == '{':
                # Entering a group: remember the state to restore on exit.
                stack.append((ucskip, ignorable))
            elif stack:
                # Leaving a group. An unbalanced '}' is tolerated (ignored)
                # instead of raising IndexError on malformed input.
                ucskip, ignorable = stack.pop()

        elif char:  # \x (control symbol, not a letter)
            curskip = 0
            if char == '~':
                if not ignorable:
                    out.append('\xA0')  # non-breaking space
            elif char in '{}\\':
                if not ignorable:
                    out.append(char)  # escaped literal brace or backslash
            elif char == '*':
                # "\*" marks the enclosing group as an ignorable destination.
                ignorable = True

        elif word:  # \foo (control word)
            curskip = 0
            if word in _RTF_DESTINATIONS:
                ignorable = True
            elif not ignorable:
                if word in _RTF_SPECIAL_CHARS:
                    out.append(_RTF_SPECIAL_CHARS[word])
                elif word == 'uc':
                    # A bare "\uc" without a number carries no information.
                    if arg is not None:
                        ucskip = int(arg)
                elif word == 'u' and arg is not None:
                    code = int(arg)
                    if code < 0:
                        # Negative arguments encode code points above 0x7FFF
                        # as signed 16-bit values.
                        code += 0x10000
                    if 0 <= code <= 0x10FFFF:
                        out.append(chr(code))
                        curskip = ucskip
                    # Otherwise the escape is invalid: emit nothing and leave
                    # curskip at 0 so the fallback text is kept.

        elif hexcode:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(chr(int(hexcode, 16)))

        elif tchar:  # ordinary character
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)

    return ''.join(out)
+2

你的代碼工作正常,但在 Python 3.6 下運行時,必須把 pattern.finditer(text.decode()) 中的 text.decode() 改爲 text。 –