在.so文件中使用C模塊時出現分段錯誤

在Linux下的命令行中運行我的python腳本時，出現了分段錯誤——可能是由長循環造成的。我確切地知道問題出在哪裏，但我不知道爲什麼會這樣。我嘗試了一些在網上搜索到的方法（包括這個網站上的），但仍然無法解決它。所以，請幫助我——預先感謝您。以下是一些代碼：

analyzer.py，程序從這裏開始：

from classify import BayesClassifier 
class Analyzer:
    """Entry-point wrapper that owns a trained ``BayesClassifier``.

    The classifier is exposed as the ``classify`` attribute.
    """

    # Training table loaded when the caller does not supply a path.
    DEFAULT_TRAINING_TABLE = '/home/user/yakamoz/srcanalyzer/classification/training.tab'

    def __init__(self, training_table=None):
        """Build the classifier from a training table.

        Args:
            training_table: Path handed to ``BayesClassifier``. When omitted,
                ``DEFAULT_TRAINING_TABLE`` is used, which keeps the previous
                zero-argument behaviour.
        """
        if training_table is None:
            training_table = self.DEFAULT_TRAINING_TABLE
        self.classify = BayesClassifier(training_table)

if __name__ == '__main__': 
    # Constructing the analyzer builds the BayesClassifier from the training table.
    a = Analyzer() 
    # The following is a string of Chinese characters which, according to the
    # author, has no influence on the segmentation fault; it can be treated
    # as an ordinary paragraph of text.

    text = "市委常委、紀委書記楊娟高度評價我縣基層黨風廉政建設：\ 
     務實創新成效顯著作者：縣紀委辦公室發佈時間：11月27日下午，\ 
     市委常委、紀委書記楊娟率領市紀委副書記蔣玉平、王友富，\ 
     市紀委常委、祕書長任斌，市紀委機關黨委書記劉林建一行來我\ 
     縣調研基層黨風廉政建設。調研中，楊娟高度評價我縣基層黨風廉政建設，\ 
     認爲工作務實創新，成效顯著。縣委書記陳朝先，縣委副書記季代雙，縣委常委、\ 
     紀委書記韓忠明陪同調研。楊娟一行先後來到兩河鎮、西部花都、兩江廣場、\ 
     工業園區等地實地調研我縣基層黨風廉政建設，檢閱我縣「兩化」互動、「三化」\ 
     聯動發展成果。查閱相關資料在兩河鎮，楊娟認真聽取了兩河片區紀工委\ 
     日常工作開展情況的彙報，仔細翻閱了巡查工作日記和接訪記錄。楊娟指出，\ 
     設置鄉鎮片區紀工委是加強基層紀檢組織建設的創新舉措。\ 
     鹽亭在全市率先設置、運行紀工委以來，在化解農村信訪矛盾，理順羣衆情緒，\ 
     強化基層辦案工作等方面取得了明顯成效。她要求，要總結提煉片區紀工委的經驗，\ 
     進一步明確職能職責，在機構設置、人員配備、制度建設等方面進行探索實踐，\ 
     爲全市基層紀檢組織建設提供有益經驗借鑑。楊娟還饒有興趣地參觀了兩河鎮\ 
     的機關廉政文化建設" 

    # classify_text returns a (label, score) pair; only the label is printed.
    print str(a.classify.classify_text(text)[0])

classify.py；該文件由上面給出的analyzer.py使用：

# -*- coding:utf-8 -*- 
from match import WordMatch 
import cPickle 
import math 

class BayesClassifier:
    """Bayes text classifier trained from a pickled training table.

    Every row of the training table is indexed as ``row[0]`` (a mapping of
    word -> count for one document) and ``row[1]`` (that document's class
    label). The accumulated counts are kept per instance, so two classifiers
    never share or double-count training data.
    """

    def __init__(self, table_name):
        """Load the training table, train on every row and build the matcher.

        Args:
            table_name: Path of the pickled training table.
        """
        self._reset_counts()
        # SECURITY: unpickling can execute arbitrary code; only load training
        # tables that come from a trusted source.
        # The open mode is left as it was; the ``with`` block closes the
        # handle, which was previously leaked.
        with open(table_name, 'r') as table_file:
            self.trainingtable = cPickle.load(table_file)
        for row in self.trainingtable:
            self.train(row[1], row[0])
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('training finished')
        self.matrix = self.get_matrix()
        self.vector_count = len(self.matrix)
        self.doc_count = len(self.trainingtable)
        self.match = WordMatch(self.matrix)

    def _reset_counts(self):
        """Create empty, instance-owned training accumulators."""
        self.__trainingdata = {}        # class -> {word: summed count}
        self.__classifywordscount = {}  # class -> summed count of all words
        self.__classifydoccount = {}    # class -> number of trained documents

    def get_matrix(self):
        """Return a dict mapping every word of the training table to 0."""
        matrix = {}
        for row in self.trainingtable:
            for word in row[0]:
                matrix[word] = 0
        return matrix

    def doc_to_vector(self, content):
        """Return a new dict of matched keyword -> number of occurrences."""
        return dict(self.match.find(content))

    def train(self, cls, vector):
        """Add one document's word counts to the totals of class ``cls``.

        Args:
            cls: Class label of the document.
            vector: Mapping of word -> count for the document.
        """
        word_totals = self.__trainingdata.setdefault(cls, {})
        self.__classifywordscount.setdefault(cls, 0)
        self.__classifydoccount[cls] = self.__classifydoccount.get(cls, 0) + 1

        for word, count in vector.items():
            self.__classifywordscount[cls] += count
            word_totals[word] = word_totals.get(word, 0) + count

    def classify_text(self, content):
        """Return ``(label, score)`` for the best-scoring class.

        When no class scores above the initial threshold the label is
        ``"unknown classification"`` and the score is the threshold itself.
        """
        best_score = -1 << 32
        best_cls = "unknown classification"
        # The keyword vector does not depend on the class, so the matcher is
        # run once instead of once per class.
        vector = self.doc_to_vector(content)
        for cls in self.__trainingdata:
            # NOTE(review): __count is not defined in the visible part of
            # this class; each call gets its own copy of the vector in case
            # it mutates its argument.
            prob = self.__count(cls, dict(vector))
            if prob > best_score:
                best_cls = cls
                best_score = prob
        return best_cls, best_score

match.py；這段代碼由classify.py使用：

# -*- coding:utf-8 -*- 
import os 
import re 
import util.ahocorasick.x64 as ahocorasick 
# util.ahocorasick.x64 is a folder where .so file locates 

class WordMatch(object):
    """Keyword matcher built on ``ahocorasick.KeywordTree``."""

    def __init__(self, arg):
        """Fill the keyword tree from ``arg`` and finalise it with ``make()``.

        Args:
            arg: Either a list/dict whose truthy items (for a dict, its keys)
                are added as keywords, or the path of a text file with one
                keyword per line (blank lines are skipped).
        """
        self.__tree = ahocorasick.KeywordTree()
        if isinstance(arg, (list, dict)):
            for item in arg:
                if item:
                    self.__tree.add(item)
        elif isinstance(arg, basestring):
            if not os.path.isfile(arg):
                # NOTE(review): on this path (and the 'parameter fault' path
                # below) make() is never called, so the object is returned
                # with an unfinalised tree. Kept as is for compatibility;
                # consider raising instead.
                print('the path of the input file does not exist')
                return
            # ``with`` closes the file even if reading or add() raises.
            with open(arg) as fp:
                for line in fp:
                    line = line.strip()
                    if line:
                        self.__tree.add(line)
        else:
            print('parameter fault')
            return
        self.__tree.make()

    def _findall(self, content):
        """Return the list of keywords found in ``content``.

        A non-string ``content`` prints a message and yields an empty list.
        """
        hit_list = []
        if not isinstance(content, basestring):
            print('AC automation requires string ')
            return hit_list
        for start, end in self.__tree.findall(content):
            hit = content[start:end]  # slice once, reuse for test and append
            if hit:
                hit_list.append(hit)
        return hit_list

    def find(self, content):
        """Return a dict of matched keyword -> number of occurrences."""
        counts = {}
        for item in self._findall(content):
            counts[item] = counts.get(item, 0) + 1
        return counts

__init__.py，位於util.ahocorasick.x64文件夾下，由match.py引用：

import _ahocorasick 

__all__ = ['KeywordTree'] 


# A high level version of the keyword tree. Most of the methods here 
# are just delegated over to the underlying C KeywordTree 
#(in the .so file, which is not shown here). 


class KeywordTree(object): 
    """High-level keyword tree that delegates to ``_ahocorasick.KeywordTree``."""

    def __init__(self): 
     """Create the underlying ``_ahocorasick.KeywordTree`` instance."""
     self.__tree = _ahocorasick.KeywordTree(); 


    def add(self, s): 
     """Forward ``s`` to the underlying tree's ``add`` and return its result."""
     return self.__tree.add(s) 


    def make(self): 
     """Forward to the underlying tree's ``make`` and return its result."""
     return self.__tree.make() 


    def zerostate(self): 
     """Return the underlying tree's ``zerostate()`` value."""
     return self.__tree.zerostate() 

    ##### !! I found this is where the segmentation fault occurs 

    def __findall_helper(self, sourceBlock, allow_overlaps, search_function): 
     """Yield ``match[0:2]`` for each successive hit of ``search_function``.

     ``search_function`` is called as ``search_function(sourceBlock,
     startpos, startstate)``; a falsy result ends the iteration. After a
     hit the next search starts at ``match[1]``. With ``allow_overlaps``
     the search resumes from the state in ``match[2]``; otherwise it
     restarts from ``self.zerostate()``.

     NOTE(review): if a hit ever ends at the position the search started
     from, ``startpos`` does not advance and this loop may never
     terminate -- confirm the extension cannot return such a hit.
     NOTE(review): the gdb backtrace reported with this code places the
     fault inside the C search routine called below, with an implausibly
     large ``n`` argument -- the cause is presumably in the extension
     rather than in this Python loop; confirm against the C source.
     """
     startpos = 0 
     startstate = self.zerostate() 
     loop_times = 0    # debug counter; only incremented, never read

     while True: 
      #print spot_1 
      match = search_function(sourceBlock, startpos, startstate) 
      #print spot_2 
      if not match: 
       break 
      yield match[0:2] 
      startpos = match[1] 
      if allow_overlaps: #which in my case is always false 
       startstate = match[2] 
      else: 
       loop_times = loop_times + 1 
       #print spot_3 
       startstate = self.zerostate() 
       #print spot_4 
       #print loop_times 

    def findall(self, sourceBlock, allow_overlaps=0): 
     """Return a generator over the hits of the underlying tree's ``search``."""
     return self.__findall_helper(sourceBlock, allow_overlaps,self.__tree.search)

讓我困惑的是不同的結果：我已經找到了問題所在，在 3 __init__.py，或更確切地說，在 __findall_helper(self, sourceBlock, allow_overlaps, search_function)。

通過取消註釋以下注釋之一：

#print spot_1 
#print spot_2 
#print spot_4

就可以消除分段錯誤，並且循環是有限的（match 可以是 None）；但取消註釋 #print spot_3 則不行（它似乎是一個無限循環）。這裏是我的問題：

print語句在python中有副作用嗎？我發現只有在上面提到的三個點（spot_1或spot_2或spot_4）之一中有一個print聲明可以排除故障。順便說一句，我偶然發現這個，起初沒有print。

這裏是使用gdb得到的backtrace。

(gdb) r analyzer.py 

Starting program: /usr/local/bin/python analyzer.py 
[Thread debugging using libthread_db enabled] 
Detaching after fork from child process 11499. 
training finished 

Program received signal SIGSEGV, Segmentation fault. 
0x00007ffff178956d in ahocorasick_KeywordTree_search_helper (state=0x85c730, 
string=0x8967d4 "【中國環保在線 市場行情】「我國將在2016年啓動全國碳市場。全國碳交 易市場的首批行業企業將由電力、冶金、有色、建材、化工5個傳統制造業和航", <incomplete  sequence \347\251>..., n=140733193395828, startpos=118366835, 
out_start=0x7fffffffd468, out_end=0x7fffffffd460,  out_last_state=0x7fffffffd458) at aho-corasick.c:216 
216 aho-corasick.c: No such file or directory. 
in aho-corasick.c 
Missing separate debuginfos, use: debuginfo-install glibc-2.12- 1.149.el6.x86_64 

(gdb) bt 

#0 0x00007ffff178956d in ahocorasick_KeywordTree_search_helper  (state=0x85c730, string=0x8967d4 "【中國環保在線 市場行情】「我國將在2016年啓動全國碳市場。全國碳交  易市場的首批行業企業將由電力、冶金、有色、建材、化工5個傳統制造業和航", <incomplete sequence \347\251>..., n=140733193395828, startpos=118366835, out_start=0x7fffffffd468, out_end=0x7fffffffd460, out_last_state=0x7fffffffd458) at aho-corasick.c:216 
#1 0x00007ffff178a2b1 in ahocorasick_KeywordTree_basesearch  (self=0x7ffff7f6c230, args=0x7ffff0ca1a50, kwargs=0x0, helper=0x7ffff1789525<ahocorasick_KeywordTree_search_helper>) at  py_wrapper.c:190 
#2 0x00007ffff178a358 in ahocorasick_KeywordTree_search (self=0x7ffff7f6c230, args=0x7ffff0ca1a50, kwargs=0x0) at py_wrapper.c:212 
#3 0x00000000004a7103 in call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4013 
#4 PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666 
#5 0x0000000000507e8d in gen_send_ex (gen=0x7904640, arg=0x0, exc=<value optimized out>) at Objects/genobject.c:84 
#6 0x00000000004a25da in PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2497 
#7 0x00000000004a805b in fast_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4099 
#8 call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4034 
#9 PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666 
#10 0x00000000004a8bd7 in PyEval_EvalCodeEx (co=0x7ffff1ff54b0, globals=<value optimized out>, locals=<value optimized out>, args=<value optimized out>, argcount=3, kws=0x9984520, kwcount=0, defs=0x7ffff2016968, defcount=1, closure=0x0) at Python/ceval.c:3253 
#11 0x00000000004a6dce in fast_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4109 
#12 call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4034 
#13 PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666 
#14 0x00000000004a805b in fast_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4099 
#15 call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4034 
#16 PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666 
#17 0x00000000004a8bd7 in PyEval_EvalCodeEx (co=0x7ffff7ec4130, globals=<value optimized out>, locals=<value optimized out>, args=<value optimized out>, argcount=0, kws=0x0, kwcount=0, defs=0x0, defcount=0, closure=0x0) at Python/ceval.c:3253 
#18 0x00000000004a8ce2 in PyEval_EvalCode (co=<value optimized out>, globals=<value optimized out>, locals=<value optimized out>) at Python/ceval.c:667 
#19 0x00000000004c91fe in run_mod (fp=0x880ee0, filename=0x7fffffffe30c "analyzer.py", start=<value optimized out>, globals=0x7fc140, locals=0x7fc140, closeit=1, flags=0x7fffffffdea0) at Python/pythonrun.c:1346 
#20 PyRun_FileExFlags (fp=0x880ee0, filename=0x7fffffffe30c "analyzer.py", start=<value optimized out>, globals=0x7fc140, locals=0x7fc140, closeit=1,flags=0x7fffffffdea0) at Python/pythonrun.c:1332 
#21 0x00000000004c9414 in PyRun_SimpleFileExFlags (fp=0x880ee0, filename=0x7fffffffe30c "analyzer.py", closeit=1, flags=0x7fffffffdea0)at Python/pythonrun.c:936 
#22 0x0000000000414a4f in Py_Main (argc=<value optimized out>, argv=<value optimized out>) at Modules/main.c:599 
#23 0x0000003fd281ed5d in __libc_start_main() from /lib64/libc.so.6 
#24 0x0000000000413bc9 in _start()

來源

2015-05-07 gongguichun

代碼量很大，我找不到任何明顯訪問'.so'的地方。你能給我們一個[最小的、完整的、可驗證的例子](http://stackoverflow.com/help/mcve)嗎？ – abarnert

要回答你的問題：不，「print」語句在Python中沒有副作用。 – gustafbstrom

你可以使用gdb獲得segfault發生位置的回溯嗎？運行'gdb $(which python)'並輸入'r analyzer.py'，當它停止時輸入'bt' – maxy

我看到

self.__tree = _ahocorasick.KeywordTree();

然後

self.__tree.zerostate()

最後

return self.__findall_helper(sourceBlock, allow_overlaps,self.__tree.search_long)

所以我的猜測是，當你這樣調用__tree.zerostate()時，函數search_long會失效，於是你會得到未定義的行爲，在某些情況下會導致段錯誤。這裏代碼很多，而且有一個不透明的庫，所以很難說清楚。最好的做法是去查閱文檔，並確保你正確地使用了這個庫。

print只是一個紅鯡魚（干擾項）：它因爲分配了一些內存，只是迫使崩潰更早發生。

希望它有幫助。

來源

2015-05-07 14:45:17 Sorin

我真的很感謝你的回答，正如你所說，'print'是一個紅鯡魚。但事實是，它確實對結果有影響（崩潰或不崩潰）。如果函數'self.__tree.search'失效，則無論是否存在'print'語句，結果都應該是segfault。但每次有'print'時，它都能正常工作，反之則會崩潰。這一點我可以肯定。 @Sorin – gongguichun

在.so文件中使用C模塊時出現分段錯誤

回答

相關問題