2013-10-13 53 views
0
#!/usr/bin/env python 

import threading 
import urllib, sys,os 
import Queue 


concurrent = 200 
queue = Queue.Queue(concurrent*2) 

try: 
    aim = sys.argv[1].lower() 
    dic = open(sys.argv[2],'r') 

except: 
    print "Usage: %s url wordlist" % sys.argv[0] 
    sys.exit(1) 

class Scanner(threading.Thread): 
    def __init__(self,queue): 
     threading.Thread.__init__(self) 
     self.queue=queue 

    def run(self): 

     while True: 

      self.path = self.queue.get() 
      self.geturl = urllib.urlopen(aim+'/'+self.path) 
      self.status = self.geturl.getcode() 
      self.url = aim+self.path 
      self.result = self.url+'=>'+str(self.status) 
      print self.result 
      self.writeresult(self.result) 
      self.queue.task_done() 



    def writeresult(self,result): 

     fp = open('result.txt','a+') 
     fp.write(result+'\n') 
     fp.close() 


def main():   

    for i in range(concurrent): 
     t = Scanner(queue) 
     t.setDaemon(True) 
     t.start() 

    for path in dic.readlines(): 
     queue.put(path.strip()) 

    queue.join() 

if __name__ == '__main__': 
    main() 

這是一個Python程序掃描網站的目錄,當掃描完成, 它甚至不能與CTRL + C 我想知道當它完成掃描如何自動退出程序退出。如何在所有線程完成後退出程序?

,當它在工藝,它也會出現一些這樣的問題:

Exception in thread Thread-130: 
Traceback (most recent call last): 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 551, in __bootstrap_inner 
    self.run() 
    File "tt.py", line 28, in run 
    self.geturl = urllib.urlopen(aim+'/'+self.path) 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 86, in urlopen 
    return opener.open(url) 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 207, in open 
    return getattr(self, name)(url) 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 344, in open_http 
    h.endheaders(data) 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 954, in endheaders 
    self._send_output(message_body) 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 814, in _send_output 
    self.send(msg) 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 776, in send 
    self.connect() 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 757, in connect 
    self.timeout, self.source_address) 
    File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 553, in create_connection 
    for res in getaddrinfo(host, port, 0, SOCK_STREAM): 
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known 
+0

由於您的Scanner.Run方法停留在無限循環(「while True:」),您爲什麼會期望您的程序退出? – selbie

回答

0

我想要一些練習,所以我嘗試了這一點,並改變了很多。它會給你帶來一整套結果嗎?你將需要用你原來的參數閱讀替換路徑。

  • 有了這些線程,也許你會得到未處理的異常導致缺少結果?我添加了一種機制,在讀取過程中捕獲任何錯誤並將其傳遞給結果編寫者。
  • 我想從多個線程附加到文件是好的,但我添加了一個作家線程更乾淨管理文件
  • 大部分分配給自己的是不必要的
  • ,如果你仍然得到插槽錯誤,請檢查在結果文件和路徑看你想怎麼處理這些結果,如果在所有
  • 我不是專家,所以不要把這個作爲最佳實踐

import threading 
import urllib 
import Queue 

concurrent = 5 

aim = 'http://edition.cnn.com' 
paths = ['2013/10/12/opinion/kazin-tea-party/index.html?hpt=hp_t5', 
     '2013/10/11/opinion/opinion-hay-nobel-opcw/index.html?hpt=hp_t5', 
     '2013/10/11/opinion/rosin-women-in-charge/index.html?hpt=hp_t5', 
     'some invalid path', 
     '2013'] # also an invalid path 


def main(): 
    work_q = Queue.Queue() 
    result_q = Queue.Queue() 

    # start the scanners and the result writer 
    scanners = [Scanner(work_q, result_q) for i in range(concurrent)] 
    for s in scanners: 
     s.start() 
    results_file_path = 'results.txt' 
    result_writer = ResultWriter(result_q, 'results.txt') 
    result_writer.start() 
    # send all the work and wait for it to be completed 
    for path in paths: 
     work_q.put(path.strip()) 
    work_q.join() 
    # tell everyone to stop 
    # you could just kill the threads but you writer needs to close the file 
    for s in scanners: 
     work_q.put(Scanner.STOP_TOKEN) 
    result_q.put(ResultWriter.STOP_TOKEN) # make sure file gets closed 
    # wait for everyone to actually stop 
    for s in scanners: 
     s.join() 
    result_writer.join() 
    print 'the scan has finished and results are in {}'.format(results_file_path) 


class Scanner(threading.Thread): 
    STOP_TOKEN = '<<stop>>' 

    def __init__(self, work_q, result_q): 
     threading.Thread.__init__(self) 
     self.work_q = work_q 
     self.result_q = result_q 

    def run(self): 
     while True: 
      path = status = None # reset in case of error 
      try: 
       try: 
        path = self.work_q.get(timeout=0.00001) 
       except Queue.Empty: 
        continue 
       if path == self.STOP_TOKEN: 
        break # stop looking for work 
       get_url = urllib.urlopen(aim + '/' + path) 
       status = get_url.getcode() 
      except Exception as e: 
       status = 'unhandled error ({})'.format(e) 
      self.result_q.put((path, status)) 
      self.work_q.task_done() 


class ResultWriter(threading.Thread): 
    STOP_TOKEN = '<<stop>>' 

    def __init__(self, result_q, results_file_path): 
     threading.Thread.__init__(self) 
     self.result_q = result_q 
     self.results_file_path = results_file_path 

    def run(self): 
     with open(self.results_file_path, 'w') as results_file: 
      while True: 
       try: 
        result = self.result_q.get(timeout=0.00001) 
       except Queue.Empty: 
        continue 
       if result == self.STOP_TOKEN: 
        break # stop looking for results 
       path, status = result 
       results_file.write('{}=>{}\n'.format(path, status)) 


if __name__ == '__main__': 
    main() 
+0

您是專家,謝謝 – user2876146

0

程序,因爲它是,當所有線程都完成,將關閉。 但輕鬆擺脫所有這些錯誤,在你的函數運行,從類中,而真正的後:聖誕老人,放在一個嘗試,隨後的一切:除了:子句這樣

try: 
    code 
except: 
    pass 

它不完全最乾淨的方式來做到這一點,但考慮到你以後的工作,它會完成這項工作,並且會讓你擺脫這些例外,這意味着某些URL已超時。

+0

我已經騎上了這些例外,但是在我的path.txt中,有730個目錄可以掃描,但是在result.txt中只有660個,你能幫我解決嗎?並且我想打印「掃描已完成」之類的內容,並在掃描完成後退出程序。 – user2876146

相關問題