2015-05-02 38 views
-1

我正在嘗試編寫以下腳本,以讀取遠程服務器上大約 30 萬個文件。它通常工作正常,但只能處理到第 65 到 70 個文件;在此之後,它只會打印文件名稱,而不再處理任何內容。有人能指出我哪裡做錯了嗎?如何解決 Python 中 MySQL 查詢的問題?

import pymysql 
import pymysql.cursors 
import os 
import win32com.client 
from gensim.models import Word2Vec 
import nltk 
from nltk.corpus import stopwords 
import pyPdf 
from pyth.plugins.rtf15.reader import Rtf15Reader 
from pyth.plugins.plaintext.writer import PlaintextWriter 
import nltk 
import zipfile, re 
import time 

#READING DOC FILE FROM REMOTE LOCATION 
# Query returning one row per candidate that has a non-empty resume.  The
# ``ResumePath`` column carries the resume location; only the part from the
# first "/" onwards is used by the code below.
_RESUME_QUERY = (
    "SELECT candidateid,cnd.FirstName, cnd.LastName,"
    "Concat('\\xxx.xxx.x.xxx\\File\\Cand_Res/',orgguid,'/',"
    "DATE_FORMAT(cnd.createddate,'%Y%m'),'/',candidateguid,'/',Resume) "
    "as ResumePath from candidate cnd "
    "join mstorganization org on cnd.orgid = org.OrgId "
    "where Resume <> '' and Resume is not null "
    "order by cnd.modifieddate limit 100000"
)

# UNC prefix that every resume path is re-rooted under before it is opened.
_RESUME_SHARE_ROOT = '\\\\xxx.xxx.x.xxx\\Resumes\\Cand_Res'


def _resume_unc_path(resume_path):
    """Convert a ``ResumePath`` column value into a UNC path on the share.

    Raises ValueError when the value contains no "/" separator.
    """
    # Keep the tail starting at the first "/"; the original code sliced with
    # the string itself instead of this index, which raised TypeError.
    relative = resume_path[resume_path.index("/"):]
    # NOTE: written for Python 2.7 (the interpreter this script targets),
    # where encode() yields a ``str`` that can be concatenated directly.
    relative = relative.encode('ascii', 'ignore').replace("/", "\\")
    return _RESUME_SHARE_ROOT + relative


def _doc_words(path):
    """Return the ASCII words of a legacy Word document opened through COM."""
    doc = win32com.client.GetObject(path)
    try:
        return doc.Range().Text.encode('ascii', 'ignore').split()
    finally:
        # Close without saving so opened documents do not pile up across
        # thousands of files.
        # NOTE(review): the hosting Word process itself is not quit here --
        # confirm whether that is also needed in this environment.
        doc.Close(False)


def _docx_words(path):
    """Return the ASCII words of a .docx file by stripping its XML markup."""
    with zipfile.ZipFile(path) as archive:
        content = archive.read('word/document.xml').decode('utf-8')
    cleaned = re.sub('<(.|\n)*?>', '', content).encode('ascii', 'ignore')
    return cleaned.split()


def readfilesq9(n):
    """Read the Word resumes listed in the database and tokenize them.

    Runs the resume query, maps each ``ResumePath`` onto the resume share and
    extracts whitespace-separated ASCII words from every ``.doc``/``.docx``
    file.  Failures on an individual file are printed and skipped so one bad
    file does not abort the run.

    Args:
        n: Not used by this function; kept so existing callers keep working.

    Returns:
        A list with one list of words per successfully read document.
        (Earlier versions built this list but returned nothing.)
    """
    connection = pymysql.connect(host='xxx.xxx.x.xxx',
                                 user='abcd',
                                 passwd='pwd1',
                                 db='rep_db',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute(_RESUME_QUERY)
            rows = cursor.fetchall()
    finally:
        # The rows are fully fetched, so release the connection before the
        # long-running file loop instead of holding it open (or leaking it).
        connection.close()

    documents = []
    for row in rows:
        try:
            # Look the column up by name: positional access into
            # dict.items() depends on arbitrary dictionary ordering.
            resume_path = row['ResumePath']
            print(resume_path)
            file_name = _resume_unc_path(resume_path)
            # Per-file pause retained from the original script.
            time.sleep(1)
            print("Path1: %s" % file_name)
            print("Path: %s" % file_name)

            # Decide on the real file extension (case-insensitively) rather
            # than on a ".doc" substring anywhere in the path.
            extension = os.path.splitext(file_name)[1].lower()
            if extension == ".docx":
                documents.append(_docx_words(file_name))
            elif extension.startswith(".doc"):
                try:
                    documents.append(_doc_words(file_name))
                except Exception as exc:
                    print("DOC ISSUE: %r" % (exc,))
            else:
                print("It is not a Doc file")
        except Exception as exc:
            # Report what actually failed instead of hiding it.
            print("OOPS1: %r" % (exc,))
    return documents

我在 Enthought Canopy 上使用 Python 2.7.6。這不是我的默認 Python;我的默認 Python 位於「C:\Python27」。我正在使用 MySQL 和 Windows 7 Professional。如有任何縮進錯誤,敬請見諒。

回答

0

嘗試在 while 循環中使用 cursor.fetchone(),而不是使用 cursor.fetchall(),這樣你就可以一次取一行,並在沒有更多結果時 break。我沒有查看 pymysql 的文檔,但我認爲當沒有更多結果時,cursor.fetchone() 會返回 None。如果這不起作用,請直接在 MySQL 中運行您的查詢並驗證輸出。