-1
我正在嘗試編寫以下腳本以調用大約30萬個來自遠程服務器的文件。 它通常工作正常,但只能工作到65到70個文件。在此之後,打印文件名稱並不處理任何東西只是 。如果有人可能會好好建議我 做錯了什麼?如何解決Python中MySql查詢的問題
import pymysql
import pymysql.cursors
import os
import win32com.client
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import pyPdf
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter
import nltk
import zipfile, re
import time
#READING DOC FILE FROM REMOTE LOCATION
def readfilesq9(n):
connection = pymysql.connect(host='xxx.xxx.x.xxx',
user='abcd',
passwd='pwd1',
db='rep_db',
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
list1=[]
with connection.cursor() as cursor:
# Read a single record
sql = "SELECT candidateid,cnd.FirstName, cnd.LastName,Concat('\\xxx.xxx.x.xxx\File\Cand_Res/',orgguid,'/',DATE_FORMAT(cnd.createddate,'%Y%m'),'/',candidateguid,'/',Resume) as ResumePath from candidate cnd join mstorganization org on cnd.orgid = org.OrgId where Resume <> '' and Resume is not null order by cnd.modifieddate limit 100000"
cursor.execute(sql)
result = cursor.fetchall()
#print result
#list1=[]
for i in result:
try:
#print i
item_1=i.items()
item_2=item_1[2][1]
print item_2
item_3=item_2.index("/")
file1=item_2[item_2:]
string1='\\\\xxx.xxx.x.xxx\\Resumes\\Cand_Res'
file1e=file1.encode('ascii', 'ignore')
urls=file1e.replace("/","\\")
file_full=string1+urls
time.sleep(1)
#osp="C:\\Python27"
os1=os.path.abspath(os.curdir)
osp2=os.path.join(os1,file_full)
print "Path1:",osp2
file_name1=osp2
print "Path:",file_name1
#IDENTIFICATION OF FILE KIND
#DOC CONVERSION
if ".doc" in file_name1:
#EXTRACTING ONLY .DOC FILES
if ".docx" not in file_name1:
#print "It is A Doc file$$:",file_name
try:
doc = win32com.client.GetObject(file_name1)
text = doc.Range().Text
text1=text.encode('ascii','ignore')
text_word=text1.split()
#print "The Text Word is:",text_word
#print "Text for Document File Is:",text1
list1.append(text_word)
#print "List for Doc File Is:",list3
#print "It is a Doc file"
except:
print "DOC ISSUE"
#EXTRACTING ONLY .DOCX FILES
elif ".docx" in file_name1:
#print "It is DOCX FILE:",file_name
docx1=zipfile.ZipFile(file_name1)
content = docx1.read('word/document.xml').decode('utf-8')
cleaned = re.sub('<(.|\n)*?>','',content).encode('ascii','ignore')
cleaned_word=cleaned.split()
#print "The Cleaned Document Is:",cleaned
list1.append(cleaned_word)
#print "List for DocX file Is:",list4
else:
print "NONE1"
else:
print "It is not a Doc file"
except:
print "OOPS1"
我在Enthought Canopy上使用Python2.7.6。這不是我的默認Python。我的默認Python位於「C:\ Python27」位置。 我正在使用MySql和Windows 7 Professional。對於任何縮進錯誤,道歉。