import codecs, os
import re
import string
import mysql
import mysql.connector
y_ = ""
'''Searching and reading text files from a folder.'''
for root, dirs, files in os.walk("/Users/ultaman/Documents/PAN dataset/Pan Plagiarism dataset 2010/pan-plagiarism-corpus-2010/source-documents/test1"):
    for file in files:
        if file.endswith(".txt"):
            x_ = codecs.open(os.path.join(root, file), "r", "utf-8-sig")
            for lines in x_.readlines():
                y_ = y_ + lines
'''Tokenizing the sentences of the text file.'''
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(y_)
tokenized_docs = [sent_tokenize(y_) for sent in raw_docs]
'''Removing punctuation marks.'''
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = ''
for review in tokenized_docs:
    new_review = ''
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review += new_token
    tokenized_docs_no_punctuation += new_review
print(tokenized_docs_no_punctuation)
'''Connecting and inserting tokenized documents without punctuation in database field.'''
def connect():
    for i in range(len(tokenized_docs_no_punctuation)):
        conn = mysql.connector.connect(user='root', password='', unix_socket="/tmp/mysql.sock", database='test')
        cursor = conn.cursor()
        cursor.execute("""INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""", (cursor.lastrowid, tokenized_docs_no_punctuation[i]))
        conn.commit()
        conn.close()

if __name__ == '__main__':
    connect()
After running the above code, the result in the database looks like this:
| 2  | S | N |
| 3  | S | o |
| 4  | S |   |
| 5  | S | d |
| 6  | S | o |
| 7  | S | u |
| 8  | S | b |
| 9  | S | t |
| 10 | S |   |
| 11 | S | m |
| 12 | S | y |
| 13 | S |   |
| 14 | S | d |
That is, I am getting single letters in the database instead of whole sentences after applying nltk's sentence tokenizer in Python 3.5.1.
It should look like this:

| 1 | S | No doubt, my dear friend. |
| 2 | S | no doubt.                 |
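For reference, here is a minimal, untested sketch of how the tokenizing and insert steps could be reworked so that one row per sentence is stored instead of one row per character. It assumes the same splitted_sentences table, column names, and connection settings as in the code above; the sample text assigned to y_ and the explicit counter used for sentence_id (replacing cursor.lastrowid) are my own assumptions, not part of the original code.

import re
import string
import mysql.connector
from nltk.tokenize import sent_tokenize

y_ = "No doubt, my dear friend. no doubt."   # sample text; in the real script y_ is built from the .txt files

regex = re.compile('[%s]' % re.escape(string.punctuation))

sentences = sent_tokenize(y_)                      # one list entry per sentence
cleaned = [regex.sub('', s) for s in sentences]    # strip punctuation inside each sentence
cleaned = [s for s in cleaned if s]                # drop sentences that became empty

conn = mysql.connector.connect(user='root', password='',
                               unix_socket="/tmp/mysql.sock", database='test')
cursor = conn.cursor()
for i, sentence in enumerate(cleaned, start=1):    # explicit counter for sentence_id (assumption)
    cursor.execute(
        "INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)",
        (i, sentence))
conn.commit()
conn.close()

Because the insert loop walks over a list of sentences rather than over the characters of one concatenated string, each execute() call receives a whole sentence as the second parameter.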