將某些東西保存到csv文件時遇到困難

我的程序完成了所有我想要的操作，但沒有將最終數據保存到csv文件中，我在它之前使用了一個打印來查看數據是否正確，只是沒有寫入csv文件，我使用'a'，因爲我不希望它重寫已寫入的內容，但它仍然返回錯誤。將某些東西保存到csv文件時遇到困難

這裏的部分代碼：

soup = BeautifulSoup(answer) 
        for table in soup.findAll('table', {"class":"formTable"}): 
         for row in table.findAll('tr'): 
          #heading = row.find('td', {"class":"sectionHeading"}) 
          #if heading is not None: 
           #print(heading.get_text()); 
          #else: 
          label = row.find('td', {"class":"fieldLabel"}) 
          data = row.find('td', {"class":"fieldData"}) 
          if data is not None and label is not None: 
             csvline += label.get_text() + "," + data.get_text() + "," 
        print(csvline) 
        #csvline.encode('utf-8') 
        with open ('output_file_two.csv', 'a', encoding='utf-8') as f: 
         writer = csv.writer(f) 
         writer.writerow(csvline)

這裏的錯誤：

Traceback (most recent call last): 
    File "C:\PROJECT\pdfs\final.py", line 95, in <module> 
    with open ('output_file_two.csv', 'a', encoding='utf-8') as f: 
TypeError: 'encoding' is an invalid keyword argument for this function

下面是需要

import shlex 
import subprocess 
import os 
import platform 
from bs4 import BeautifulSoup 
import re 
#import unicodecsv as csv 
import csv 
#import pickle 
import requests 
from robobrowser import RoboBrowser 
import codecs 

def rename_files(): 
    file_list = os.listdir(r"C:\\PROJECT\\pdfs") 
    print(file_list) 
    saved_path = os.getcwd() 
    print('Current working directory is '+saved_path) 
    os.chdir(r'C:\\PROJECT\\pdfs') 
    for file_name in file_list: 
     os.rename(file_name, file_name.translate(None, " ")) 
    os.chdir(saved_path) 
rename_files() 

def run(command): 
    if platform.system() != 'Windows': 
     args = shlex.split(command) 
    else: 
     args = command 
    s = subprocess.Popen(args, 
         stdout=subprocess.PIPE, 
         stderr=subprocess.PIPE) 
    output, errors = s.communicate() 
    return s.returncode == 0, output, errors 

# Change this to your PDF file base directory 
base_directory = 'C:\\PROJECT\\pdfs' 
if not os.path.isdir(base_directory): 
    print "%s is not a directory" % base_directory 
    exit(1) 
# Change this to your pdf2htmlEX executable location 
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py' 
if not os.path.isfile(bin_path): 
    print "Could not find %s" % bin_path 
    exit(1) 
for dir_path, dir_name_list, file_name_list in os.walk(base_directory): 
    for file_name in file_name_list: 
     # If this is not a PDF file 
     if not file_name.endswith('.pdf'): 
      # Skip it 
      continue 
     file_path = os.path.join(dir_path, file_name) 
     # Convert your PDF to HTML here 
     args = (bin_path, file_name, file_path) 
     success, output, errors = run("python %s -o %s.html %s " %args) 
     if not success: 
      print "Could not convert %s to HTML" % file_path 
      print "%s" % errors 
htmls_path = 'C:\\PROJECT' 
with open ('score.csv', 'w') as f: 
    writer = csv.writer(f) 
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path): 
     for file_name in file_name_list: 
      if not file_name.endswith('.html'): 
       continue 
      with open(file_name) as markup: 
       soup = BeautifulSoup(markup.read()) 
       text = soup.get_text() 
       match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s* 
       print(match) 
       writer.writerow(match) 
       for item in match: 
        data = item.split('/') 
        case_number = data[0] 
        case_year = data[1] 
        csvline = case_number + "," 

        browser = RoboBrowser() 
        browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA') 
        form = browser.get_forms()[0] # Get the first form on the page 
        form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number 
        form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year 

        browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit']) 

        # Use BeautifulSoup to parse this data 
        answer = browser.response.text 
        #print(answer) 
        soup = BeautifulSoup(answer) 
        for table in soup.findAll('table', {"class":"formTable"}): 
         for row in table.findAll('tr'): 
          #heading = row.find('td', {"class":"sectionHeading"}) 
          #if heading is not None: 
           #print(heading.get_text()); 
          #else: 
          label = row.find('td', {"class":"fieldLabel"}) 
          data = row.find('td', {"class":"fieldData"}) 
          if data is not None and label is not None: 
             csvline += label.get_text() + "," + data.get_text() + "," 
        print(csvline) 
        with open ('output_file_two.csv', 'a') as f: 
         writer = csv.writer(f) 
         writer.writerow(csvline)

編輯

的情況下，整個程序代碼

它的工作，這裏的工作

import shlex 
import subprocess 
import os 
import platform 
from bs4 import BeautifulSoup 
import re 
import unicodecsv as csv 
import requests 
from robobrowser import RoboBrowser 
import codecs 

def rename_files(): 
    file_list = os.listdir(r"C:\\PROJECT\\pdfs") 
    print(file_list) 
    saved_path = os.getcwd() 
    print('Current working directory is '+saved_path) 
    os.chdir(r'C:\\PROJECT\\pdfs') 
    for file_name in file_list: 
     os.rename(file_name, file_name.translate(None, " ")) 
    os.chdir(saved_path) 
rename_files() 

def run(command): 
    if platform.system() != 'Windows': 
     args = shlex.split(command) 
    else: 
     args = command 
    s = subprocess.Popen(args, 
         stdout=subprocess.PIPE, 
         stderr=subprocess.PIPE) 
    output, errors = s.communicate() 
    return s.returncode == 0, output, errors 


base_directory = 'C:\\PROJECT\\pdfs' 
if not os.path.isdir(base_directory): 
    print "%s is not a directory" % base_directory 
    exit(1) 

bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py' 
if not os.path.isfile(bin_path): 
    print "Could not find %s" % bin_path 
    exit(1) 
for dir_path, dir_name_list, file_name_list in os.walk(base_directory): 
    for file_name in file_name_list: 

     if not file_name.endswith('.pdf'): 

      continue 
     file_path = os.path.join(dir_path, file_name) 

     args = (bin_path, file_name, file_path) 
     success, output, errors = run("python %s -o %s.html %s " %args) 
     if not success: 
      print "Could not convert %s to HTML" % file_path 
      print "%s" % errors 
htmls_path = 'C:\\PROJECT' 
with open ('score.csv', 'w') as f: 
    writer = csv.writer(f) 
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path): 
     for file_name in file_name_list: 
      if not file_name.endswith('.html'): 
       continue 
      with open(file_name) as markup: 
       soup = BeautifulSoup(markup.read()) 
       text = soup.get_text() 
       match = re.findall("PA/(\S*)", text) 
       print(match) 
       writer.writerow(match) 
       for item in match: 
        data = item.split('/') 
        case_number = data[0] 
        case_year = data[1] 
        csvline = case_number + "," 

        browser = RoboBrowser() 
        browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA') 
        form = browser.get_forms()[0] 
        form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number 
        form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year 

        browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit']) 


        answer = browser.response.text 
        soup = BeautifulSoup(answer) 
        for table in soup.findAll('table', {"class":"formTable"}): 
         for row in table.findAll('tr'): 
          label = row.find('td', {"class":"fieldLabel"}) 
          data = row.find('td', {"class":"fieldData"}) 
          if data is not None and label is not None: 
           csvline += label.get_text() + "," + data.get_text() + "," 
           print(csvline) 
           my_file = codecs.open('final_output.csv', 'a', 'utf-8') 
           my_file.write(csvline)

來源

2017-05-05 fsgdfgsd

在最後的代碼中有你的代碼

writer = csv.writer(f) 
csv.writer(csvline) # here is the problem

看你初始化作家的一個問題，但你不使用它。

writer = csv.writer(f) 
writer.writerow(csvline)

來源

2017-05-05 08:16:23 VKolev

現在它promping另一個錯誤回溯（最近通話最後一個）：文件「C：\ PROJECT \ PDF文檔\ final.py」，線路103，在 writer.writerow（csvline） UnicodeEncodeError： 'ascii'編解碼器不能編碼字符u'\ xa0'在位置0：序號不在範圍內（128） – fsgdfgsd

什麼樣的錯誤？ – VKolev

你應該看看如何將數據編碼爲Unicode'csvline.encode（'utf-8'）'或者將文件設置爲utf-8編碼'，打開（'.... csv'，'w'，encoding = 'utf-8'）作爲f' – VKolev

這裏：

with open ('output_file_two.csv', 'a') as f: 
    writer = csv.writer(f) 
    csv.writer (csvline)

您instanciating一個csv.writer，但不使用它。這應該閱讀：

with open ('output_file_two.csv', 'a') as f: 
    writer = csv.writer(f) 
    writer.write(csvline)

現在有不少其他問題與您的代碼，第一個是手動創建「csvline作爲然後用csv.writer存儲到文件的文本。 csv.writer.write()需要行（元組）的列表，並負責正確地轉義需要轉義的內容，插入正確的分隔符等。它還有一個採用單個元組的方法，因此避免在內存FWIW中構建整個列表。

來源

2017-05-05 08:21:53

但是，當我試圖做你寫的它仍然提示有關編碼的錯誤，我試圖使用編碼uft-8，並提出了另一個錯誤 – fsgdfgsd

@Samuel這是另一個問題。您（顯然）必須先將行的文本部分編碼爲所需的編碼。如果您遇到問題，請首先了解unicode和編碼（現在需要的知識），然後發佈另一個問題（包含所有相關詳細信息），如果您仍有一些錯誤。 –

將某些東西保存到csv文件時遇到困難

回答

相關問題