无法将某些内容保存到csv文件中

时间:2017-05-05 08:10:11

标签: python csv export

我的程序完成了我想要的所有操作,但是没有将最终数据保存到csv文件中,我在它之前使用了一个打印来查看数据是否正确而且它是,它只是没有写入csv文件,我使用 soup = BeautifulSoup(answer) for table in soup.findAll('table', {"class":"formTable"}): for row in table.findAll('tr'): #heading = row.find('td', {"class":"sectionHeading"}) #if heading is not None: #print(heading.get_text()); #else: label = row.find('td', {"class":"fieldLabel"}) data = row.find('td', {"class":"fieldData"}) if data is not None and label is not None: csvline += label.get_text() + "," + data.get_text() + "," print(csvline) #csvline.encode('utf-8') with open ('output_file_two.csv', 'a', encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(csvline) 是因为我不想让它重写已编写的内容,但它仍然会返回错误。

这是代码的一部分:

Traceback (most recent call last):
  File "C:\PROJECT\pdfs\final.py", line 95, in <module>
    with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function

这是错误:

import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
#import unicodecsv as csv
import csv
#import pickle
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " %args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]
                    csvline = case_number + ","

                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]  # Get the first form on the page
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year

                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])

                    # Use BeautifulSoup to parse this data
                    answer = browser.response.text
                    #print(answer)
                    soup = BeautifulSoup(answer)
                    for table in soup.findAll('table', {"class":"formTable"}):
                        for row in table.findAll('tr'):
                            #heading = row.find('td', {"class":"sectionHeading"})
                            #if heading is not None:
                                #print(heading.get_text());
                            #else:
                             label = row.find('td', {"class":"fieldLabel"})
                             data = row.find('td', {"class":"fieldData"})
                             if data is not None and label is not None:
                                        csvline += label.get_text() + "," + data.get_text() + ","
                    print(csvline)
                    with open ('output_file_two.csv', 'a') as f:
                        writer = csv.writer(f)
                        writer.writerow(csvline)

这是需要时的整个程序代码

import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import unicodecsv as csv
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors


base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:

        if not file_name.endswith('.pdf'):

            continue
        file_path = os.path.join(dir_path, file_name)

        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " %args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]
                    csvline = case_number + ","

                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]  
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year

                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])


                    answer = browser.response.text
                    soup = BeautifulSoup(answer)
                    for table in soup.findAll('table', {"class":"formTable"}):
                        for row in table.findAll('tr'):
                             label = row.find('td', {"class":"fieldLabel"})
                             data = row.find('td', {"class":"fieldData"})
                             if data is not None and label is not None:
                                csvline += label.get_text() + "," + data.get_text() + ","
                                print(csvline)
                                my_file = codecs.open('final_output.csv', 'a', 'utf-8')
                                my_file.write(csvline)

修改

它正在运作,这里的代码正常运作

$Password="@de08nt2128"; #password to access certificate after expting
$CertName="WMSvc-WIN-9KC7DG31JBV"; # name of the certificate to export
$RootCertName="WMSvc-WIN-9KC7DG31JBV"; # root certificate

$DestCertName="testcert"
$ExportPathRoot="C:\DestinationFolder"

$CertListToExport=Get-ChildItem -Path cert:\LocalMachine\My | ?{ $_.Subject -Like "*CN=$CertName*" -and $_.Issuer -eq "CN=$RootCertName" }

foreach($CertToExport in $CertListToExport | Sort-Object Subject)
{
    $DestCertName=$CertToExport.Subject.ToString().Replace("CN=","");

    $CertDestPath=Join-Path -Path $ExportPathRoot -ChildPath "$DestCertName.pfx"

    $type = [System.Security.Cryptography.X509Certificates.X509Certificate]::pfx
    $SecurePassword = ConvertTo-SecureString -String $Password -Force –AsPlainText

    $bytes = $CertToExport.export($type, $SecurePassword)
    [System.IO.File]::WriteAllBytes($CertDestPath, $bytes)

}
"Completed" 

2 个答案:

答案 0 :(得分:0)

最后,您的代码存在问题

writer = csv.writer(f)
csv.writer(csvline) # here is the problem

请参阅初始化编写器,但之后不要使用它。

writer = csv.writer(f)
writer.writerow(csvline)

答案 1 :(得分:0)

这里:

with open ('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)
    csv.writer (csvline)

您正在实施csv.writer,但未使用它。这应该是:

with open ('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)
    writer.write(csvline)

现在您的代码存在很多其他问题,第一个问题是手动创建&#39; csvline为文本,然后使用csv.writer将其存储到文件中。 csv.writer.write()需要一个行列表(元组),并负责正确转义需要转义的内容,插入正确的分隔符等。它还有一个writerow()方法,只需要一个元组,因此避免构建内存中的整个列表FWIW。