Selenium Python下载带有特定文件名的弹出式pdf文件

时间:2018-09-24 20:09:35

标签: python selenium pdf screen-scraping

我需要从网页下载一组单独的pdf文件。它是政府(土耳其教育部)公开提供的,因此完全合法。

但是我的 Selenium 浏览器只会在窗口中显示pdf文件,我该如何下载该文件并为其指定文件名?

(此代码也来自网络)

# Import your newly installed selenium package
from selenium import webdriver
from bs4 import BeautifulSoup


# Now create an 'instance' of your driver
# This path should be to wherever you downloaded the driver
# NOTE(review): the driver is started here, before `options` is built below,
# and `options` is never passed to webdriver.Chrome(), so the download
# preferences configured further down are not given to this browser instance.
driver = webdriver.Chrome(executable_path="/Users/ugur/Downloads/chromedriver")
# A new Chrome (or other browser) window should open up
download_dir = "/Users/ugur/Downloads/" # for linux/*nix, download_dir="/usr/Public"
options = webdriver.ChromeOptions()

# Preferences dict stored under the "prefs" experimental option:
# PDF viewer plugin entry, download directory, and extensions to open.
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], # Disable Chrome's PDF Viewer
               "download.default_directory": download_dir , "download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)



# Now just tell it wherever you want it to go
# Open the listing page, click one download link by its element id, then
# navigate directly to the PDF endpoint.
driver.get("https://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid=5&ders=29")
driver.find_element_by_id("ContentPlaceHolder1_dtYillikPlanlar_lnkIndir_2").click()
driver.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")

预先感谢

其他信息:

我有一段原本可以正常运行的python 2代码。但是,它现在不知为何只会创建空文件,而且我无法将其转换为python 3。也许这段代码会有所帮助(无意冒犯,但我一直不太喜欢Selenium)

import urllib
import urllib2
from bs4 import BeautifulSoup
import os


# Python 2 script: lists the PDFs already present in `directory`, then posts
# one form request per test on the listing page and saves each result as
# "<3-digit sequence> <sinifId><ders><testNo> <dersAdi><testNo>.pdf".
sinifId=5
# Highest sequence number seen among the existing files (starts at 1).
maxOrd = 1
# Second space-separated token of every existing file name; used below to
# decide which tests can be skipped.
fileNames=[]
directory = '/Users/ugur/Downloads/Hasan'
print 'List of current files in directory '+ directory+'\n---------------------------------\n\n'
for current_file in os.listdir(directory):
    # Only consider names that contain both 'pdf' and a space.
    if (current_file.find('pdf')>-1 and current_file.find(' ')>-1):
        print current_file
        # The text before the first space is parsed as the sequence number.
        order = int(current_file.split(' ',1)[0])
        if order>maxOrd: maxOrd=order
        fileNames.append(current_file.split(' ',2)[1])

print '\n\nStarting download \n---------------------------------\n'
# Sequence number for the next file to be written.
ctA=int(maxOrd+1)
for ders in [29]:
    urlSinif='http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)

    page = urllib2.urlopen(urlSinif)
    soup = BeautifulSoup(page,"lxml")
    st = soup.prettify()
    # Number of tests = occurrences of 'ctl00' in the prettified HTML, minus one.
    count=st.count('ctl00')-1
    # Text of the link pointing at the CevapAnahtarlari page for this
    # sinifId/ders pair; used as the readable part of the file name.
    dersAdi = soup.find('a', href='/kurslar/CevapAnahtarlari.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)).getText().strip()

    for testNo in range(count):

        # Skip tests whose "<sinifId><ders><testNo>" id is already on disk.
        if(str(sinifId)+str(ders)+str(testNo+1) in fileNames):
            print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' skipped'    
        else:

            # Odd-indexed rows get the suffix "2" appended to the control name.
            annex=""
            if(testNo%2==1): annex="2"

            eiha_url = u'http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)
            # Form fields for the request: the target control name (row index
            # zero-padded to two digits) and a fixed event argument.
            # NOTE(review): presumably an ASP.NET postback -- confirm.
            data = ('__EVENTTARGET','ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex), ('__EVENTARGUMENT', '39')

            print 'ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex

            new_data = urllib.urlencode(data)
            # Supplying a data argument makes urlopen send a POST request.
            response = urllib2.urlopen(eiha_url, new_data)


            # The body of `response` is not saved; instead response.url is
            # fetched again with a separate request.
            # NOTE(review): possibly the source of the empty files if the
            # second request does not return the PDF -- verify.
            urllib.urlretrieve (str(response.url), directory+'/{0:0>3}'.format(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf')
            print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' downloaded'
            ctA=ctA+1

3 个答案:

答案 0 :(得分:2)

在启动Chrome之前添加您的选项,然后指定chrome_options参数。

# Prepare the Chrome preferences first, so they exist before the browser
# is launched.
download_dir = "/Users/ugur/Downloads/"
profile = {
    # Disable Chrome's PDF Viewer
    "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
    "download.default_directory": download_dir,
    "download.extensions_to_open": "applications/pdf",
}

options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", profile)

# Hand the prepared options to the driver at construction time.
driver = webdriver.Chrome(
    chrome_options=options,
    executable_path="/Users/ugur/Downloads/chromedriver",
)

回答第二个问题:

  

我还能问一下如何指定文件名吗?

我发现了这个:Selenium give file name when downloading

我要做的是:

# Poll the download directory until its most recently created entry is a PDF.
# `file_name` ends up holding the full path of that entry.
file_name = ''
while not file_name.lower().endswith('.pdf'):
    time.sleep(.25)
    # os.path.join avoids a doubled separator when download_dir already ends
    # with a slash.
    candidates = [os.path.join(download_dir, f) for f in os.listdir(download_dir)]
    try:
        file_name = max(candidates, key=os.path.getctime)
    except ValueError:
        # The directory is still empty: max() of an empty sequence.
        pass
    except FileNotFoundError:
        # An entry disappeared between listdir() and getctime() (for example
        # a temporary file that was renamed meanwhile); retry on the next poll.
        pass

答案 1 :(得分:0)

这是一个不使用Selenium的解决方案,您可以执行以下操作:

import requests
# Fetch the PDF directly over HTTP; no browser is involved.
pdf_resp = requests.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")
# Fail loudly on an HTTP error status instead of silently saving the error
# page under the name "save.pdf".
pdf_resp.raise_for_status()
# Binary mode: the response body is written as raw bytes.
with open("save.pdf", "wb") as f:
    f.write(pdf_resp.content)

尽管您可能要先检查内容类型以确保它是pdf

答案 2 :(得分:0)

这是我用来下载具有特定文件名的pdf的代码示例。首先,您需要使用必需的选项配置chrome webdriver。然后,单击按钮(打开pdf弹出窗口)后,调用一个函数,等待下载完成并重命名下载的文件。

import os
import time
import shutil

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# function to wait for download to finish and then rename the latest downloaded file
def wait_for_download_and_rename(newFilename):
    """Wait for a Chrome download to complete, then move it to a new name.

    Uses the module-level ``driver`` (Chrome webdriver) and ``download_dir``
    (destination directory), which must be defined before this is called.
    A temporary ``chrome://downloads`` tab is opened to read the download
    list and is closed again before returning.

    Args:
        newFilename: File name (not a full path) the downloaded file gets
            inside ``download_dir``.

    Raises:
        FileNotFoundError: If the path reported by Chrome does not show up
            on disk within ``time_to_wait`` seconds.

    WebDriverWait raises its own timeout error if no download is reported as
    complete within 120 seconds.
    """

    def chrome_downloads(drv):
        """Return the paths of downloads Chrome reports as COMPLETE."""
        if "chrome://downloads" not in drv.current_url:
            # Remember the existing tabs so the newly opened one can be
            # identified even when other tabs/popups are already open.
            known_handles = set(drv.window_handles)
            drv.execute_script("window.open('');")  # open a new tab
            opened = [h for h in drv.window_handles if h not in known_handles]
            drv.switch_to.window(opened[-1] if opened else drv.window_handles[-1])
            drv.get("chrome://downloads/")  # navigate to chrome downloads
        return drv.execute_script("""
            return document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList')
            .items.filter(e => e.state === 'COMPLETE')
            .map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
            """)

    # Poll once per second, for up to 120 seconds, until the list of
    # completed downloads is non-empty (an empty list is falsy, so the wait
    # continues while nothing has completed).
    dld_file_paths = WebDriverWait(driver, 120, 1).until(chrome_downloads)
    # Close the current tab (chrome downloads)
    if "chrome://downloads" in driver.current_url:
        driver.close()
    # Switch back to the first tab
    driver.switch_to.window(driver.window_handles[0])
    # First entry of the list.
    # NOTE(review): presumably Chrome lists the newest download first -- confirm.
    dlFilename = dld_file_paths[0]
    # Wait until the downloaded file appears in the download directory.
    time_to_wait = 20  # seconds; adjust timeout as per your needs
    time_counter = 0
    while not os.path.isfile(dlFilename):
        if time_counter >= time_to_wait:
            # Stop here with a clear message rather than letting the move
            # below fail on a missing source file.
            raise FileNotFoundError(
                "Downloaded file did not appear within %d seconds: %s"
                % (time_to_wait, dlFilename)
            )
        time.sleep(1)
        time_counter += 1
    # Move/rename the downloaded file into the download directory.
    shutil.move(dlFilename, os.path.join(download_dir, newFilename))

# specify custom download directory
download_dir = r'c:\Downloads\pdf_reports'

# Chrome preferences so that PDF popups are downloaded instead of displayed.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', {
    "download.default_directory": download_dir, # Set own Download path
    "download.prompt_for_download": False, # Do not ask for download at runtime
    "download.directory_upgrade": True, # Also needed to suppress download prompt
    "plugins.plugins_disabled": ["Chrome PDF Viewer"], # Disable the built-in PDF viewer plugin
    "plugins.always_open_pdf_externally": True, # Open PDFs externally (download) rather than in the viewer
    })

# get webdriver with options for configuring chrome pdf viewer
driver = webdriver.Chrome(options=chrome_options)

# try/finally guarantees the browser is shut down even when a step fails.
try:
    # open desired webpage
    driver.get('https://mywebsite.com/mywebpage')

    # click the button to open pdf popup
    driver.find_element_by_id('someid').click()

    # wait for the download to finish and rename the downloaded file
    wait_for_download_and_rename('My file.pdf')
finally:
    # close the browser windows
    driver.quit()

请根据需要调整等待下载完成的超时时间(示例中为120秒)。

相关问题