Selenium scrapes data for only the last page

Time: 2018-12-11 04:00:42

Tags: selenium scrapy

I need some help with Selenium and Scrapy.

Selenium only parses the last page's data, but I need the data from all pages. I am getting 50 scraped records, but there should be 304.

I also tried passing meta={'s': s} (one row from "lists"), but the result was the same. This time the metadata is passed using meta={'lists': lists}:
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException

class AthSpider(Spider):
    name = 'ath'

    def start_requests(self):
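        # Open the results page in Chrome and wait for the JavaScript-rendered results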
        self.driver = webdriver.Chrome()
        self.driver.get('https://www.athlinks.com/event/127711/results/Event/828080/Results')
        sleep(20)
        self.driver.find_element_by_xpath('//*[@class="view-all-results"]').click()
        sleep(20)
        sel = Selector(text=self.driver.page_source)
        lists=sel.xpath('//*[@class="row mx-0 link-to-irp"]')
        for s in lists:
            yield Request(self.driver.current_url,meta={'lists':lists},callback=self.parse_page)

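        # Click the ">" pagination button repeatedly; each click loads the next page of results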
        while True:
            try:
                next_page=self.driver.find_element_by_xpath("//button[contains(text(),'>')]")
                next_page.click()
                sleep(20)
                sel = Selector(text=self.driver.page_source)
                lists=sel.xpath('//*[@class="row mx-0 link-to-irp"]')
                for s in lists:
                    yield Request(self.driver.current_url,meta={'lists':lists},callback=self.parse_page)

            except NoSuchElementException:
                self.logger.info('No more pages to load.')
                self.driver.quit()
                break


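    # Builds one item per result row from the selector list passed through meta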
    def parse_page(self, response):
        lists=response.meta['lists']
        lis=lists.xpath('//*[@class="row mx-0 link-to-irp"]')
        for s in lis:
            Name=s.xpath('.//*[@class="athName"]//text()').extract_first()
            Gender=s.xpath('.//*[@class="col-12 pl-0"]//text()').extract_first()
            if Gender:
                Gender=Gender.split()[0]
            Bib=s.xpath('.//*[@class="col-12 pl-0"]//span[2]//text()').extract_first()
            if Bib:
                Bib=Bib.split()[-1]
            City=s.xpath('.//*[@id="location"]//text()').extract_first()
            Pace=s.xpath('.//*[@class="col px-0"]//div[1]//text()').extract_first()
            Time=s.xpath('.//*[@class="col-2 px-0"]//text()').extract_first()

            yield {
                'Name': Name,
                'Gender': Gender,
                'Bib': Bib,
                'City': City,
                'Pace': Pace,
                'Time': Time,
            }
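
For reference, here is a minimal sketch of one way the spider could be reworked (an assumption on my part, not a verified fix). Scrapy's default duplicate filter drops repeated requests to the same URL, and lists is re-bound on every pass of the loop, which would explain why only one page's worth of rows (50 of the 304) comes back. The sketch keeps the question's structure, waits, Selenium 3 API and XPaths, but yields one request per rendered page with dont_filter=True and passes a snapshot of page_source through meta instead of the selector list; the spider name "ath_sketch" is made up for illustration.

# -*- coding: utf-8 -*-
# Sketch only: one request per Selenium-rendered page, with dont_filter=True
# and the page's HTML snapshot passed through meta. The URL, XPaths and
# 20-second waits are copied from the question; whether they still match the
# live site is assumed, not verified.
from time import sleep
from scrapy import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

class AthSketchSpider(Spider):
    name = 'ath_sketch'

    def start_requests(self):
        self.driver = webdriver.Chrome()
        self.driver.get('https://www.athlinks.com/event/127711/results/Event/828080/Results')
        sleep(20)
        self.driver.find_element_by_xpath('//*[@class="view-all-results"]').click()
        sleep(20)
        while True:
            # Snapshot the HTML of the page currently rendered in the browser;
            # dont_filter=True stops Scrapy from discarding the repeated URL.
            yield Request(self.driver.current_url,
                          meta={'page_html': self.driver.page_source},
                          dont_filter=True,
                          callback=self.parse_page)
            try:
                self.driver.find_element_by_xpath("//button[contains(text(),'>')]").click()
                sleep(20)
            except NoSuchElementException:
                self.logger.info('No more pages to load.')
                self.driver.quit()
                break

    def parse_page(self, response):
        # Parse the Selenium-rendered snapshot, not response.body (Scrapy's own
        # fetch of the URL would lack the JavaScript-rendered rows).
        sel = Selector(text=response.meta['page_html'])
        for s in sel.xpath('//*[@class="row mx-0 link-to-irp"]'):
            gender = s.xpath('.//*[@class="col-12 pl-0"]//text()').extract_first()
            bib = s.xpath('.//*[@class="col-12 pl-0"]//span[2]//text()').extract_first()
            yield {
                'Name': s.xpath('.//*[@class="athName"]//text()').extract_first(),
                'Gender': gender.split()[0] if gender else None,
                'Bib': bib.split()[-1] if bib else None,
                'City': s.xpath('.//*[@id="location"]//text()').extract_first(),
                'Pace': s.xpath('.//*[@class="col px-0"]//div[1]//text()').extract_first(),
                'Time': s.xpath('.//*[@class="col-2 px-0"]//text()').extract_first(),
            }

Moving the row extraction into the callback also keeps each item independent of any variable that the Selenium loop re-binds later.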

0 Answers:

No answers yet.