延迟在Puppeteer中加载图像

时间:2019-07-15 21:24:40

标签: javascript node.js

我的代码使用数据抓取从该房地产网站(https://www.zillow.com/vancouver-bc/)中提取信息。现在,对于图像,我总是会在输入几次后得到一些垃圾数据。我尝试滚动到页面底部,然后截屏以查看是否所有延迟加载的图像都已实际加载。屏幕截图显示,所有图像在滚动后都已加载,但是我收到的数据仍然没有图像数据。我确定有一些代码错误,但是我找不到它。

let cheerio        = require('cheerio')
let puppeteer      = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
puppeteer.use(pluginStealth())
let userAgent      = require('random-useragent')
const baseURL      = "https://www.zillow.com/vancouver-bc"
let estateData     = []
let urlLinks       = []

let getEstateData = async () => {
    estateData = []
    urlLinks   = []
    let url
    for (let pgNum = 1; pgNum <= 1; pgNum++) {
        if (pgNum === 1) {
            url = baseURL + "/"
        } else {
            url = baseURL + ("/" + pgNum + "_p")
        }
        urlLinks.push(url)
    }
    await searchWebsite()
    console.log("search over")
    return estateData
    //module.exports = estateData
}

function scrollPage(page) {
    return page.evaluate( () => {
        // Page evaluate's scope is the page.
        // You have to pass args as a second parameter to evalute, but functions come up undefined.
        // https://stackoverflow.com/questions/46088351/puppeteer-pass-variable-in-evaluate

        let interval;
        let scrollTop = 0;
        const scrollBottom = 20;
        const intervalRate = 50;
        const pageDocument = document.documentElement;
        // this needs to hang until the interval clears
        // or the script will just move on to the waitFor below.
        return new Promise(resolve => {
            function scroll() {
                if (scrollTop + scrollBottom < pageDocument.scrollHeight) {
                    scrollTop += scrollBottom;
                    window.scroll(0, scrollTop);
                } else {
                    clearInterval(interval);
                    resolve(window.data);
                }
            }

            interval = setInterval(scroll, intervalRate);
        });
    });
}

let searchWebsite = async () => {
    await puppeteer
        .launch({headless : false})
        .then(async function (browser) {
            let page = await browser.newPage();
            // await page.setRequestInterception(true)
            //
            // page.on('request', (req) => {
            //     if( req.resourceType() === 'image' || req.resourceType() === 'stylesheet' || req.resourceType() === 'font'){
            //         req.abort()
            //     }
            //     else {
            //         req.continue()
            //     }
            //
            // })
            await page.setViewport({ width: 1001, height: 1001 });

            let html
            await page.setUserAgent(userAgent.getRandom())
            for(let url of urlLinks){
                console.log(url)
                await page.goto(url).then(async function () {
                    html = await page.content();
                    let obj = await cheerio('.list-card-link.list-card-info', html)
                    let imgObj = await cheerio(".list-card-top", html)
                    let geoLocation = await cheerio(".photo-cards.photo-cards_wow", html)


                    await scrollPage(page)
                        .then(async () => {
                            await page.screenshot({path: 'testScreenShot.png', fullPage: true});
                        })
                        .then(async () => {
                            let num = 0
                            console.log(obj.length, "scrapping")
                            for (let key in obj) {
                                if (obj[key].attribs) {
                                    try {
                                        let geoStr = await geoLocation[0].children[0].children[0].children[0].data
                                        let geoObj = await (JSON.parse(geoStr)["geo"])

                                        let extractedInfo = {
                                            estateName: await obj[key].children[0].children[0].data,
                                            estatePrice: await obj[key].children[2].children[0].children[0].data,
                                            saleType: await obj[key].children[1].children[0].next.data,
                                            estateConfig: {
                                                beds: await obj[key].children[2].children[1].children[0].children[0].data,
                                                bath: await obj[key].children[2].children[1].children[1].children[0].data,
                                                area: await obj[key].children[2].children[1].children[2].children[0].data
                                            },
                                            estateLocation: {
                                                longitude: await geoObj.longitude,
                                                latitude: await geoObj.latitude
                                            },
                                            estateLink: await obj[key].attribs.href,
                                            estateCoverImgLink: await imgObj[num].children[2].children[0].attribs.src
                                        }
                                        console.log(extractedInfo.estateName, extractedInfo.estateCoverImgLink)
                                        //console.log(geoLocation[0].children[0].children[0])
                                        await estateData.push(extractedInfo)
                                        num++
                                    } catch (e) {
                                        console.log("Estate Skipped - ", obj[key].children[0].children[0].data, obj[key].attribs.href)
                                    }
                                }
                            }
                        })
                    })
                    console.log(estateData.length)
            }
            //Now read the page

            console.log("total - ", estateData.length)

            await page.close()
            await browser.close()
        })
        .catch(function (err) {
            console.log(err)
        });
}



module.exports.getEstateData = getEstateData

0 个答案:

没有答案
相关问题