如何使用 Selenium 和 C# 从网站抓取数据表?

时间:2019-02-05 18:55:24

标签: c# selenium selenium-webdriver automation selenium-chromedriver

我正在尝试从网页中抓取数据,但在抓取表格中的全部数据时遇到了麻烦。我需要逐页切换才能获取所有数据,并希望把结果输出到 DataGridTable 中。我很难弄清楚该如何实现这一点,并且要在网站的页数发生变化时仍然有效。我想逐页把信息自动添加到数据网格表中。目前网站每页只显示25个项目,所以我的 DataGridTable 中也只有25个项目。我想从“转到最后一页”按钮元素中读取“页数”,这样我的程序就知道要从网站上抓取多少页。如果还有其他方法,也请告诉我,谢谢。

这是我现在的代码。

DataTable dt = new DataTable();

        // Build the DataTable columns from the grid's header cells.
        var header = driver.FindElement(By.CssSelector("#gridComponent > div.k-grid-header"));
        foreach (var headerRow in header.FindElements(By.TagName("tr")))
        {
            List<string> columnNames = new List<string>();

            foreach (var headerCell in headerRow.FindElements(By.TagName("th")))
            {
                // A header cell that holds a checkbox gets no DataTable column.
                // Every other header cell becomes a column.
                var headerCheckboxes = headerCell.FindElements(By.CssSelector("input[type='checkbox']"));
                if (headerCheckboxes.Count > 0)
                {
                    continue;
                }

                string columnName = headerCell.Text;
                columnNames.Add(columnName);
                dt.Columns.Add(columnName);
            }

            // Print the header names themselves; writing the collection object
            // directly would only print its type name.
            Console.WriteLine(string.Join(", ", columnNames));
        }

        var table = driver.FindElement(By.CssSelector("#gridComponent"));

        // Copy every data row that is currently rendered into the DataTable.
        foreach (var row in table.FindElements(By.TagName("tr")))
        {
            var cells = row.FindElements(By.TagName("td"));

            // The header rows also live under #gridComponent but contain only
            // <th> cells. Adding them would insert blank rows, so skip them.
            if (cells.Count == 0)
            {
                continue;
            }

            List<string> values = new List<string>();

            // Start at 1: the first cell of each row is deliberately skipped.
            for (int cellIndex = 1; cellIndex < cells.Count; cellIndex++)
            {
                var cell = cells[cellIndex];
                var checkboxes = cell.FindElements(By.CssSelector("input[type='checkbox']"));

                // A checkbox cell is stored as "True"/"False" from its selected
                // state; any other cell is stored as its visible text.
                string cellValue = checkboxes.Count > 0
                    ? checkboxes[0].Selected.ToString()
                    : cell.Text;

                values.Add(cellValue);
            }

            dt.Rows.Add(values.ToArray());
        }
        dataGridView1.DataSource = dt;

        // Click the third item of the pager's list.
        // NOTE(review): only the page rendered before this click has been read.
        driver.FindElement(By.CssSelector("#gridComponent > div.k-pager-wrap.k-grid-pager.k-widget.k-floatwrap > ul > li:nth-child(3)")).Click();
    }

This is the table that I am trying to scrape from.

这是上图所示元素的代码。

<a href="#" aria-label="Go to the last page" title="Go to the last page" class="k-link k-pager-nav k-pager-last" data-page="3" tabindex="-1"><span class="k-icon k-i-arrow-end-right"></span></a>

非常感谢您。

2 个答案:

答案 0 :(得分:0)

您可以考虑利用分页信息“64个项目中的第1-25项”,因为据此可以推算出总页数。

批次= 1-25,即每页25个项目 项目总数= 64

页数 = 向上取整(64/25) = 3

PS:还有一个无需任何计算的更好选择:直接读取“最后一页”按钮的 data-page 属性。

答案 1 :(得分:0)

我终于得到了答案。

/// <summary>
/// Reads the data rows currently rendered under <paramref name="table"/>.
/// </summary>
/// <param name="table">Element whose descendant <c>tr</c> rows are scanned.</param>
/// <returns>
/// One list of cell values per row that contains <c>td</c> cells. The first
/// cell of each row is skipped. A cell containing a checkbox is reported as
/// "True" or "False" from the checkbox's selected state; any other cell is
/// reported as its text.
/// </returns>
private List<List<string>> GetRecords(IWebElement table)
    {
        List<List<string>> rows = new List<List<string>>();

        foreach (var row in table.FindElements(By.TagName("tr")))
        {
            var cells = row.FindElements(By.TagName("td"));

            // Rows without any <td> (for example header rows made of <th>
            // cells) carry no data. Returning them as empty records would
            // give the caller blank rows, so they are skipped.
            if (cells.Count == 0)
            {
                continue;
            }

            List<string> cols = new List<string>();

            // Start at 1: the first cell of each row is deliberately skipped.
            for (int cellIndex = 1; cellIndex < cells.Count; cellIndex++)
            {
                var cell = cells[cellIndex];
                var checkboxes = cell.FindElements(By.CssSelector("input[type='checkbox']"));

                string cellValue = checkboxes.Count > 0
                    ? checkboxes[0].Selected.ToString()
                    : cell.Text;

                cols.Add(cellValue);
            }

            rows.Add(cols);
        }

        return rows;
    }

    /// <summary>
    /// Logs in, opens the customer list, reads the grid header and every page
    /// of the grid, and binds the collected rows to <c>dataGridView1</c>.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // The driver is disposed when the handler ends, including when an
        // exception is thrown, so the browser and driver processes do not
        // outlive the scrape.
        //
        // To run without a visible browser or console window, construct the
        // driver from a ChromeDriverService with HideCommandPromptWindow = true
        // and ChromeOptions with the "--headless" argument.
        using (IWebDriver driver = new ChromeDriver())
        {
            driver.Url = "**************";
            driver.Manage().Window.Maximize();
            driver.SwitchTo().DefaultContent();

            //Log-in
            driver.FindElement(By.Id("username")).SendKeys("*****");
            driver.FindElement(By.Id("password")).SendKeys("******" + OpenQA.Selenium.Keys.Enter);

            //Entering Access Code
            driver.FindElement(By.Id("password")).SendKeys("*******");
            driver.FindElement(By.Id("accesscode")).SendKeys("********" + OpenQA.Selenium.Keys.Enter);

            //go to CustomerList
            driver.Navigate().GoToUrl("***********");
            driver.Navigate().GoToUrl("*****************");

            // Give the page time to load. waitOnPage is defined outside this method.
            // NOTE(review): an earlier comment said "3 seconds" but the argument
            // is 2 - confirm the unit and the intended delay.
            waitOnPage(2);

            DataTable dt = new DataTable();

            // Build the DataTable columns from the grid's header cells.
            var header = driver.FindElement(By.CssSelector("#gridComponent > div.k-grid-header"));
            foreach (var headerRow in header.FindElements(By.TagName("tr")))
            {
                List<string> columnNames = new List<string>();

                foreach (var headerCell in headerRow.FindElements(By.TagName("th")))
                {
                    // A header cell that holds a checkbox gets no DataTable
                    // column. Every other header cell becomes a column.
                    var headerCheckboxes = headerCell.FindElements(By.CssSelector("input[type='checkbox']"));
                    if (headerCheckboxes.Count > 0)
                    {
                        continue;
                    }

                    string columnName = headerCell.Text;
                    columnNames.Add(columnName);
                    dt.Columns.Add(columnName);
                }

                // Print the header names themselves; writing the collection
                // object directly would only print its type name.
                Console.WriteLine(string.Join(", ", columnNames));
            }

            var table = driver.FindElement(By.CssSelector("#gridComponent"));

            // Rows of the page that is displayed first.
            List<List<string>> records = GetRecords(table);

            // The "last page" pager button carries the total page count in
            // its data-page attribute.
            var lastPageStr = table.FindElement(By.ClassName("k-pager-last")).GetAttribute("data-page");
            int lastPage = Convert.ToInt32(lastPageStr);

            // NOTE(review): this clicks the third item of the pager list before
            // the paging loop starts. The pager markup is not visible here, so
            // confirm this does not move past a page that has not been read yet.
            driver.FindElement(By.CssSelector("#gridComponent > div.k-pager-wrap.k-grid-pager.k-widget.k-floatwrap > ul > li:nth-child(3)")).Click();

            // Visit the remaining pages: click the pager's fourth child link,
            // wait, then read the rows that are rendered.
            for (int p = 0; p < (lastPage - 1); p++)
            {
                driver.FindElement(By.CssSelector("#gridComponent > div.k-pager-wrap.k-grid-pager.k-widget.k-floatwrap > a:nth-child(4) > span")).Click();
                waitOnPage(2);
                records.AddRange(GetRecords(table));
            }

            // Add all collected rows to the DataTable.
            foreach (var row in records)
            {
                dt.Rows.Add(row.ToArray());
            }

            dataGridView1.DataSource = dt;
        }
    }