如何提高网页抓取效率

时间:2016-08-04 15:31:50

标签: c# web-scraping

我用C#创建了一个简单的应用程序,它使用HtmlAgilityPack通过XPath抓取网页内容。我只是解析网页并提取其中需要的内容块。

我的基本代码:

/// <summary>
/// Lets the user pick a .txt file, treats every line of it as a URL, loads each
/// page and adds one row per URL to <c>dataGridView1</c>.
/// </summary>
/// <remarks>
/// Row contents per URL:
/// <list type="bullet">
/// <item>the difference <c>top - mine</c> when both values could be scraped;</item>
/// <item>"ANOTHER" when scraping raised <see cref="ArgumentNullException"/> but the raw
/// page source contains the text "example";</item>
/// <item>"NONE" when that fallback finds nothing or fails itself;</item>
/// <item>"WRONG URL" when a <see cref="UriFormatException"/> is raised for the line.</item>
/// </list>
/// Any other exception type propagates to the caller, exactly as before.
/// </remarks>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // The dialog is IDisposable; release it deterministically once we are done.
    using (OpenFileDialog ofd = new OpenFileDialog())
    {
        ofd.Filter = ".txt|*.txt";
        if (ofd.ShowDialog() != DialogResult.OK)
        {
            return; // user cancelled, nothing to do
        }

        label4.Text = ofd.SafeFileName;
        string my_site = textBox1.Text;

        // Loop-invariant: one loader instance is enough for all URLs.
        var Webget = new HtmlWeb();

        // 'using' guarantees the file handle is closed even if an unexpected
        // exception escapes the loop (the original never closed the reader).
        using (var file = new System.IO.StreamReader(ofd.FileName))
        {
            string line;
            while ((line = file.ReadLine()) != null) // one URL per line
            {
                try
                {
                    var doc = Webget.Load(line); // download and parse the page
                    int mine = Convert.ToInt32(scrape_mine(doc, my_site));
                    int top = Convert.ToInt32(scrape_top(doc));
                    if (top == 0)
                    {
                        top = 1; // a scraped top of 0 is treated as 1
                    }

                    dataGridView1.Rows.Add(line, top - mine);
                }
                catch (ArgumentNullException) // a value could not be scraped
                {
                    try
                    {
                        // Fallback: look for the marker text in the raw source.
                        // NOTE(review): Worker.getSourceCode presumably downloads the
                        // page a second time - confirm; if so, it doubles the cost.
                        string sourceCode = Worker.getSourceCode(line);

                        // Plain ordinal substring test; equivalent to the former
                        // Regex(@"(example)") match without building a Regex per URL.
                        bool hasMarker = sourceCode != null && sourceCode.Contains("example");
                        dataGridView1.Rows.Add(line, hasMarker ? "ANOTHER" : "NONE");
                    }
                    catch
                    {
                        // Deliberate best-effort: any failure in the fallback is
                        // reported as "NONE" rather than aborting the whole batch.
                        dataGridView1.Rows.Add(line, "NONE");
                    }
                }
                catch (UriFormatException) // the line is not a usable URL
                {
                    dataGridView1.Rows.Add(line, "WRONG URL");
                }
            }
        }

        // Post-processing of the filled grid (described as sorting by the original
        // author; countRows is defined elsewhere).
        countRows(dataGridView1);
        MessageBox.Show("Finished!");
    }
}

我想我不需要粘贴全部代码。问题在于,正如您所看到的,我是逐行读取网址,并依次下载、解析每个页面的源代码。解析同一个站点的5000个网址大约需要15分钟(ping约50ms)。有没有办法缩短这个时间?

0 个答案:

没有答案
相关问题