我如何在div标签之外通过Web抓取数据

时间:2020-03-12 15:49:02

标签: vba web-scraping

我无法从https://next-episode.net/star-trek-picard抓取值。我希望从“ Previous__episode”标签ID中提取季节编号。该数字似乎介于2个标签之间,我似乎无法找到引用它的方法。香港专业教育学院以前一直拉所有previous_episode标签的内部文本,并使用正则表达式函数来隔离我想要给我以下内容的值

上一集

名称:碎块

日期:

2020年3月12日

季节:

1

集锦:8

摘要:情节摘要

(在excel中,它们之间没有任何一行。由于某些原因,问题框中的格式将它们全部放在一行上)

在今晚之前,数字1与“ Season:”位于同一行,从而允许我的正则表达式功能正常工作。 或者可能是一个正则表达式,让我在“季节:”之后的下一行获得1。

Dim XML_05 As New MSXML2.XMLHTTP60
Dim HTML_05 As New MSHTML.HTMLDocument

XML_05.Open "GET", Cells(Row, NextEpisodeURL).Value, False
XML_05.send
HTML_05.body.innerHTML = XML_05.responseText


Dim NETC_05 As MSHTML.IHTMLElementCollection
Dim NET_05 As MSHTML.IHTMLElement
Dim REC_05 As MSHTML.IHTMLElement
Dim CEC_05 As MSHTML.IHTMLElementCollection
Dim CE_05 As MSHTML.IHTMLElement
Dim REO_05 As VBScript_RegExp_55.RegExp
Dim MO_05 As Object
Dim SN_05() As String
Dim ENA_05() As String
Dim EN_05() As String
Dim LatestEpisodeName As String



Set NET_05 = HTML_05.getElementById("previous_episode")
Set REO_05 = New VBScript_RegExp_55.RegExp
    REO_05.Global = True
    REO_05.IgnoreCase = True


REO_05.Pattern = "(Name:(.*))"
    Set MO_05 = REO_05.Execute(NET_05.innerText)
        Debug.Print MO_05.Count
        Debug.Print MO_05(0).Value
            ENA_05 = Split(MO_05(0), ":")
        Debug.Print ENA_05(1)
        LatestEpisodeName = ENA_05(1)


REO_05.Pattern = "(Episode:([0-9]*))"
    Set MO_05 = REO_05.Execute(NET_05.innerText)
        Debug.Print MO_05.Count
        Debug.Print MO_05(0).Value
            EN_05 = Split(MO_05(0), ":")
        Debug.Print EN_05(1)
        Cells(Row, EpisodeNet).Value = EN_05(1)


REO_05.Pattern = "(Season:\s+([0-9]*))"
    Set MO_05 = REO_05.Execute(NET_05.innerText)
        Debug.Print MO_05.Count
        Debug.Print MO_05(5).Value
            SN_05 = Split(MO_05(0), ":")
        Debug.Print SN_05(1)
        Trim (SN_05(1))
        Cells(Row, SeasonNet).Value = SN_05(1)



Set NETC_05 = HTML_05.getElementById("next_episode").Children
    Cells(Row, CountDown).Value = NETC_05(5).innerText
    Debug.Print NETC_05(5).innerText

3 个答案:

答案 0 :(得分:2)

我想这可能会帮助您获得希望获取的必填字段:

Sub FetchData()
    Const Url$ = "https://next-episode.net/star-trek-picard"
    Dim HTML As New HTMLDocument, post As Object
    Dim rxp As New RegExp, R&

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", Url, False
        .send
        HTML.body.innerHTML = .responseText
    End With

    Set post = HTML.getElementById("previous_episode")

    With rxp
        .pattern = "(Name:(.*))"
        If .Execute(post.innerText).Count > 0 Then
            R = R + 1: Cells(R, 1) = .Execute(post.innerText)(0).SubMatches(0)
        End If

        .pattern = "(Season:\s*([0-9]*))"
        If .Execute(post.innerText).Count > 0 Then
            Cells(R, 2) = .Execute(post.innerText)(0).SubMatches(0)
        End If

        .pattern = "(Episode:([0-9]*))"
        If .Execute(post.innerText).Count > 0 Then
            Cells(R, 3) = .Execute(post.innerText)(0).SubMatches(0)
        End If
    End With
End Sub

答案 1 :(得分:0)

    Const Url$ = "https://next-episode.net/star-trek-picard"
    Dim HTML As New HTMLDocument, post As Object
    Dim rxp As New RegExp, R&

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", Url, False
        .send
        HTML.body.innerHTML = .responseText
    End With

    Set post = HTML.getElementById("previous_episode")

    With rxp
        .pattern = "(Name:(.*))"
        If .Execute(post.innerText).Count > 0 Then
            R = R + 1: Cells(R, 1) = .Execute(post.innerText)(0).SubMatches(0)
        End If

        .pattern = "(Season:\s+([0-9]*))"
        If .Execute(post.innerText).Count > 0 Then
            CleanString = Application.WorksheetFunction.Clean(.Execute(post.innerText)(0).SubMatches(0))
            SeasonNumber = Split(CleanString, ":")
            Cells(R, 2) = SeasonNumber(1)
        End If

        .pattern = "(Episode:([0-9]*))"
        If .Execute(post.innerText).Count > 0 Then
            Cells(R, 3) = .Execute(post.innerText)(0).SubMatches(0)
        End If
    End With
End Sub

答案 2 :(得分:0)

使用DOM更加方便快捷。您想要的节点是NextSiblings到与尼斯且快速的CSS选择器#previous_episode .subheadline匹配的项目。您将获得一个返回的nodeList,它们是左侧的元素,例如“名称”,“日期”等...只需使用NextSibling导航到右侧元素,例如“等着阿卡迪亚自我,第1部分”。

返回的nodeList从0开始,您可以索引以获取单个项目。请注意,您想要的第二项是textNode,因此必须使用NodeValue属性而不是innerText

这在处理HTML解析方面更快,更正确。

Option Explicit
Public Sub GetPriorEpisodeInfp()
    Const URL = "https://next-episode.net/star-trek-picard"
    Dim html As New mshtml.HTMLDocument, previousEpisodeItems As Object

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .send
        html.body.innerHTML = .responseText
    End With

    Set previousEpisodeItems = html.querySelectorAll("#previous_episode  .subheadline")

    With ActiveSheet
        .Cells(1, 1) = previousEpisodeItems.item(0).NextSibling.innerText
        .Cells(1, 2) = previousEpisodeItems.item(2).NextSibling.NodeValue 'textNode
        .Cells(1, 3) = previousEpisodeItems.item(3).NextSibling.innerText
    End With
End Sub