循环浏览网页并复制数据

时间:2015-02-07 16:40:32

标签: internet-explorer vbscript web-scraping wsh

我为一位朋友创建了这个脚本,该朋友在一个房地产网站上循环并为她提供了电子邮件地址(用于促销)。该网站免费提供,但一次抓取一个是不方便的。第一个脚本将每个页面数据转储到名为webdump的txt文件中,第二个脚本从第一个txt文件中提取电子邮件地址。将每个这些保存在单独的.vbs文件中。如果要测试脚本,可能需要将以下内容更改为较小的数字(这是处理的页数):

Do while i < 1334

第一个错误的方法,我不完全确定为什么,第二个提取的不只是电子邮件地址,再次,不完全确定原因。我不是一个高技能的vbs家伙,但这些问题与我的问题无关......底部的问题......

set ie = createobject("internetexplorer.application") 
Set objShell = CreateObject("WScript.Shell")
Dim i
i = 0

Do while i < 1334
i = i + 1

ie.navigate "http://www.reoagents.net/search-3.php?category=1&firmname=&business=&address=&zip=&phone=&fax=&mobile=&im=&manager=&mail=&www=&reserved_1=&reserved_2=&reserved_3=&filterbyday=ANY&loc_one=&loc_two=&loc_three=&loc_four=&location_text=&page="&i
do until ie.readystate = 4 : wscript.sleep 10: loop 

pageText = ie.document.body.innertext 

set fso = createobject("scripting.filesystemobject") 
set ts = fso.opentextfile("c:\webdump.txt",8,true) 
ts.write pageText 
ts.close 

loop

Wscript.Echo "All site data copied!"

第二部分:

Const ForReading = 1
Const ForWriting = 8

Set objRegEx = CreateObject("VBScript.RegExp")
objRegEx.Pattern = "@"

Set objFSO = CreateObject("Scripting.FileSystemObject")

'Input file
Set objFileIn = objFSO.OpenTextFile("C:\webdump.txt", ForReading)
strOutputFile = "C:\cleanaddress.txt"

Do Until objFileIn.AtEndOfStream
strSearchString = objFileIn.ReadLine
Set colMatches = objRegEx.Execute(strSearchString)  
If colMatches.Count > 0 Then
    For Each strMatch in colMatches 
' Output File
Set objFileOut = objFSO.OpenTextFile(strOutputFile, ForWriting, True)  

IF InStr(strSearchString," ") = 0 THEN
objFileOut.writeline strSearchString
ELSE
objFileOut.writeline Left(strSearchString,InStr(strSearchString," ")-1)


    END IF
    objFileOut.Close
    Set objFileOut = Nothing

    Next
End If
Loop

objFileIn.Close
Wscript.Echo "Done!"

我能够轻松浏览该网站上的网页,因为地址的方式是......最后一个地址是连续的,但是,现在我想用这个地址尝试一下:

https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes&FromSearchControl=Yes

似乎是基于java的。当我点击每个页面时,地址不会改变。在这种情况下,是否可以做类似于我在其他网站上所做的事情?

2 个答案:

答案 0 :(得分:2)

这是真正的jedi方法:)仅使用XMLHttpRequests,没有IE的缺点或依赖性。通过mshta动态创建的输出窗口没有临时文件。通过实现异步请求或多进程环境可以提高处理速度。不幸的是,目前停止脚本的唯一方法是wscript.exe进程终止。

Option Explicit

Dim oDisplay, sUrl, sRespHeaders, sRespText, arrSetHeaders, sEventTarget, arrFormData, lPage, lMember, i, arrFormStrings, sFormData, arrMembers, arrMemeber, sUrlEmail, sRespTextEmail, sEmail

Set oDisplay = New OutputWindow
sUrl = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes"
lPage = 0
lMember = 0

' Initial webpage request
oDisplay.Write("Connecting " & vbCrLf & sUrl)
XmlHttpRequest "GET", sUrl, Array(), "", sRespHeaders, sRespText

' Loop through all pages
Do
    ' Get cookies, form data, listctrl
    oDisplay.Write("Processing page #" & (lPage + 1))
    sEventTarget = ParseFragm("__doPostBack\('(ListControl_[\s\S]*?)',", sRespText)
    ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders
    ParseResponse "<input type=""hidden"" name=""([\S]*?)""[\s\S]*?value=""([\s\S]*?)"" />", sRespText, arrFormData

    ' Update form params
    For i = 0 To UBound(arrFormData)
        Select Case arrFormData(i)(0)
        Case "__POSTBACKCONTROL"
            arrFormData(i)(1) = "JumpToPage"
        Case "__EVENTTARGET"
            arrFormData(i)(1) = sEventTarget
        Case "__EVENTARGUMENT"
            arrFormData(i)(1) = CStr(lPage)
        End Select
    Next

    ' Jump to page #lPage
    arrFormStrings = Array()
    ReDim arrFormStrings(UBound(arrFormData))
    For i = 0 To UBound(arrFormData)
        arrFormStrings(i) = EncodeUriComponent(arrFormData(i)(0)) & "=" & EncodeUriComponent(arrFormData(i)(1))
    Next
    sFormData = Join(arrFormStrings, "&")
    PushItem arrSetHeaders, Array("Content-Type", "application/x-www-form-urlencoded")
    PushItem arrSetHeaders, Array("Content-Length", CStr(Len(sFormData)))

    ' New page POST request
    XmlHttpRequest "POST", sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText

    ' Parse members from new page
    ParseMembers sRespText, arrMembers

    ' Parse members emails, and output 
    For Each arrMemeber in arrMembers
        lMember = lMember + 1
        sUrlEmail = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=PrimaryContactInfo&ind_cst_key=" & arrMemeber(0)
        XmlHttpRequest "GET", sUrlEmail, Array(), "", "", sRespTextEmail
        sEmail = ParseFragm("""mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", sRespTextEmail)
        oDisplay.WriteTable(Array(CStr(lMember), sEmail, arrMemeber(0), arrMemeber(1)))
    Next

    lPage = lPage + 1
Loop


Sub ParseResponse(sPattern, sResponse, arrData)
    Dim oMatch
    arrData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            PushItem arrData, Array(oMatch.SubMatches(0), oMatch.SubMatches(1))
        Next
    End With
End Sub

Function ParseFragm(sPattern, sResponse)
    Dim oMatches
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = sPattern
        Set oMatches = .Execute(sResponse)
        If oMatches.Count > 0 Then ParseFragm = oMatches(0).SubMatches(0)
    End With
End Function

Sub ParseMembers(sRespText, arrMembers)
    Dim oMatch
    arrMembers = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = "<td class[\s\S]*?>([\s\S]*?<[\s\S]*?Key=([\s\S]*?)&[\s\S]*?)</td>"
        For Each oMatch In .Execute(sRespText)
            PushItem arrMembers, Array(oMatch.SubMatches(1), GetInnerText(oMatch.SubMatches(0)))
        Next
    End With
End Sub

Sub PushItem(arrList, varItem)
    ReDim Preserve arrList(UBound(arrList) + 1)
    arrList(UBound(arrList)) = varItem
End Sub

Function EncodeUriComponent(sText)
    With CreateObject("htmlfile")
        .Write ("<script language='JScript'></script>")
        EncodeUriComponent = .DocumentElement.Document.Script.EncodeUriComponent(sText)
    End With
End Function

Function GetInnerText(sText)
    With CreateObject("htmlfile")
        .Write ("<body>" & sText & "</body>")
        GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
    End With
End Function

Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
    Dim arrHeader
    With CreateObject("Msxml2.ServerXMLHTTP.3.0")
        .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        .Open sMethod, sUrl, False
        For Each arrHeader In arrSetHeaders
            .SetRequestHeader arrHeader(0), arrHeader(1)
        Next
        .Send sFormData
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With
End Sub

Class OutputWindow

    Dim oWnd, oDoc, oOutDiv, oCursorDiv, oOutTBody, sSignature, lCols

    Private Sub Class_Initialize()
        sSignature = "OutputWindow"
        ProvideWindow()
    End Sub

    Private Sub ProvideWindow()
        ' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
        Dim lWidth, lHeight
        GetWindow()
        If oWnd Is Nothing Then
            CreateWindow()
            With oWnd
                With .Document
                    .GetElementsByTagName("head")(0).AppendChild .CreateElement("style")
                    .stylesheets(0).cssText = "body, td, #output {font-family: consolas, courier new; font-size: 9pt;} #cursor {margin: 3px;} body {background-color: buttonface;} #output {height: 100%; width: 100%; overflow: scroll; background: #FFF;} div.hline {height: 1px; width: 100%; background-color: #000; overflow: hidden;} table {width: 100%; TEXT-ALIGN: center; border-COLLAPSE: collapse; background: transparent; margin-top: 1px;} td {border: black 1px solid;}"
                    .Title = "Output Window"
                    .Body.InnerHtml = "<div id='output'><div id='cursor'><img src='data:image/gif;base64,R0lGODlhAwAJAPAAAAAAAAAAACH5BAkeAAEAIf8LTkVUU0NBUEUyLjADAf//ACwAAAAAAwAJAAACBwxieMnrGgoAIfkECR4AAAAsAAAAAAMACQAAAgSEj6laADs=' /></div></div>"
                End With
                lWidth = CInt(.Screen.AvailWidth * 0.75)
                lHeight = CInt(.Screen.AvailHeight * 0.75)
                .ResizeTo .Screen.AvailWidth, .Screen.AvailHeight
                .ResizeTo lWidth + .Screen.AvailWidth - .Document.Body.OffsetWidth, lHeight + .Screen.AvailHeight - .Document.Body.OffsetHeight
                .MoveTo CInt((.Screen.AvailWidth - lWidth) / 2), CInt((.Screen.AvailHeight - lHeight) / 2)
            End With
        End If
        Set oDoc = oWnd.Document
        Set oOutDiv = oWnd.output
        Set oCursorDiv = oWnd.cursor
        lCols = -1
    End Sub

    Private Sub GetWindow()
        Dim oShellWnd
        On Error Resume Next
        For Each oShellWnd In CreateObject("Shell.Application").Windows
            Set oWnd = oShellWnd.GetProperty(sSignature)
            If Err.Number = 0 Then Exit Sub
            Err.Clear
        Next
        Set oWnd = Nothing
    End Sub

    Private Sub CreateWindow()
        Dim oProc
        Do
            Set oProc = CreateObject("WScript.Shell").exec("mshta ""about:<head><script>moveTo(-32000,-32000);window.document.title=' ';</script><hta:application id=app border=dialog minimizebutton=no maximizebutton=no scroll=no showintaskbar=yes contextmenu=yes selection=yes innerborder=no /><object id='shellwindow' classid='clsid:8856F961-340A-11D0-A96B-00C04FD705A2'><param name=RegisterAsBrowser value=1></object><script>shellwindow.putproperty('" & sSignature & "',window);</script></head>""")
            Do
                If oProc.Status > 0 Then Exit Do
                GetWindow()
                If Not (oWnd Is Nothing) Then Exit Sub
            Loop
        Loop
    End Sub

    Private Sub ChkDoc()
        On Error Resume Next
        If TypeName(oDoc) <> "HTMLDocument" Then ProvideWindow()
    End Sub

    Public Sub Write(sText)
        Dim oDiv
        ChkDoc()
        On Error Resume Next
        Set oDiv = oDoc.CreateElement("div")
        oDiv.InnerHtml = EscapeHtml(sText) & "<div class='hline'></div>"
        oOutDiv.AppendChild oDiv
        oOutDiv.AppendChild oCursorDiv
        oOutDiv.ScrollTop = oOutDiv.ScrollHeight
        lCols = -1
    End Sub

    Public Sub WriteTable(arrCells)
        Dim sInner, oTable, oRow, oTr, oCell, n
        ChkDoc()
        On Error Resume Next
        If UBound(arrCells) <> lCols Then
            Set oTable = oDoc.CreateElement("table")
            oOutDiv.AppendChild oTable
            Set oOutTBody = oDoc.CreateElement("tbody")
            oTable.AppendChild oOutTBody
            lCols = UBound(arrCells)
        End If
        Set oTr = oDoc.CreateElement("tr")
        oOutTBody.AppendChild oTr
        For n = 0 To lCols
            Set oCell = oTr.InsertCell(n)
            oCell.InnerHtml = EscapeHtml(arrCells(n))
        Next
        oOutDiv.AppendChild oCursorDiv
        oOutDiv.ScrollTop = oOutDiv.ScrollHeight
    End Sub

    Public Sub BreakTable()
        lCols = -1
    End Sub

    Private Function EscapeHtml(sCnt)
        Dim n
        sCnt = Replace(sCnt, "&", "&amp;")
        sCnt = Replace(sCnt, """", "&quot;")
        sCnt = Replace(sCnt, "<", "&lt;")
        sCnt = Replace(sCnt, ">", "&gt;")
        sCnt = Replace(sCnt, "'", "&#39;")
        sCnt = Replace(sCnt, vbCrLf, "<br>")
        sCnt = Replace(sCnt, Chr(9), "&nbsp;&nbsp;&nbsp;&nbsp;")
        sCnt = Replace(sCnt, "  ", " &nbsp;")
        sCnt = Replace(sCnt, "&nbsp; ", "&nbsp;&nbsp;")
        For n = 0 To 31
            sCnt = Replace(sCnt, Chr(n), "¶")
        Next
        EscapeHtml = sCnt
    End Function

    Private Sub Class_Terminate()
        ' oWnd.close
    End Sub

End Class

答案 1 :(得分:0)

虽然不完整,不是最佳,不是无bug,但这可能会有所帮助:

' VB Script Document
option explicit

Dim strResult: strResult = Wscript.ScriptName
Dim numResult: numResult = 0
Dim ii, IE, pageText, fso, ts, xLink, Links

  set fso = createobject("scripting.filesystemobject") 
  set ts = fso.opentextfile("d:\bat\files\28384650_webdump.txt",8,true) 

  set IE = createobject("internetexplorer.application") 

  'read first page
  IE.navigate "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes&FromSearchControl=Yes"
  IE.Visible = True

For ii = 1 to 3 '239
  ts.writeLine "-----------------" & ii
  strResult = strResult & vbNewLine & ii

  While IE.Busy
    Wscript.Sleep 100
  Wend
  While IE.ReadyState <> 4
    Wscript.Sleep 100
  Wend
  While IE.document.readystate <> "complete" 
      wscript.sleep 100
  Wend
  WScript.Sleep 100

  pageText = IE.document.body.innertext
  ts.writeLine pageText

  ' get sublinks and collect them in the 'strResult' variable
  Set Links = IE.document.getElementsByTagName("a")
  For Each xLink In Links
    If InStr(1, xLink.href, "WebCode=PrimaryContactInfo" _
      , vbTextCompare) > 0 Then
      If InStr(1, strResult, xLink.href, vbTextCompare) > 0 Then
      Else
        numResult = numResult + 1
        strResult = strResult & vbNewLine & xLink.href
      End If
    End If
  Next

  ' read a page of the 'ii' index
  IE.Navigate "javascript:window.__doPostBack('JumpToPage','" & ii+1 & "');"
  IE.Visible = True
Next

  ts.writeLine "===========" & numResult & vbTab & strResult
  ts.close 

Wscript.Echo "All site data copied! " _
    & numResult & vbNewline & strResult
Wscript.Quit

说明:

  • 使用通常的http(s)地址导航到第一个页面
  • 使用ii+1 ... javascript电话导航到__doPostBack索引的下一个页面(就像一个人跳转一样到页面字段,然后点击 GO 按钮)
  • 未完成:收集(间接)主要联络信息的链接 可以在不获取电子邮件地址的情况下找到电子邮件地址的网页<​​/ li>
  • 不是最佳:不断收集所访问网页的文字
  • not bugfree

    • 适用于新近清除的 MSIE 临时文件, 历史和饼干;否则从奇数(最后访问?)开始 netforum.avectra.com
    • 导航到ii+1 页面,因此在最后一页失败。