使用RSelenium下载多个PDF

时间:2019-01-03 05:44:19

标签: download rvest rselenium

我试图通过在页面上导航来下载多个PDF。虽然我能够使用下拉菜单浏览页面并最终下载PDF,但仍然收到以下错误:

元素命令失败,因为所引用的元素不再附加到DOM。

下面是我的代码:

library(RSelenium)
library(stringr)

# Walk the three cascading drop-downs (state -> district -> block) on
# secc.gov.in in a Selenium-driven browser and click every PDF link in the
# resulting table.

# Start a Selenium driver and keep its client object for browser commands.
rd<-rsDriver()
remDr<-rd[["client"]]

remDr$navigate("http://secc.gov.in/lgdStateList")

# First drop-down: state.

# Read the drop-down's visible text, split it on newlines into one entry per
# line, left-trim each entry, then drop the first entry (presumably the
# placeholder option -- confirm against the page).
stateEle<-remDr$findElement("id", "lgdState")
states<-stateEle$getElementText()[[1]] %>% strsplit(., '\\n') %>% unlist %>% str_trim('left')
states<-states[-1]
# NOTE(review): `stateEle` is located once, before the loop, and the same
# handle is reused on every iteration. If the page re-renders the drop-down
# after a selection, this handle would no longer point at a live element --
# a likely source of the "no longer attached to the DOM" error; confirm.
# The same applies to `distEle` and `blockEle` inside their own loops.
# NOTE(review): 1:length(x) iterates over c(1, 0) when x is empty.
for (i in 1:length(states)) {
  # Select the state by clicking the drop-down, typing its label, and
  # clicking again.
  stateEle$clickElement()
  stateEle$sendKeysToElement(list(states[i]))
  stateEle$clickElement()
  # Second drop-down: district (located after the state was chosen).

  distEle<-remDr$findElement("id", "lgdDistrict")
  districts<-distEle$getElementText()[[1]] %>% strsplit(., '\\n') %>% unlist%>%str_trim('left')
  districts<-districts[-1]
  for (j in 1:length(districts)) {
    distEle$clickElement()
    distEle$sendKeysToElement(list(districts[[j]]))
    distEle$clickElement()
    # Third drop-down: block (located after the district was chosen).

    blockEle<-remDr$findElement("id", "lgdBlock")
    block<-blockEle$getElementText()[[1]] %>% strsplit(., '\\n') %>% unlist%>%str_trim('left')
    block<-block[-1]
    for (k in 1:length(block)) {
      blockEle$clickElement()
      blockEle$sendKeysToElement(list(block[[k]]))
      blockEle$clickElement()

      # Every element with class "statesrow" is treated as one table row.
      gpEle<-remDr$findElements('class', 'statesrow')
      for (m in 1:length(gpEle)) {
        # Split the row's innerHTML on '<td>' and drop the leading piece, so
        # length(h) is the number of '<td>' occurrences in that row.
        h<-unlist(gpEle[[m]]$getElementAttribute('innerHTML'))
        h<-unlist(h%>% strsplit(., '<td>'))
        h<-h[-1]
        for (n in 1:length(h)) {
          # paste() joins with its default single-space separator, so the
          # generated XPath has spaces around the indices ("tr[ 1 ]/td[ 2 ]/a").
          xpath1<-paste('//*[@id="example"]/tbody/tr[',m,']/td[',n,']/a')
          pdfEle<-remDr$findElement('xpath', xpath1)
          pdfEle$clickElement()
          # Pause 5 seconds after each click before moving to the next link.
          Sys.sleep(5)
        }
      }
    }
  }
}

1 个答案:

答案 0 :(得分:1)

根据您的要求

library(rvest)
library(stringr)

# Download every Gram Panchayat PDF from secc.gov.in by replaying the form
# POSTs behind the state -> district -> block drop-downs, without a browser.
#
# NOTE(review): html_session() and the unexported rvest:::request_POST() are
# what this script was written against; newer rvest releases may have renamed
# or removed them -- confirm against the installed version.

url <- "http://secc.gov.in/lgdStateList"

# Text compared against the page's `.error` node(s) to detect a missing PDF.
no_pdf_msg <- "PDF File for this Gram Panchayat is not available."

# Return the `value` attribute of every node matched by `css` in `doc`,
# without the first one (the script has always skipped the first <option>).
# Yields character(0) when there are no further options.
option_values <- function(doc, css) {
  values <- html_nodes(doc, css = css) %>% html_attr("value")
  values[-1]
}

# Extract the GP code from each `onclick` attribute: take the first
# parenthesised argument list, split it on commas, and keep the digits of the
# fourth argument. Vectorised; returns NA where no code can be found and
# character(0) for empty input.
extract_gp_code <- function(onclick) {
  args <- str_extract(onclick, "\\([^()]+\\)")
  args <- substring(args, 2, nchar(args) - 1)
  fourth_arg <- vapply(
    strsplit(args, ","),
    function(parts) parts[4],
    character(1)
  )
  str_extract(fourth_arg, "[[:digit:]]+")
}

page <- html_session(url)

## STATE LOOP ##
# Iterating over the values directly (instead of 1:length(x)) means an empty
# drop-down is skipped rather than visited with indices 1 and 0.
for (state_code in option_values(page, "#lgdState > option")) {
  page1 <- rvest:::request_POST(
    page,
    url = "http://secc.gov.in/lgdDistrictList",
    body = list(stateCode = state_code),
    encode = "form"
  )

  ## DISTRICT LOOP ##
  for (district_code in option_values(page1, "#lgdDistrict > option")) {
    page2 <- rvest:::request_POST(
      page1,
      url = "http://secc.gov.in/lgdBlockList",
      body = list(
        stateCode = state_code,
        districtCode = district_code
      ),
      encode = "form"
    )

    ## BLOCK LOOP ##
    for (block_code in option_values(page2, "#lgdBlock > option")) {
      page3 <- rvest:::request_POST(
        page2,
        url = "http://secc.gov.in/lgdGpList",
        body = list(
          stateCode = state_code,
          districtCode = district_code,
          blockCode = block_code
        ),
        encode = "form"
      )

      onclick <- html_nodes(page3, css = "#example a") %>% html_attr("onclick")
      gp_codes <- extract_gp_code(onclick)
      # Links without a parsable code cannot be requested; skip them.
      gp_codes <- gp_codes[!is.na(gp_codes)]

      ## GP CODE LOOP to download file ##
      for (gp_code in gp_codes) {
        page4 <- rvest:::request_POST(
          page3,
          url = "http://secc.gov.in/downloadLgdwisePdfFile",
          body = list(
            stateCode = state_code,
            districtCode = district_code,
            blockCode = block_code,
            gpCode = gp_code
          ),
          encode = "form"
        )

        # Parsing the response as HTML may fail (presumably when the body is
        # the PDF itself); treat that as "no error message shown".
        error_displayed <- tryCatch(
          html_nodes(page4, css = ".error") %>% html_text(),
          error = function(e) character(0)
        )
        # %in% always gives a single TRUE/FALSE, whether zero, one or several
        # `.error` nodes were found.
        if (no_pdf_msg %in% error_displayed) {
          next
        }

        disposition <- page4$response$headers$`content-disposition`
        if (is.null(disposition)) {
          message(
            "No content-disposition header for gpCode ", gp_code,
            "; nothing written."
          )
          next
        }

        filename <- gsub("attachment;filename=", "", disposition)
        filename <- str_replace_all(filename, '"', "")
        # The name comes from the server; keep only its last path component
        # so the file always lands in the working directory.
        filename <- basename(filename)
        writeBin(page4$response$content, filename)
      }
    }
  }
}

而且这种方法完全不需要RSelenium。 :)