使用R发送多个搜索请求来抓取表格

时间:2017-04-20 19:53:03

标签: r web-scraping

我尝试使用名字和姓氏在网站(https://npiregistry.cms.hhs.gov/registry/)上进行多次搜索,然后把返回的结果整理成一个数据框。

我发现这与《How to automate multiple requests to a web search form using R》中描述的问题相似,但由于某些原因,我一直收到错误:"Error: failed to load external entity"(无法加载外部实体)。

以下是我用来提取记录的代码

# Search terms: one first-name/last-name pair per position.
fn <- c("HARVEY", "HARVEY")
ln <- c("BIDWELL", "ADELSON")

# Lookup table with one row per person and the columns `fn` and `ln`.
mydf <- data.frame(fn = fn, ln = ln)


# Fetch the search-results table(s) for the name(s) held in `df`.
#
# `df` must expose `fn` (first name) and `ln` (last name) through `$`; the
# script hands over one row of `mydf` at a time via plyr::adply().
# The value of readHTMLTable() is returned invisibly.
#
# NOTE(review): the URL is kept on http:// exactly as in the question; the
# first answer below states that the request has to use https:// instead.
get_data <- function(df) {
  library(XML)

  base_url <- "http://npiregistry.cms.hhs.gov/"

  # Assemble the query URL, then percent-encode unsafe characters in it.
  query_url <- URLencode(paste0(
    base_url, "registry/search-results-table?",
    "first_name=", df$fn,
    "&last_name=", df$ln
  ))

  # Read the HTML table(s) served at that URL.
  invisible(readHTMLTable(query_url))
}


library(plyr)
# Call get_data() once per row of `mydf` and let adply() combine the results.
mydata <- adply(.data = mydf, .margins = 1, .fun = get_data)

感谢您的帮助

2 个答案:

答案 0 :(得分:2)

请求需要使用 https: 而不是 http:。我还去掉了 plyr 库,改为只使用基础 R:

library(rvest)
# Search terms: one first-name/last-name pair per position.
fn <- c("HARVEY", "HARVEY")
ln <- c("BIDWELL", "ADELSON")
# Lookup table with one row per person and the columns `fn` and `ln`.
mydf <- data.frame(fn = fn, ln = ln)

# Look up one person in the NPI registry and return the first results table.
#
# Args:
#   df: the name to search for, with the first name at position 1 and the
#       last name at position 2 (e.g. one row of `mydf` as handed over by
#       apply()).
#
# Returns:
#   A data.frame built from the first <table> node of the results page, or
#   an empty data.frame when the page contains no <table> node.
get_data <- function(df) {
  root <- "https://npiregistry.cms.hhs.gov/"

  # Encode each value on its own with reserved = TRUE so that characters
  # such as "&", "=", "#" or "+" inside a name cannot alter the query string.
  first_name <- URLencode(as.character(df[[1]]), reserved = TRUE)
  last_name <- URLencode(as.character(df[[2]]), reserved = TRUE)

  url <- paste0(
    root, "registry/search-results-table?",
    "first_name=", first_name,
    "&last_name=", last_name
  )

  # Download and parse the page, then collect its <table> nodes.
  page <- read_html(url)
  tables <- html_nodes(page, "table")
  if (length(tables) == 0L) {
    return(data.frame())
  }

  # html_table() on a one-element node set yields a list holding one table;
  # as.data.frame() converts that list into a plain data frame.
  as.data.frame(html_table(tables[1]))
}

# Run one lookup per row of `mydf`; the result is a list of data frames.
mydata <- apply(X = mydf, MARGIN = 1, FUN = get_data)
# Stack the per-row data frames into a single data frame.
finalanswer <- do.call(what = rbind, args = mydata)
# NOTE: `finalanswer` still needs some clean-up.

答案 1 :(得分:2)

该网站提供了一个无需身份验证的 API……为什么不直接使用它呢?

library(httr)
library(jsonlite)
library(tidyverse)

# Query the NPI registry API for one first-name/last-name pair.
#
# Sends a GET request with `first_name` and `last_name` as query parameters,
# raises an error on an HTTP error status, parses the JSON body and returns
# its `results` element as a tibble.
npi_query <- function(f_name, l_name) {
  response <- GET(
    "https://npiregistry.cms.hhs.gov/api/",
    query = list(first_name = f_name, last_name = l_name)
  )
  stop_for_status(response)

  # Read the body as UTF-8 text before handing it to the JSON parser.
  body_text <- content(response, as = "text", encoding = "UTF-8")
  parsed <- fromJSON(body_text, flatten = TRUE)

  as_tibble(parsed$results)
}

# Lookup table of the names to query: `fn` = first name, `ln` = last name.
# tibble() is used instead of the deprecated data_frame(), with the
# conventional left-hand assignment instead of `->`.
lkp <- tibble(
  fn = c("HARVEY", "HARVEY"),
  ln = c("BIDWELL", "ADELSON")
)

# Run one API query per (fn, ln) pair, row-bind the results, and print a
# compact column-by-column overview of the combined table.
glimpse(map2_df(.x = lkp$fn, .y = lkp$ln, .f = npi_query))
## Observations: 2
## Variables: 19
## $ taxonomies             <list> [<MA, 207R00000X, TRUE, 36065, Interna...
## $ addresses              <list> [<c("DORCHESTER", "DORCHESTER"), c("23...
## $ created_epoch          <int> 1152230400, 1168992000
## $ identifiers            <list> [[], []]
## $ other_names            <list> [[], []]
## $ number                 <int> 1336171859, 1205988342
## $ last_updated_epoch     <int> 1183852800, 1183852800
## $ enumeration_type       <chr> "NPI-1", "NPI-1"
## $ basic.status           <chr> "A", "A"
## $ basic.credential       <chr> "M.D.", "DMD"
## $ basic.first_name       <chr> "HARVEY", "HARVEY"
## $ basic.last_name        <chr> "BIDWELL", "ADELSON"
## $ basic.middle_name      <chr> "W", "JEROME"
## $ basic.name             <chr> "BIDWELL HARVEY", "ADELSON HARVEY"
## $ basic.gender           <chr> "M", "M"
## $ basic.sole_proprietor  <chr> "NO", "NO"
## $ basic.last_updated     <chr> "2007-07-08", "2007-07-08"
## $ basic.enumeration_date <chr> "2006-07-07", "2007-01-17"
## $ basic.name_prefix      <chr> NA, "DR."
相关问题