Scraping web content across multiple pages with the rvest package

Date: 2016-03-17 20:50:48

Tags: r web-scraping rvest

I'm a novice R programmer, but I've been trying to use the rvest package to do some web scraping of an online university's website. The first table of information I scraped from the site was the list of all doctoral-level programs. Here is my code:

library(xml2)
library(httr)
library(rvest)
library(selectr)

Scraping the Capella doctoral programs:

fileUrl <- read_html("http://www.capella.edu/online-phd-programs/")

Using the SelectorGadget tool in Chrome, I was able to select the content on the site that I wanted to extract. In this case, I am selecting all of the doctoral-level programs.

Degrees <- fileUrl %>%
html_nodes(".accordianparsys a") %>%
html_text() 
Degrees

Next, I created a data frame of the doctoral-level degrees.

Capella_Doctoral = data.frame(Degrees)       

Below, I create another column that flags these programs as coming from Capella.

Capella_Doctoral$SchoolFlag <- "Capella" 
View(Capella_Doctoral)

Everything in the code above seems to work fine. However, the next category of information I want to extract is the tuition cost and credit hours for each doctoral program. This information lives on each individual program's page. For example, the PhD in Leadership program has its tuition and credit hour information on the page "http://www.capella.edu/online-degrees/phd-leadership/", and the DBA in Accounting program has its tuition and credit hour information on the page "http://www.capella.edu/online-degrees/dba-accounting/". The common theme across these pages is that each URL contains "online-degrees/" followed by the program name.

In order to create the list of web pages I need (the pages that include the doctoral program names), I developed the following code.

Format the doctoral degree names as lowercase, remove any leading and trailing whitespace, and then replace any spaces with dashes:

Lowercase <- tolower(Capella_Doctoral$Degrees) 
Lowercase

Remove leading and trailing whitespace:

trim <- function (x) gsub("^\\s+|\\s+$", "", x)
Trim <- trim(Lowercase)
Trim

Replace spaces with dashes:

Dashes <- gsub(" ", "-", Trim)
Dashes
Dashes2 <- gsub("---", "-", Dashes)
Dashes2
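
As an aside, the three cleanup steps above can be collapsed into one small helper. This is only a sketch; slugify is a hypothetical name, and it assumes the same Degrees column created earlier:

slugify <- function(x) {
  # lowercase, trim whitespace, then collapse spaces (and any resulting runs of dashes) to a single dash
  x <- tolower(trimws(x))
  gsub("-{2,}", "-", gsub("\\s+", "-", x))
}
Dashes2 <- slugify(Capella_Doctoral$Degrees)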

Next, I append the reformatted doctoral degree names to the end of the following URL to get a list of all the possible URLs I will need in order to scrape the tuition and credit hour information from each program's page.

library(data.table)   # rbindlist() below comes from data.table

urls <- rbindlist(sapply(Dashes2, function(x) {
    url <- paste("http://www.capella.edu/online-degrees/", x, "/", sep = "")
    data.frame(url)
}), fill = TRUE)
Spec_URLs <- data.frame(urls)
View(Spec_URLs)
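
As an aside, paste0() is vectorized, so the same URL table can be built without the sapply()/rbindlist() step. A sketch that should produce an equivalent Spec_URLs:

Spec_URLs <- data.frame(url = paste0("http://www.capella.edu/online-degrees/", Dashes2, "/"),
                        stringsAsFactors = FALSE)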

Now that I have a list of all the URLs I need to pull information from, I need to know how to apply the following function to each URL. The code below only pulls the tuition and credit hour information for one of the URLs. How do I get it to loop through all of the URLs? My end goal is to get a table of all the tuition and credit hour information for every doctoral program into one data frame.

fileUrl <- read_html("http://www.capella.edu/online-degrees/phd-leadership/")

Tuition <- fileUrl %>%
   html_nodes("p:nth-child(4) strong , .tooltip~ strong") %>%
   html_text() 
Tuition

Result:

Tuition
[1] "120 Credits"        "$4,665 per quarter"
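
For reference, one minimal way to loop over those URLs and stack the results into a single data frame might look like the sketch below. It assumes Spec_URLs has a character column named url, that every constructed URL actually resolves, and that the same CSS selector works on every program page; in practice you would likely wrap the request in tryCatch() to skip pages that fail.

# Sketch: visit each URL, pull the tuition/credit text, and bind the rows together
tuition_list <- lapply(Spec_URLs$url, function(u) {
  page <- read_html(u)
  info <- page %>%
    html_nodes("p:nth-child(4) strong , .tooltip~ strong") %>%
    html_text()
  data.frame(url = u, info = paste(info, collapse = " | "), stringsAsFactors = FALSE)
})
Tuition_All <- do.call(rbind, tuition_list)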

1 Answer:

Answer 0 (score: 2):

This is quick and dirty... I hope it doesn't create more questions than answers. Essentially, this function grabs all of the individual URLs linked to the departments, then performs the same series of steps on each one and returns an aggregated data object. In our case, that's a data frame with 82 rows. If you want to clean it up, you could reformat the columns and tidy up some of the NAs. Hopefully it works for you.

library(rvest)
library(stringi)
library(htmltools)
library(plyr)
library(dplyr)
library(DT)
library(jsonlite)   # for rbind.pages()


# This is a helper function I threw on top..
txt.safe_text <- function(x){
  str_in <- iconv(x, "latin1", "ASCII", sub="")  %>%  stri_enc_toutf8()
  str_in %>%
    stri_replace_all_fixed('<U+0080><U+0093>',"'\\-'") %>%
    stri_enc_toascii %>% htmlEscape %>%
    stri_unescape_unicode %>%
    stri_replace_all_regex("\\032\\032\\032","-")%>%
    stri_replace_all_regex("\n","")
}




# Here's the iterator. I gave it zero args for purposes of the concept, but you
# could add variable URLs or filtering functions.

parse.apella <- function(){


  # read_html() replaced the now-deprecated html(), but I'm on an older version of
  # rvest, so alias the new name here for reproducibility.
  read_html <- html


  # This is our index table. We are going to use this as a key to then qry all
  # other site info but keep a backref to the school variable and url
  idx_df <-
    lapply(read_html("http://www.capella.edu/online-phd-programs/") %>%
             html_nodes(".accordianparsys a"),function(i)
               data.frame(focus = html_text(i),
                          link = paste0("http://www.capella.edu", html_attr(i,"href"))
                          )) %>% rbind.pages

  # Threw this in for use case later with rendering a datatable and then being able to
  # jump straight to the site you are referencing.

  idx_df$html_output <- sapply(1:nrow(idx_df),function(i)
    htmltools::HTML(paste0(sprintf('<a href="%s">%s</a>',idx_df[i,2],idx_df[i,1]))))


  # Ok...so... for every index in our idx_df table above we are going to:
  # read site > parse the p html tags > pass a text cleaning function >
  # replace the leftovers eg: '\t' > split the string on the newline '\n'
  # character for easier use in building a data frame later > filter out all
  # returned data that has a character length of less than or equal to 2 >
  # create a data frame with a filtering column in our loop.

  # Note: this is going to get the data for I think 84 websites..so give it a second
  # to run.

  A <- llply(1:nrow(idx_df),function(ii)
    lapply(read_html(idx_df[[2]][[ii]]) %>%
             html_nodes(".gernic_large_text > p") %>%
             html_text %>% txt.safe_text %>%
             stri_replace_all_regex("\t","\n") %>%
             strsplit("\n"),function(i)
               stri_split_regex(i,"  ") %>% unlist %>%
             data.frame(raw_txt = .) %>% filter(nchar(raw_txt)>2) %>%
             mutate(df_idx = 1:length(raw_txt),
                    school_name = idx_df[[1]][[ii]],
                    html_link = idx_df[[3]][[ii]])
    )
  )


  # Above we built a list of data frames...and the rule we know is that any information
  # we are interested in would produce at least two rows of data, since we split
  # our raw html on the newline character. This means any data frame in our list
  # with 1 row is unimportant, but it was easier to filter out here than to parse out earlier.
  # So we remove all those data frames with only 1 row.
  CC <- lapply(1:length(A),function(i)A[[i]][mapply(nrow,A[[i]]) == 2] %>% rbind.pages)


  # Helper function for looping through. I shouldn't have used numbers for the column names,
  # but I'm just slapping this together.
  # This is going to essentially go through our data frames and transpose the structure
  # so that our final product is a wide data structure rather than a long.

  trans_df <- function(df_in = NULL,i){
    tmp_d <-
      as.data.frame(
        t(c(df_in[[i]][df_in[[i]][[2]] == 2,4][[1]],
            df_in[[i]][df_in[[i]][[2]] == 2,3][[1]],
            df_in[[i]][df_in[[i]][[2]] == 2,1]))
      )

    colnames(tmp_d) <-  c('html_link','school name',df_in[[i]][df_in[[i]][[2]] == 1,1])
    tmp_d
  }


  #  For every index in our list we're going to transpose our structures
  # And do some text cleaning and splitting
  all_dat <- ldply(1:length(CC),function(i)trans_df(df_in = CC,i)) %>%
    mutate(short_name = stri_extract_first_words(`school name`),
           Cost =
             ifelse(!is.na(Cost),
                    stri_extract_first_words(Cost),
                    'Not Listed')
           ) %>% mutate(program =
                   stri_replace_all_regex(
                     `school name`,
                     paste0('(',short_name,'| - )'),"") %>%
                   stri_trim_both) %>%
    mutate(next_session = as.Date(strptime(`Next Start Date`,"%b. %d,%Y"))) %>%
    mutate(Cost = as.numeric(gsub(",","",Cost))) %>% 
  select(html_link,
         short_name,
         program,
         cost = Cost,
         credit_hours = `Transfer Credits`,
         next_session,
         total_credits = `Total Quarter Credits`,
         session_length = `Course Length`)

  # Quick thing I noticed on the credit hours. Loop back over and
  # grab only the numeric values
  all_dat$credit_hours <-
    lapply(all_dat$credit_hours,function(i)
      stri_extract_all_regex(i,"[[:digit:]]") %>%
        unlist %>% paste0(collapse = "") %>% as.numeric) %>%
    unlist


  # Should be done
  return(all_dat)
}



rock.apella <- parse.apella()

str(rock.apella)
# 'data.frame':  82 obs. of  8 variables:
# $ html_link     : chr  "<a href=\"http://www.capella.edu/online-degrees/phd-leadership\">PHD - Leadership </a>"| __truncated__ ...
# $ short_name    : chr  "PHD" "PHD" "PHD" "PHD" ...
# $ program       : chr  "Leadership" "Information Technology Education" "General Information Technology" "Information Assurance and Security" ...
# $ cost          : num  4665 4665 4665 4665 4665 ...
# $ credit_hours  : num  32 32 48 32 32 32 32 32 48 32 ...
# $ next_session  : Date, format: "2016-04-11" "2016-04-11" "2016-04-11" "2016-04-11" ...
# $ total_credits : chr  "120 Credits" "120 Credits" "120 Credits" "120 Credits" ...
# $ session_length: chr  "10 weeks" "10 weeks" "10 weeks" "10 weeks" ...

DT::datatable(rock.apella,escape = F, options = list(searchHighlight = TRUE), filter = 'top')

This is our final output (screenshot of the rendered datatable).

Output in jsfiddle: https://jsfiddle.net/cbfas/0x37vudv/1/