Я пытаюсь объединить список из 10 таблиц в один фрейм данных. Данные — отзывы на разные книги, извлечённые с Amazon; сейчас они хранятся в переменной "output_list" в виде 10 отдельных таблиц, по 10 строк и 4 столбца в каждой. Буду благодарен за любую помощь.
library(tidyverse)
library(rvest)
#' Scrape one page of Amazon UK product reviews
#'
#' Downloads the review page for `ASIN` at `page_num` and extracts the title,
#' body text and star-rating text of every review node found on that page.
#'
#' @param ASIN Product identifier inserted into the review-page URL.
#' @param page_num Review page number to request.
#' @return A tibble with columns `review_title`, `review_text`, `review_star`
#'   and `page` (the requested `page_num`).
scrape_amazon <- function(ASIN, page_num) {
  url_reviews <- paste0(
    "https://www.amazon.co.uk/product-reviews/", ASIN,
    "/?pageNumber=", page_num
  )
  doc <- read_html(url_reviews)

  # The selector is assembled from two pieces only to keep the source lines
  # short. It must be a single line of text: a line break inside the quoted
  # value would become part of the class value being matched.
  title_selector <- paste0(
    "[class='a-size-base a-link-normal review-title a-color-base ",
    "review-title-content a-text-bold']"
  )

  # Review title
  review_title <- doc %>%
    html_nodes(title_selector) %>%
    html_text()

  # Review text
  review_text <- doc %>%
    html_nodes("[class='a-size-base review-text review-text-content']") %>%
    html_text()

  # Star-rating text of each review
  review_star <- doc %>%
    html_nodes("[data-hook='review-star-rating']") %>%
    html_text()

  # One row per review; `page` records which page the rows came from
  tibble(
    review_title,
    review_text,
    review_star,
    page = page_num
  )
}
ASIN <- "0007477155" # Specify ASIN
page_range <- 1:10 # Let's say we want to scrape pages 1 to 10

# Pair every entry of `page_range` with a randomly chosen page so that pages
# are requested in shuffled order. Indexing with `sample.int()` instead of
# calling `sample(page_range, ...)` avoids `sample()` treating a single number
# as `1:n` when `page_range` has length 1.
match_key <- tibble(
  n = page_range,
  key = page_range[sample.int(length(page_range))]
)

# Scrape each page, pausing between requests; the result is a list holding one
# tibble per page.
output_list <- lapply(page_range, function(i) {
  j <- match_key[match_key$n == i, ]$key
  # Progress message
  message("Getting page ", i, " of ", length(page_range), "; Actual: page ", j)
  Sys.sleep(3) # Take a three second break
  if ((i %% 3) == 0) { # On every third page take an extra two second break
    message("Taking a break...")
    Sys.sleep(2)
  }
  scrape_amazon(ASIN = ASIN, page_num = j)
})

class(output_list)
length(output_list)
output_list

# Stack the per-page tibbles row-wise into a single data frame
reviews_df <- bind_rows(output_list)
reviews_df
Большое спасибо