Разделение данных из одного столбца на разные столбцы в R - PullRequest
2 голоса
/ 29 апреля 2020

"Я выполнил веб-скрейпинг в R, но столкнулся с проблемой при разделении данных на отдельные столбцы. Мне не удаётся написать код для столбцов 8:10 (последние строки кода). Код приведён ниже:"

library(xml2)
library(rvest)
library(stringr)
library(tidyr)

# Download and parse the TripAdvisor hotel review page once; every extraction
# below queries this parsed document.
reddit_wbpg <- read_html(
  "https://www.tripadvisor.in/Hotel_Review-g304551-d3583700-Reviews-or10-Lemon_Tree_Premier_Delhi_Airport-New_Delhi_National_Capital_Territory_of_Delhi.html"
)

# Text of the first <title> element of the page.
title <- html_text(html_node(reddit_wbpg, "title"))

# Text of every <q> node carrying the review-text class.
reviews <- html_text(
  html_nodes(
    reddit_wbpg,
    "q.location-review-review-list-parts-ExpandableReview__reviewText--gOmRC"
  )
)

# Event line of each review, e.g. "<user> wrote a review <date>".
user_data1 <- html_text(
  html_nodes(
    reddit_wbpg,
    "div.social-member-event-MemberEventOnObjectBlock__event_type--3njyv"
  )
)

# Reviewer stats line: optional location followed by contribution and
# helpful-vote counts, concatenated without separators.
user_data2 <- html_text(
  html_nodes(
    reddit_wbpg,
    "div.social-member-MemberHeaderStats__event_info--30wFs"
  )
)

# Text of every node carrying the review-title class.
review_title <- html_text(
  html_nodes(
    reddit_wbpg,
    "div.location-review-review-list-parts-ReviewTitle__reviewTitle--2GO9Z"
  )
)


# Assemble the scraped vectors into one data frame. `title` comes from a
# single node, so data.frame() recycles it across all rows.
scraping_data <- data.frame(
  page_title = title,
  review_title = review_title,
  reviews = reviews,
  user_data1 = user_data1,
  user_data2 = user_data2
)

# Append five placeholder columns (positions 6-10), each filled with "a";
# the parsing step that follows overwrites them by position.
scraping_data <- data.frame(
  scraping_data,
  user_name = "a",
  date = "a",
  location = "a",
  contribution = "a",
  helpful_votes = "a"
)


# Split "<user> wrote a review <date>" at the fixed phrase: the left part goes
# to column 6 (user_name), the right part to column 7 (date).
name_and_date <- str_split_fixed(scraping_data$user_data1, " wrote a review", 2)
scraping_data[[6]] <- name_and_date[, 1]
scraping_data[[7]] <- name_and_date[, 2]

# Columns 8-10 (location, contribution, helpful_votes) are all parsed from the
# reviewer stats text.
# NOTE(review): the location pattern needs at least one character before the
# first count, and the count patterns only match the plural labels, so rows
# without a location or with "1 contribution" / "1 helpful vote" are not
# parsed as intended. This is the problem the question describes; the answer
# further down gives revised patterns.
stats_text <- scraping_data$user_data2
scraping_data[[8]] <- str_extract(stats_text, "^.+?(?=[0-9]+ [hc])")
scraping_data[[9]] <- str_extract(stats_text, "[0-9]+(?= contributions)")
scraping_data[[10]] <- str_extract(stats_text, "[0-9]+(?= helpful votes)")

Вывод можно увидеть на прилагаемом изображении:

Error in Row 1

1 Ответ

1 голос
/ 29 апреля 2020

Вот один из подходов — использовать str_extract с позитивной и негативной опережающей проверкой (lookahead):

# A count is a run of digits, optionally followed by comma-separated groups of
# three digits, so that "1,234 contributions" is read as one number instead of
# being split at the comma.
# NOTE(review): assumes large counts are rendered with comma grouping --
# confirm against the live page.
count_re <- "[0-9]+(?:,[0-9]{3})*"

# Location (column 8): everything before the first "<count> con..." or
# "<count> hel..." label. The negative lookahead at the start yields NA when
# the text opens directly with a count (no location present); the `|$`
# alternative covers text that consists of a location only.
scraping_data[,8] <- str_extract(
  scraping_data$user_data2,
  paste0("^(?!", count_re, " (con|hel)).+?(?=", count_re, " (con|hel)|$)")
)

# Contribution count (column 9) and helpful-vote count (column 10). Matching
# the singular stem also covers "1 contribution" / "1 helpful vote". Grouping
# commas are removed from the extracted count; rows without the label stay NA.
scraping_data[,9] <- str_replace_all(
  str_extract(scraping_data$user_data2, paste0(count_re, "(?= contribution)")),
  ",",
  ""
)
scraping_data[,10] <- str_replace_all(
  str_extract(scraping_data$user_data2, paste0(count_re, "(?= helpful vote)")),
  ",",
  ""
)
scraping_data
#                               user_data1                                     user_data2 user_name date         location contribution helpful_votes
#1 mohd saqibsaqib wrote a review Mar 2020                 2 contributions2 helpful votes         a    a             <NA>            2             2
#2        hitesh k wrote a review Mar 2020                  4 contributions1 helpful vote         a    a             <NA>            4             1
#3          Basant wrote a review Mar 2020                                2 contributions         a    a             <NA>            2          <NA>
#4          RagP65 wrote a review Mar 2020 New Delhi, India9 contributions4 helpful votes         a    a New Delhi, India            9             4
#5          Mbosma wrote a review Mar 2020                                2 contributions         a    a             <NA>            2          <NA>
...