образец данных
Я исправил в исходных данных одно написание метки «Borderline high», которое отличалось от остальных: предполагаю, что это была опечатка.
# Sample input: one free-text description of reference ranges per element.
# Each string lists category labels followed by their value ranges.
v <- c(
  "Optimal <2.6 Desirable 2.6 - 3.3 Borderline high 3.4 - 4.0 High 4.1 - 4.8 Very high >=4.9",
  "Desirable <5.2 Borderline high 5.2 - 6.1 High >= 6.2",
  "Desirable <1.7 Borderline high 1.7 - 2.2 High 2.3 - 4.4 Very high >=4.5"
)
код
library(dplyr)
library(stringr)
# Extract the value range listed for the "High" category of each string in `v`.
# Evaluates to a data frame with a single character column `High`.
data.frame(text = v, stringsAsFactors = FALSE) %>%
  dplyr::mutate(
    High = trimws(
      # coalesce() keeps, per row, the first candidate that is not NA. Unlike
      # ifelse(), it stays a character vector even when the input has no rows.
      dplyr::coalesce(
        # Preferred: the text between "High" and a following "Very".
        # Matching is case-sensitive, so the lowercase "high" in
        # "Borderline high" is not picked up.
        stringr::str_extract(text, "(?<=High).*(?=Very)"),
        # Fallback when no "Very" follows: everything after "High" up to the
        # end of the string.
        stringr::str_extract(text, "(?<=High).*(?=$)")
      )
    )
  ) %>%
  dplyr::select(High)
выход
# High
# 1 4.1 - 4.8
# 2 >= 6.2
# 3 2.3 - 4.4
обновление
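Брать значения после `High` только в том случае, если перед `High` (через пробел) стоит не буква, то есть символ, не входящий в `[a-zA-Z]`.
Так вариант «Borderline High» с заглавной буквы не будет ошибочно принят за категорию «High».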
# Extract the value range listed for the stand-alone "High" category of each
# string in `v`. A "High" only counts when it is preceded by a non-letter
# character and a space, so a capitalised "Borderline High" is skipped.
# Evaluates to a data frame with a single character column `High`.
# NOTE(review): the look-behind needs two characters before "High", so a
# "High" at the very start of a string would not match.
data.frame(text = v, stringsAsFactors = FALSE) %>%
  dplyr::mutate(
    High = trimws(
      # coalesce() keeps, per row, the first candidate that is not NA. Unlike
      # ifelse(), it stays a character vector even when the input has no rows.
      dplyr::coalesce(
        # Preferred: the text between the qualifying "High" and a following
        # "Very".
        stringr::str_extract(text, "(?<=[^a-zA-Z] High).*(?=Very)"),
        # Fallback when no "Very" follows: everything after the qualifying
        # "High" up to the end of the string.
        stringr::str_extract(text, "(?<=[^a-zA-Z] High).*(?=$)")
      )
    )
  ) %>%
  dplyr::select(High)
High
1 4.1 - 4.8
2 >= 6.2
3 2.3 - 4.4