Вероятно, это не самое лучшее, не самое короткое и не самое быстрое решение, но оно даёт нужный результат, и его легко расширить, когда данные станут сложнее.
Пример данных:
# Example free-text findings: every entry mentions "Barrett's" together with
# one or more numeric measurements.
vec <- c(
  "There is a 10cm length of Barrett's",
  "The length of Barrett's is around 5 cm",
  "The Barrett's measures 10cm in length above a 4cm hiatus hernia",
  "The length of Barrett's is 5cm but the length of the dysplasia is 3cm"
)
Код:
library(tidyverse)

# Goal: for each text in `vec`, return the number that sits closest to the
# keyword stored in `pattern_`.

# Step 1: locate every run of digits in each text.
# A single gregexpr() pass supplies the matched text, its start position and
# its width, so the three columns always have the same length. A text without
# any digits yields a zero-row data frame instead of raising an error.
number_pattern <- "[0-9]+"
l <- lapply(vec, function(x) {
  hits <- gregexpr(number_pattern, x)
  start <- as.integer(hits[[1]])
  width <- attr(hits[[1]], "match.length")
  found <- start > 0L # gregexpr() reports "no match" as -1
  data.frame(
    value = as.numeric(regmatches(x, hits)[[1]]),
    position = as.numeric(start[found]),
    width = as.numeric(width[found])
  )
})

# One row per number found; `id` is the index of the source text in `vec`.
# Requires the data.table package to be installed (it is not attached).
matches <- as.data.frame(data.table::rbindlist(l, idcol = "id"))

df <- data.frame(text = vec, stringsAsFactors = FALSE)
pattern_ <- "Barrett's"

# Step 2: measure the gap between the keyword and every number, then keep the
# nearest number for each text.
df %>%
  mutate(
    id = row_number(),
    # The keyword is matched literally (fixed = TRUE), which is what makes
    # nchar(pattern_) a valid match length. as.integer() strips the regexpr()
    # attributes; a missing keyword (-1) becomes NA.
    start_barrett = na_if(
      as.integer(regexpr(pattern_, text, fixed = TRUE)),
      -1L
    ),
    # Index of the first character after the keyword.
    end_barrett = start_barrett + nchar(pattern_)
  ) %>%
  left_join(matches, by = "id") %>%
  mutate(
    # Gap measured symmetrically: from the end of the keyword to the start of
    # a following number, or from the end of a preceding number to the start
    # of the keyword.
    distance = ifelse(
      position > start_barrett,
      position - end_barrett,
      start_barrett - (position + width)
    ),
    # No keyword or no number in the text: report NA rather than an
    # arbitrary number.
    value = if_else(is.na(distance), NA_real_, value)
  ) %>%
  group_by(id) %>%
  arrange(distance) %>%
  slice(1L) %>%
  ungroup() %>%
  select(text, value)
Вывод:
# # A tibble: 4 x 2
# text value
# <chr> <dbl>
# 1 There is a 10cm length of Barrett's 10
# 2 The length of Barrett's is around 5 cm 5
# 3 The Barrett's measures 10cm in length above a 4cm hiatus hernia 10
# 4 The length of Barrett's is 5cm but the length of the dysplasia is 3cm 5