Question

У меня есть набор данных, состоящий из предложений с последовательными повторениями слов :

ДАННЫЕ :

df <- data.frame(
  Turn = c("oh is that that steak i got the other night",       # that that
           "no no no i 'm dave and you 're alan",               # no no no
           "yeah i mean the the film was quite long though",    # the the
           "it had steve martin in it it 's a comedy"))         # it it

ЦЕЛЬ :

Я хотел бы получить еще три столбца, добавленных к этому фрейму данных:

df$rep_Word: столбец, определяющий повторяющееся слово
df$rep_Pos: столбец, определяющий первую позицию в предложении, в котором повторяется слово
df$rep_Numb: столбец, определяющий количество повторений слова

Таким образом, ожидаемый фрейм данных выглядит следующим образом:

ОЖИДАЕМЫЙ РЕЗУЛЬТАТ :

df
                                            Turn rep_Word rep_Pos rep_Numb
1    oh is that that steak i got the other night     that       4        1
2            no no no i 'm dave and you 're alan       no       2        2
3 yeah i mean the the film was quite long though      the       5        1
4       it had steve martin in it it 's a comedy       it       7        1

ИСПЫТАННОЕ РЕШЕНИЕ ТАК ДАЛЕКО :

My есть предположение, что к искомой информации о повторном слове, а также о позиции и количестве повторений можно приблизиться с помощью strsplit и функции duplicated, например, таким образом:

df_split <- apply(df, 2, function(x) strsplit(x, "\\s"))

df_split
$Turn
$Turn[[1]]
 [1] "oh"    "is"    "that"  "that"  "steak" "i"     "got"   "the"   "other" "night"
$Turn[[2]]
 [1] "no"   "no"   "no"   "i"    "'m"   "dave" "and"  "you"  "'re"  "alan"
$Turn[[3]]
 [1] "yeah"   "i"      "mean"   "the"    "the"    "film"   "was"    "quite"  "long"   "though"
$Turn[[4]]
 [1] "it"     "had"    "steve"  "martin" "in"     "it"     "it"     "'s"     "a"      "comedy"

Например, для первого предложения в df, duplicated показывает, какое слово повторяется (а именно то, для которого duplicated оценивается как TRUE) и и номер, и положение повтора также могут быть считаны с этой информации:

duplicated(df_split$Turn[[1]])
 [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Проблема в том, что я не знаю, как задействовать duplicated таким образом, чтобы получить желаемое добавленное столбцы в df. Помощь в этом деле очень ценится.

B. Christian Kamgang · Answer 1 · 29 февраля 2020

Вот еще один способ решить вашу проблему.

df <- data.frame(
  Turn = c("oh is that that steak i got the other night",  # that that
           "no no no i 'm dave and you 're alan",               # no no no
           "yeah i mean the the film was quite long though",    # the the
           "it had steve martin in it it 's a comedy",         # it it)
           "it had steve martin in in it it 's a comedy",
           "yeah i mean the film was quite long though", 
           "hi hi then other words and hi hi again",
           "no no no i 'm dave yes yes and you 're alan no no no no"))  # no no no and no no no no

library(data.table)
cols <- c("rep_Word", "rep_Pos", "rep_Numb")
setDT(df)[, (cols) := {
  words <- strsplit(as.character(Turn), " ")[[1]]
  idx <- rleid(words)
  check <- duplicated(idx)
  chg <- check - shift(check, fill = FALSE)
  starts <- which(chg == 1)
  aend <- if(sum(chg) == 0L) which(chg == -1) else c(which(chg == -1), length(chg) + 1L)
  freq <- aend - starts
  wrd <- words[starts]
  no_dup_default <- .(.(NA_character_), .(NA_integer_), .(NA_integer_))
  if(length(wrd)) .(.(wrd), .(starts), .(freq)) else no_dup_default
}, seq.int(nrow(df))]


df
#                                                       Turn   rep_Word  rep_Pos rep_Numb
# 1:             oh is that that steak i got the other night       that        4        1
# 2:                     no no no i 'm dave and you 're alan         no        2        2
# 3:          yeah i mean the the film was quite long though        the        5        1
# 4:                it had steve martin in it it 's a comedy         it        7        1
# 5:             it had steve martin in in it it 's a comedy      in,it      6,8      1,1
# 6:              yeah i mean the film was quite long though         NA       NA       NA
# 7:                  hi hi then other words and hi hi again      hi,hi      2,8      1,1
# 8: no no no i 'm dave yes yes and you 're alan no no no no  no,yes,no  2, 8,14    2,1,3
#                

# or
df[, lapply(.SD, unlist), seq.int(nrow(df))][, -1]
#                                                        Turn rep_Word rep_Pos rep_Numb
#  1:             oh is that that steak i got the other night     that       4        1
#  2:                     no no no i 'm dave and you 're alan       no       2        2
#  3:          yeah i mean the the film was quite long though      the       5        1
#  4:                it had steve martin in it it 's a comedy       it       7        1
#  5:             it had steve martin in in it it 's a comedy       in       6        1
#  6:             it had steve martin in in it it 's a comedy       it       8        1
#  7:              yeah i mean the film was quite long though     <NA>      NA       NA
#  8:                  hi hi then other words and hi hi again       hi       2        1
#  9:                  hi hi then other words and hi hi again       hi       8        1
# 10: no no no i 'm dave yes yes and you 're alan no no no no       no       2        2
# 11: no no no i 'm dave yes yes and you 're alan no no no no      yes       8        1
# 12: no no no i 'm dave yes yes and you 're alan no no no no       no      14        3

tmfmnk · Answer 2 · 29 февраля 2020

Один purrr, dplyr и tibble параметр может быть:

bind_cols(df, 
          map_dfr(strsplit(df$Turn, " ", fixed = TRUE), 
                  ~ enframe(., value = "rep_word") %>%
                   group_by(rleid = with(rle(rep_word), rep(seq_along(lengths), lengths))) %>%
                   filter(n() > 1) %>%
                   summarise(rep_word = first(rep_word),
                             rep_pos = nth(name, 2),
                             rep_number = n()-1) %>%
                   select(-rleid) %>%
                   summarise_all(toString)))

                                            Turn rep_word rep_pos rep_number
1    oh is that that steak i got the other night     that       4          1
2            no no no i 'm dave and you 're alan       no       2          2
3 yeah i mean the the film was quite long though      the       5          1
4       it had steve martin in it it 's a comedy       it       7          1

Cole · Answer 3 · 29 февраля 2020

Вот супер базовый ответ, который основан на преобразовании слов в фактор. В нем также рассматриваются 1) предложения, в которых нет повторяющихся слов, и 2) предложения, в которых разные слова повторяются несколько раз.

   ID                                                    Turn rep_Word rep_Pos rep_Numb
1   1             oh is that that steak i got the other night     that       4        1
2   2                     no no no i 'm dave and you 're alan       no       2        2
3   3          yeah i mean the the film was quite long though      the       5        1
4   4                it had steve martin in it it 's a comedy       it       7        1
5   5             it had steve martin in in it it 's a comedy       in       6        1
6   5             it had steve martin in in it it 's a comedy       it       8        1
7   6              yeah i mean the film was quite long though     <NA>      NA        0
8   7                  hi hi then other words and hi hi again       hi       2        1
9   7                  hi hi then other words and hi hi again       hi       8        1
10  8 no no no i 'm dave yes yes and you 're alan no no no no       no       2        2
11  8 no no no i 'm dave yes yes and you 're alan no no no no      yes       8        1
12  8 no no no i 'm dave yes yes and you 're alan no no no no       no      14        3

Код, указанный выше:

l = list("oh is that that steak i got the other night",       # that that
            "no no no i 'm dave and you 're alan",               # no no no
            "yeah i mean the the film was quite long though",    # the the
            "it had steve martin in it it 's a comedy",         # it it)
         "it had steve martin in in it it 's a comedy",
         "yeah i mean the film was quite long though", 
         "hi hi then other words and hi hi again",
         "no no no i 'm dave yes yes and you 're alan no no no no")

n = length(l)
ans = vector('list', length = n)

for (i in seq_len(n)){
  sentence = l[[i]]
  words_fct = factor(strsplit(sentence, " ", fixed = TRUE)[[1L]])
  levs = as.integer(words_fct)
  inds = which(diff(levs) == 0L)

  rep_Numb = length(inds)
  if (length(rep_Numb > 1L)) {
    diffs = diff(inds) 
    diffs_eq_1 = diffs == 1L
    if (all(diffs_eq_1)) {
      inds = inds[1L]
    } else {
      inds = inds[c(TRUE, !diffs_eq_1)]
      sums = cumsum(diffs_eq_1)
      rep_Numb = c(sums[!diffs_eq_1], sums[length(sums)]) - c(0L, sums[!diffs_eq_1]) + 1L
    }
  }
  ans[[i]] = data.frame(ID = i,
                        Turn = sentence,
                        rep_Word = levels(words_fct)[levs[inds]],
                        rep_Pos = inds + 1L,
                        rep_Numb)
}

do.call(rbind, ans)

jay.sf · Answer 4 · 29 февраля 2020

duplicated будет считать четыре "это" в строке 4. Поэтому может быть лучше использовать rle.

v.rle <- lapply(strsplit(as.character(df$Turn), " "), rle)
v.rle.l <- mapply(`[`, v.rle, "lengths")
v.rle.v <- mapply(`[`, v.rle, "values")
res <- within(df, {
  rep_Pos <- mapply(function(x) el(which(x > 1)) + 1, v.rle.l)
  rep_Numb <- mapply(`[`, v.rle.l, rep_Pos - 1) - 1
  rep_Word <- mapply(`[`, v.rle.v, rep_Pos - 1)
})
res
#                                             Turn rep_Word rep_Numb rep_Pos
# 1    oh is that that steak i got the other night     that        1       4
# 2            no no no i 'm dave and you 're alan       no        2       2
# 3 yeah i mean the the film was quite long though      the        1       5
# 4       it had steve martin in it it 's a comedy       it        1       7

Редактировать

Чтобы адекватно учесть появление нескольких дубликатов подряд или в случае отсутствия дубликатов, вы можете использовать адаптированную версию ниже. Он отображает позиции и слова с двоеточием, если есть несколько дупликов, или возвращает NA в случае отсутствия дупликации.

df2 <- data.frame(
  Turn = c("oh is that that steak i got the other night",  # that that
           "no no no i 'm dave and you 're alan",          # no no no
           "yeah i mean the film was quite long though",                ## the the
           "it had steve martin in in it it 's a comedy"))              ## in in, it it

v.rle <- lapply(STRSP <- strsplit(as.character(df2$Turn), " "), rle)
v.rle.l <- mapply(`[`, v.rle, "lengths")
v.rle.v <- mapply(`[`, v.rle, "values")

res <- within(df2, {
  rep_Pos <- mapply(function(x) {
    w <- which(x > 1) + 1
    if (length(w) == 0) NA 
    else if (length(w) > 1) cbind(w + seq(w) - 1)
    else w
  }, v.rle.l)
  rep_Numb <- mapply(function(x) cbind(x[x > 1]), v.rle.l)
  rep_Numb[lengths(rep_Numb) == 0] <- NA
  rep_Word <- sapply(mapply(`[`, STRSP, lapply(rep_Pos, `-`, 1)), cbind)
})
res
#                                          Turn rep_Word rep_Numb rep_Pos
# 1 oh is that that steak i got the other night     that        1       4
# 2         no no no i 'm dave and you 're alan       no        2       2
# 3  yeah i mean the film was quite long though       NA       NA      NA
# 4 it had steve martin in in it it 's a comedy   in, it     1, 1    6, 8

Как определить повторяющиеся слова, а также положение и количество повторений в предложениях

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 4 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Как определить повторяющиеся слова, а также положение и количество повторений в предложениях

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 4 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Нет похожих вопросов