Find and print lines whose column values are repeated n times
0 votes
/ 23 September 2018

I have a file:

scaffold_0      11498
scaffold_0      11501
scaffold_0      11728   "RHOH"
scaffold_0      12144   "RHOH"
scaffold_0      20708   "RHOH"
scaffold_0      23579   "RHOH"
scaffold_0      130818
scaffold_0      200485  "NSUN7"
scaffold_0      209928  "NSUN7"
scaffold_0      212965  "NSUN7"
scaffold_0      214055  "APBB2"
scaffold_0      223404
scaffold_0      223686  "APBB2"
scaffold_0      227687  "APBB2"
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      391742
scaffold_0      399332  "UCHL1"
scaffold_0      406726  "UCHL1"
scaffold_0      482215
scaffold_0      484921
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
scaffold_0      539819
scaffold_0      543347  "LIMCH1"
scaffold_0      568182  "LIMCH1"
scaffold_0      570321
scaffold_0      570325
scaffold_0      577502  "LIMCH1"
scaffold_0      578933  "LIMCH1"
scaffold_0      621330  "PHOX2B"
scaffold_0      623303  "PHOX2B"
scaffold_0      640271
scaffold_0      667510  "gene3"
scaffold_0      679096
scaffold_0      698659  "TMEM33"
scaffold_0      700427  "TMEM33"

, and I want to print the lines whose value in the 3rd column is repeated 3 or more times, i.e. to remove these lines:

scaffold_0      399332  "UCHL1"
scaffold_0      406726  "UCHL1"
scaffold_0      621330  "PHOX2B"
scaffold_0      623303  "PHOX2B"
scaffold_0      667510  "gene3"
scaffold_0      698659  "TMEM33"
scaffold_0      700427  "TMEM33"

I would like to preserve the order of the file and keep the lines whose 3rd column is empty. I tried:

sort -k3 file.txt | awk 'a[$3]++{ if(a[$3]>=2){ print b }; print $0}; {b=$0}'
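
For reference, a quick way to see how many times each non-empty 3rd-column value actually occurs (a diagnostic sketch, assuming whitespace-separated fields) is:

awk '$3 != "" {print $3}' file.txt | sort | uniq -c | sort -rn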

Answers [ 2 ]

0 votes
/ 23 September 2018
$ awk 'NR==FNR{c[$3]++;next} c[$3]>2' file file
scaffold_0      11498
scaffold_0      11501
scaffold_0      11728   "RHOH"
scaffold_0      12144   "RHOH"
scaffold_0      20708   "RHOH"
scaffold_0      23579   "RHOH"
scaffold_0      130818
scaffold_0      200485  "NSUN7"
scaffold_0      209928  "NSUN7"
scaffold_0      212965  "NSUN7"
scaffold_0      214055  "APBB2"
scaffold_0      223404
scaffold_0      223686  "APBB2"
scaffold_0      227687  "APBB2"
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      391742
scaffold_0      482215
scaffold_0      484921
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
scaffold_0      539819
scaffold_0      543347  "LIMCH1"
scaffold_0      568182  "LIMCH1"
scaffold_0      570321
scaffold_0      570325
scaffold_0      577502  "LIMCH1"
scaffold_0      578933  "LIMCH1"
scaffold_0      640271
scaffold_0      679096

Or, if lines with an empty 3rd column should not be counted (and therefore not printed):

$ awk 'NR==FNR{if ($3!="") c[$3]++;next} c[$3]>2' file file
scaffold_0      11728   "RHOH"
scaffold_0      12144   "RHOH"
scaffold_0      20708   "RHOH"
scaffold_0      23579   "RHOH"
scaffold_0      200485  "NSUN7"
scaffold_0      209928  "NSUN7"
scaffold_0      212965  "NSUN7"
scaffold_0      214055  "APBB2"
scaffold_0      223686  "APBB2"
scaffold_0      227687  "APBB2"
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
scaffold_0      543347  "LIMCH1"
scaffold_0      568182  "LIMCH1"
scaffold_0      577502  "LIMCH1"
scaffold_0      578933  "LIMCH1"
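
For readability, the same two-pass idea can be written out with comments (a sketch equivalent to the second one-liner above):

$ awk '
NR==FNR {              # first pass: the same file is read twice, NR==FNR only then
    if ($3 != "")      # count only non-empty 3rd-column values
        c[$3]++
    next               # skip the print test during the first pass
}
c[$3] > 2              # second pass: print lines whose value was seen 3 or more times
' file file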
0 votes
/ 23 September 2018

This awk reads and hashes the whole file into memory:

$ awk '{
    a[NR]=$0              # hash to a using record number as the key for order
    c[$3]++               # $3 counter
}
END {                     # after file records have been hashed
    for(i=1;i<=NR;i++) {  # iterate in order
        split(a[i],b)     # get the 3rd column
        if(c[b[3]]>=3)    # output if count is right
            print a[i]
    }
}' file

Example output:

...
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      391742
scaffold_0      482215
scaffold_0      484921
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
...
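
If the program above is saved in its own file (say filter.awk, a hypothetical name, stored without the surrounding single quotes), it can be run and the result redirected in the usual way:

$ awk -f filter.awk file > filtered.txt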