Ниже приведен способ определения частоты слов с использованием пакета tm
в R:
# Build a word-frequency table with the tm package.
library(tm)

# Load the sample "crude" corpus that ships with tm; to analyse your own
# text, build a corpus from your file and use it in place of `crude`.
data("crude")

# Dense count matrix whose row names are the terms.
myTdm <- as.matrix(TermDocumentMatrix(crude))

# Total count of every term across all documents. row.names = NULL keeps the
# terms out of the row labels (rows are numbered 1..n); the terms themselves
# are stored in the ST column.
FreqMat <- data.frame(
  ST = rownames(myTdm),
  Freq = unname(rowSums(myTdm)),
  row.names = NULL
)
Output (first rows of FreqMat):
ST Freq
1 "(it) 1
2 "demand 1
3 "expansion 1
4 "for 1
5 "growth 1
6 "if 1
7 "is 2
8 "may 1
9 "none 2
10 "opec 2
11 "opec's 1
12 "our 1
13 "the 3
14 "there 1
15 "they 1
16 "this 4
17 "we 2
18 "will 1
19 (bpd) 3
20 (bpd). 1
21 (gcc) 1
22 (northern 1
23 (oil) 1
24 (opec) 2
25 (uae), 1
26 ... 2
27 1.1 1
28 1.11 1
29 1.15 1
30 1.2 3
31 1.50 4
32 1.9 1
33 1/8 1
34 10.8 1
35 100,000 1
36 12. 1
37 12.217 1
38 12.32 1
39 13-member 1
40 13-nation 3
41 13.81 1
42 15-18 2
43 15.6 1
44 15.8 6
45 16.00 1
46 16.35 1
47 16.50 1
48 16.67 1
49 16.85 1
50 168 1
51 17.52 1
52 180,000 1
53 19. 1
54 1985 1
55 1985. 2
56 1985/86 2
57 1985/86. 1
58 1986 1
59 1986, 2
60 1986/87 2
61 1987 3
62 1987/88 3
63 2.2 1
64 2.5 1
65 2.766 1
66 200,000 1
67 200-foot 1
68 20s, 2
69 22.26 1
70 24-hour 1
71 26. 1
72 285,000 2
73 3.5 1
74 3.6 1
75 3.7495/98 1
76 3.7500/03 1
77 3.8 1
78 3/8, 1
79 300 1
80 4.0 1
81 4.133 1
82 5-3/4 1
83 5.472 1
84 500 2
85 6-1/4, 1
86 6-5/8, 1
87 6-7/8 1
88 6.745 1
89 7-1/8, 1
90 7-3/8, 1
91 7.25 2
92 7.3 1
93 750 2
94 9.7 1
95 948,000 2
96 <chv> 1
97 <mob>, 1
98 <tx> 1
99 <xon>, 1
100 abdul-aziz 5
101 ability 5
102 ability, 1
103 able 1
104 about 12
105 above 8
106 abroad, 1
107 accept 1
108 accord 6
109 accord, 1
110 accord. 1
111 according 4
112 across 1
113 activity 1
114 activity. 1
115 add 1
116 added 1
117 added. 3
118 address 3
119 addressed 1
120 adherence 1
121 adhering 2
122 advantage 1
123 advisers 1
124 after 8
125 again," 1
126 against 3
127 agency 5
128 agreed 3
129 agreement 5
130 agreement. 1
131 agricultural 1
132 agriculture," 1
133 agriculture. 1
134 aground 1
135 al-khalifa 3
136 al-qabas 3
137 al-sabah 2
138 al-sabah, 1
139 al-thani 1
140 ali 6
....