Сравнение производительности
Примечание: время в таблице не включает время, необходимое для загрузки файлов.
| approach | american-english, | big.txt, | time w.r.t. defaultdict |
| | time, seconds | time, seconds | |
|----------------+-------------------+---------------+-------------------------|
| Counter | 0.451 | 3.367 | 3.6 |
| setdefault | 0.348 | 2.320 | 2.5 |
| list | 0.277 | 1.822 | 2 |
| try/except | 0.158 | 1.068 | 1.2 |
| defaultdict | 0.141 | 0.925 | 1 |
| numpy | 0.012 | 0.076 | 0.082 |
| S.Mark's ext. | 0.003 | 0.019 | 0.021 |
| ext. in Cython | 0.001 | 0.008 | 0.0086 |
#+TBLFM: $4=$3/@7$3;%.2g
Используемые файлы: '/usr/share/dict/american-english'
и 'big.txt'
.
Сценарий, который сравнивает решения 'Counter', 'setdefault', 'list', 'try / исключением', 'defaultdict', 'numpy', 'cython' и @ S.Mark, находится на http://gist.github.com/347000
Самое быстрое решение - расширение Python, написанное на Cython:
import cython
@cython.locals(
chars=unicode,
i=cython.Py_ssize_t,
L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
for i in range(0x10000): # unicode code points > 0xffff are not supported
L[i] = 0
for c in chars:
L[c] += 1
return {unichr(i): L[i] for i in range(0x10000) if L[i]}
Предыдущее сравнение:
* python (dict) : 0.5 seconds
* python (list) : 0.5 (ascii) (0.2 if read whole file in memory)
* perl : 0.5
* python (numpy): 0.07
* c++ : 0.05
* c : 0.008 (ascii)
Входные данные:
$ tail /usr/share/dict/american-english
éclat's
élan
élan's
émigré
émigrés
épée
épées
étude
étude's
études
$ du -h /usr/share/dict/american-english
912K /usr/share/dict/american-english
питон (счетчик): 0,5 секунды
#!/usr/bin/env python3.1
import collections, fileinput, textwrap
chars = (ch for word in fileinput.input() for ch in word.rstrip())
# faster (0.4s) but less flexible: chars = open(filename).read()
print(textwrap.fill(str(collections.Counter(chars)), width=79))
Запустите его:
$ time -p python3.1 count_char.py /usr/share/dict/american-english
Counter({'e': 87823, 's': 86620, 'i': 66548, 'a': 62778, 'n': 56696, 'r':
56286, 't': 51588, 'o': 48425, 'l': 39914, 'c': 30020, 'd': 28068, 'u': 25810,
"'": 24511, 'g': 22262, 'p': 20917, 'm': 20747, 'h': 18453, 'b': 14137, 'y':
12367, 'f': 10049, 'k': 7800, 'v': 7573, 'w': 6924, 'z': 3088, 'x': 2082, 'M':
1686, 'C': 1549, 'S': 1515, 'q': 1447, 'B': 1387, 'j': 1376, 'A': 1345, 'P':
974, 'L': 912, 'H': 860, 'T': 858, 'G': 811, 'D': 809, 'R': 749, 'K': 656, 'E':
618, 'J': 539, 'N': 531, 'W': 507, 'F': 502, 'O': 354, 'I': 344, 'V': 330, 'Z':
150, 'Y': 140, 'é': 128, 'U': 117, 'Q': 63, 'X': 42, 'è': 29, 'ö': 12, 'ü': 12,
'ó': 10, 'á': 10, 'ä': 7, 'ê': 6, 'â': 6, 'ñ': 6, 'ç': 4, 'å': 3, 'û': 3, 'í':
2, 'ô': 2, 'Å': 1})
real 0.44
user 0.43
sys 0.01
Perl: 0,5 секунды
time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F);
END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) }
' /usr/share/dict/american-english
Выход:
{'S' => 1515,'K' => 656,'' => 29,'d' => 28068,'Y' => 140,'E' => 618,'y' => 12367,'g' => 22262,'e' => 87823,'' => 2,'J' => 539,'' => 241,'' => 3,'' => 6,'' => 4,'' => 128,'D' => 809,'q' => 1447,'b' => 14137,'z' => 3088,'w' => 6924,'Q' => 63,'' => 10,'M' => 1686,'C' => 1549,'' => 10,'L' => 912,'X' => 42,'P' => 974,'' => 12,'\'' => 24511,'' => 6,'a' => 62778,'T' => 858,'N' => 531,'j' => 1376,'Z' => 150,'u' => 25810,'k' => 7800,'t' => 51588,'' => 6,'W' => 507,'v' => 7573,'s' => 86620,'B' => 1387,'H' => 860,'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' => 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,'R' => 749,'o' => 48425}
real 0.51
user 0.49
sys 0.02
питон (numpy): 0,07 секунды
На основании Ответ Муравьев Аасмы (изменен для поддержки юникода):
#!/usr/bin/env python
import codecs, itertools, operator, sys
import numpy
filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english'
# ucs2 or ucs4 python?
dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))]
# count ordinals
text = codecs.open(filename, encoding='utf-8').read()
a = numpy.frombuffer(text, dtype=dtype)
counts = numpy.bincount(a)
# pretty print
counts = [(unichr(i), v) for i, v in enumerate(counts) if v]
counts.sort(key=operator.itemgetter(1))
print ' '.join('("%s" %d)' % c for c in counts if c[0] not in ' \t\n')
Выход:
("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823)
real 0.07
user 0.06
sys 0.01
с ++: 0,05 секунды
// $ g++ *.cc -lboost_program_options
// $ ./a.out /usr/share/dict/american-english
#include <iostream>
#include <fstream>
#include <cstdlib> // exit
#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
#include <boost/tr1/unordered_map.hpp>
#include <boost/foreach.hpp>
int main(int argc, char* argv[]) {
using namespace std;
// open input file
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <filename>\n";
exit(2);
}
wifstream f(argv[argc-1]);
// assume the file has utf-8 encoding
locale utf8_locale(locale(""),
new boost::program_options::detail::utf8_codecvt_facet);
f.imbue(utf8_locale);
// count characters frequencies
typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t;
hashtable_t counts;
for (wchar_t ch; f >> ch; )
counts[ch]++;
// print result
wofstream of("output.utf8");
of.imbue(utf8_locale);
BOOST_FOREACH(hashtable_t::value_type i, counts)
of << "(" << i.first << " " << i.second << ") ";
of << endl;
}
Результат:
$ cat output.utf8
(í 2) (O 354) (P 974) (Q 63) (R 749) (S 1,515) (ñ 6) (T 858) (U 117) (ó 10) (ô 2) (V 330) (W 507) (X 42) (ö 12) (Y 140) (Z 150) (û 3) (ü 12) (a 62,778) (b 14,137) (c 30,020) (d 28,068) (e 87,823) (f 10,049) (g 22,262) (h 18,453) (i 66,548) (j 1,376) (k 7,800) (l 39,914) (m 20,747) (n 56,696) (o 48,425) (p 20,917) (q 1,447) (r 56,286) (s 86,620) (t 51,588) (u 25,810) (Å 1) (' 24,511) (v 7,573) (w 6,924) (x 2,082) (y 12,367) (z 3,088) (A 1,345) (B 1,387) (C 1,549) (á 10) (â 6) (D 809) (E 618) (F 502) (ä 7) (å 3) (G 811) (H 860) (ç 4) (I 344) (J 539) (è 29) (K 656) (é 128) (ê 6) (L 912) (M 1,686) (N 531)
c (ascii): 0,0079 секунды
// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt
#include <stdio.h>
enum { N = 256 };
size_t counts[N];
int main(void) {
// count characters
int ch = -1;
while((ch = getchar()) != EOF)
++counts[ch];
// print result
size_t i = 0;
for (; i < N; ++i)
if (counts[i])
printf("('%c' %zu) ", (int)i, counts[i]);
return 0;
}