Why is the buffer size in io.BytesIO much larger than in HDF5?
0 votes
09 January 2019

I have a standard HDF5 file with two arrays in it: one is 5000x5000 doubles and the other is 20x300 integers. The file is big, but not that big:

$ ls -lh
-rw-rw-r-- 1 . . 18M Jun 23  2014 trained.h5

I managed to pull the data out of the file in Python, and now I want to store it in something that does not depend on HDF5. I tried a very simple and straightforward approach:

from mrth import *   # custom helper module; assumed to re-export numpy (save, etc.)
import sys
import h5py
import json, io

print( "\nreading weights and groups from the file %s"%sys.argv[1] )
data = h5py.File(sys.argv[1], 'r')
all_weights = data["data"]["weights"].value
print "weights"
print " > type  :",type(all_weights)
print " > shape :",all_weights.shape
print all_weights

all_popmembers = data["data"]["popmembers"].value.T
print "popmembers"
print " > type  :",type(all_popmembers)
print " > shape :",all_popmembers.shape
print all_popmembers

Ncells,Ne,Ni     = 5000,4000,1000
Npop,Nmaxmembers = all_popmembers.shape
pmembership      = .05

print( "\nsaving weights and groups to the file %s"%sys.argv[2] )
with open(sys.argv[2],"w") as fd:
    # Saving some context
    json.dump({
        "Ncells"      : Ncells,
        "Ne"          : Ne,
        "Ni"          : Ni,
        "Npop"        : Npop,
        "pmembership" : pmembership,
        "Nmaxmembers" : Nmaxmembers
        },fd)
    fd.write("\n")
    with io.BytesIO() as bfd:
        save(bfd, all_weights)
        json.dump(bfd.tell(), fd)  # saving data size
        fd.write("\n")
        fd.write(bfd.getvalue())   # saving the data
    with io.BytesIO() as bfd:
        save(bfd, all_popmembers)
        json.dump(bfd.tell(), fd)  # saving data size
        fd.write("\n")
        fd.write(bfd.getvalue())   # saving the data
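
For completeness, reading such a file back would look like this (a sketch under the same layout assumptions: one JSON line of metadata, then a JSON byte count and a raw numpy buffer per array, and assuming save above is numpy.save):

import json, io
from numpy import load

with open("trained.OU", "rb") as fd:
    meta = json.loads(fd.readline())        # the context dict saved above
    n = json.loads(fd.readline())           # byte size of the weights blob
    weights = load(io.BytesIO(fd.read(n)))  # rebuild the array from raw bytes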

Everything works just fine, but the size of the data file created this way is huge:

$ python h5_to_BP.py trained.h5 trained.OU
reading weights and groups from the file trained.h5
weights
 > type  : <type 'numpy.ndarray'>
 > shape : (5000, 5000)
[[ 0.          0.          0.         ...  0.         48.68430328
   0.        ]
 [ 0.          0.          0.         ... 49.50580978  0.
   0.        ]
 [ 1.81663287  1.80222368  0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 1.27279222  0.          0.         ...  0.          0.
  16.22809982]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
popmembers
 > type  : <type 'numpy.ndarray'>
 > shape : (20, 300)
[[ 716 1866 3129 ...   -1   -1   -1]
 [1229  529 2725 ...   -1   -1   -1]
 [3971 1522 2328 ...   -1   -1   -1]
 ...
 [1161   46 3721 ...   -1   -1   -1]
 [1451 1712 3988 ...   -1   -1   -1]
 [3615 2657 3566 ...   -1   -1   -1]]

saving weights and groups to the file trained.OU
$ ls -lh
-rw-rw-r-- 1 . . 18M  Jun 23  2014 trained.h5
-rw-rw-r-- 1 . . 191M Jan  8 20:33 trained.OU
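
For scale: a 5000x5000 array of float64 alone accounts for almost exactly that size. A minimal check (assuming the weights use numpy's default float64 dtype):

import numpy as np

w = np.zeros((5000, 5000))          # float64 is numpy's default dtype
print(w.nbytes)                     # 200000000 bytes of raw data
print(w.nbytes / float(1024 ** 2))  # ~190.7 MiB -- the size of trained.OU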

The weight matrix is very sparse (almost all entries are zero), so I change my code to save only the positive values and their indices:

from mrth import *   # custom helper module; assumed to re-export numpy (save, where, dstack)
import sys
import h5py
import json, io

print( "\nreading weights and groups from the file %s"%sys.argv[1] )
data = h5py.File(sys.argv[1], 'r')
all_weights = data["data"]["weights"].value
print "weights"
print " > type  :",type(all_weights)
print " > shape :",all_weights.shape
print all_weights

all_popmembers = data["data"]["popmembers"].value.T
print "popmembers"
print " > type  :",type(all_popmembers)
print " > shape :",all_popmembers.shape
print all_popmembers

Ncells,Ne,Ni     = 5000,4000,1000
Npop,Nmaxmembers = all_popmembers.shape
pmembership      = .05


x, y       = where( all_weights > 0. )
p_pawid    = dstack( (x, y) )[0]        # (row, col) index pairs of positive weights
x, y       = where( all_popmembers >= 0. )
p_pmembers = dstack( (x, y) )[0]        # (row, col) index pairs of valid members
print( "\nsaving weights and groups to the file %s"%sys.argv[2] )
with open(sys.argv[2],"w") as fd:
    json.dump({
        "Ncells"      : Ncells,
        "Ne"          : Ne,
        "Ni"          : Ni,
        "Npop"        : Npop,
        "pmembership" : pmembership,
        "Nmaxmembers" : Nmaxmembers
        },fd)
    fd.write("\n")
    with io.BytesIO() as bfd:
        save(bfd,p_pawid)
        json.dump(bfd.tell(),fd)
        fd.write("\n")
        fd.write(bfd.getvalue())
    with io.BytesIO() as bfd:
        save(bfd,all_weights[p_pawid[:,0],p_pawid[:,1]])
        json.dump(bfd.tell(),fd)
        fd.write("\n")
        fd.write(bfd.getvalue())
    with io.BytesIO() as bfd:
        save(bfd,p_pmembers)
        json.dump(bfd.tell(),fd)
        fd.write("\n")
        fd.write(bfd.getvalue())
    with io.BytesIO() as bfd:
        save(bfd,all_popmembers[p_pmembers[:,0],p_pmembers[:,1]])
        json.dump(bfd.tell(),fd)
        fd.write("\n")
        fd.write(bfd.getvalue())

The size drops, but it is still huge:

$ python h5_to_BP.py trained.h5 trained.BP
reading weights and groups from the file trained.h5
[... the same weights and popmembers output as above ...]

saving weights and groups to the file trained.BP
$ ls -lh
-rw-rw-r-- 1 . . 18M  Jun 23  2014 trained.h5
-rw-rw-r-- 1 . . 191M Jan  8 20:33 trained.OU
-rw-rw-r-- 1 . . 115M Jan  8 20:27 trained.BP
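
Some back-of-envelope arithmetic on this layout (an estimate only; the .npy headers and the popmembers arrays add a little on top, and I am assuming where() yields int64 indices and the values stay float64):

bytes_per_entry = 2 * 8 + 8                       # two int64 indices + one float64 value
implied_nnz = 115 * 1024 ** 2 // bytes_per_entry  # nonzeros implied by a 115M file
print(implied_nnz)                                # ~5.0 million entries
print(implied_nnz / 25.0e6)                       # ~20% of the 5000x5000 matrix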

Can anyone explain this phenomenon? The data grows roughly tenfold! I think the problem is in io.BytesIO; is that right?
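
A quick way I can think of to test whether io.BytesIO itself adds any overhead (a sketch, assuming numpy as np):

import io
import numpy as np

a = np.random.rand(1000, 1000)  # 8 MB of float64
buf = io.BytesIO()
np.save(buf, a)
print(a.nbytes)                 # 8000000 bytes of raw data
print(buf.tell())               # only slightly larger: the small .npy header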

...