Why is PyTorch 1.5 slower than TensorFlow 2.x?
0 votes / 11 July 2020

I tried two versions of the code for a simple classification task.

TensorFlow version

import tensorflow as tf
from tensorflow.keras import datasets, layers, optimizers, Sequential
import time
(xs, ys),(xs_, ys_) = datasets.mnist.load_data()

xs = tf.convert_to_tensor(xs, dtype=tf.float32)/255.

db = tf.data.Dataset.from_tensor_slices((xs,ys))

db = db.batch(30000)


network = Sequential([layers.Dense(8, activation='relu'),
                     layers.Dense(8, activation='relu'),
                     layers.Dense(10)])
network.build(input_shape=(None, 28*28))
optimizer = optimizers.SGD(lr=0.1)
tic=time.time()
for epoch in range(5):
    for step, (x,y) in enumerate(db):

        with tf.GradientTape() as tape:
            # [b, 28, 28] => [b, 784]
            x = tf.reshape(x, (-1, 28*28))
            # [b, 784] => [b, 10]
            # print(tf.reduce_max(x),tf.reduce_min(x))
            out = network(x)
            # [b] => [b, 10]
            y_onehot = tf.one_hot(y, depth=10)
            # [b, 10]
            # print(out,y_onehot)

            loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=out, labels=y_onehot))
        grads = tape.gradient(loss, network.trainable_variables)
        optimizer.apply_gradients(zip(grads, network.trainable_variables))
    print('epoch=',epoch,'loss=',loss.numpy())
toc=time.time()
print('elapsed time:',toc-tic)
# test

xs_=tf.convert_to_tensor(xs_,dtype=tf.float32)/255.
xs_=tf.reshape(xs_,[-1,28*28])


out = network(xs_)

pred=tf.argmax(out,axis=1)

correct=tf.reduce_sum((pred==ys_).numpy().astype('int'))
print(correct.numpy())


PyTorch version

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy
import time
print(torch.cuda.is_available())

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
input_size  = 28*28   # images are 28x28 pixels
output_size = 10      # there are 10 classes

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('~/data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       # transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=30000, shuffle=True,num_workers=2,pin_memory=True)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('~/data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       # transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=1000, shuffle=True)

class FC2Layer(nn.Module):
    def __init__(self, input_size, n_hidden, output_size):
        super(FC2Layer, self).__init__()
        self.input_size = input_size
        self.network = nn.Sequential(
            nn.Linear(input_size, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, output_size),
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        #x = x.view(-1, self.input_size)
        return self.network(x)


accuracy_list = []


def train(epoch, model, perm=torch.arange(0, 784).long()):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # send to device
        data, target = data.to(device), target.to(device)
        # print(torch.min(data))
        # permute pixels
        data = data.view(-1, 28 * 28)
        #data = data[:, perm]
        #data = data.view(-1, 1, 28, 28)

        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)

        loss.backward()
        optimizer.step()

    print('epoch=',epoch,'loss=',loss.cpu().item())


def test(model, perm=torch.arange(0, 784).long()):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        # send to device
        data, target = data.to(device), target.to(device)

        # permute pixels
        data = data.view(-1, 28 * 28)
        #data = data[:, perm]
        #data = data.view(-1, 1, 28, 28)
        output = model(data)
        test_loss += F.nll_loss(output, target,
                                reduction='sum').item()  # sum up batch loss
        pred = output.data.max(1, keepdim=True)[
            1]  # get the index of the max log-probability
        correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    accuracy_list.append(accuracy)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))

n_hidden = 8 # number of hidden units

model_fnn = FC2Layer(input_size, n_hidden, output_size)
model_fnn.to(device)
optimizer = optim.SGD(model_fnn.parameters(), lr=0.01)

tic=time.time()
for epoch in range(0, 5):
    train(epoch, model_fnn)
toc=time.time()
print('elapsed time:',toc-tic)
test(model_fnn)

The two code snippets above use the same neural-network architecture and both run for 5 epochs.
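As a quick sanity check that the architectures really do match, the parameter counts of the two models can be compared; a minimal sketch, assuming the `network` (Keras) and `model_fnn` (PyTorch) objects defined above are still in scope. Both should report 784*8 + 8 + 8*8 + 8 + 8*10 + 10 = 6442 parameters.

# Keras: count_params() sums the sizes of all weight tensors after build()
print(network.count_params())                           # expected: 6442

# PyTorch: sum the element counts of all parameter tensors
print(sum(p.numel() for p in model_fnn.parameters()))   # expected: 6442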

Only the training time is measured.
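One caveat about the PyTorch measurement: CUDA kernels are launched asynchronously, so reading the clock right after the loop does not by itself guarantee that all GPU work has finished. In this script the `loss.cpu().item()` call at the end of each epoch already forces a synchronization, so the effect should be small, but an explicit `torch.cuda.synchronize()` makes the timing unambiguous. A minimal sketch, not part of the original script:

tic = time.perf_counter()
for epoch in range(0, 5):
    train(epoch, model_fnn)
if torch.cuda.is_available():
    torch.cuda.synchronize()   # wait for all queued GPU kernels before stopping the clock
toc = time.perf_counter()
print('elapsed time:', toc - tic)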

The results are as follows.

elapsed time for PyTorch 1.5

elapsed time for TensorFlow 2.2

Why does the elapsed time differ so much?

Thanks!
