This is a work in progress.
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
class Args:
pass
args = Args()
args.batch_size = 12
args.cuda = torch.cuda.is_available()  # use the GPU if there is one
args.lr = 0.001
args.momentum = 0.01
args.epochs = 10
args.log_interval = 10
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, **kwargs)
Let's take a look at what the dataset looks like.
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from PIL import Image
import pprint
import numpy
num_of_samples = 5
fig = plt.figure(1, (8., 8.))
grid = ImageGrid(fig, 111,
                 nrows_ncols=(num_of_samples, num_of_samples),
                 axes_pad=0.1)
output = numpy.zeros(num_of_samples ** 2)
shown = 0
# with batch_size=12 a single batch is not enough for a 5x5 grid,
# so keep pulling batches until we have 25 samples
for data, target in test_loader:
    for j in range(data.size(0)):
        if shown == num_of_samples ** 2:
            break
        grid[shown].matshow(data[j][0].numpy())
        output[shown] = target[j]
        shown += 1
    if shown == num_of_samples ** 2:
        break
output = output.reshape(num_of_samples, num_of_samples)
print(output)
plt.show()
[[ 6. 9. 9. 5. 4.]
[ 3. 6. 5. 0. 1.]
[ 8. 1. 3. 6. 2.]
[ 9. 4. 8. 8. 6.]
[ 0. 6. 4. 2. 3.]]
From the grid of images and the printed matrix of labels above, you can see which digit each image is associated with. The dataset is essentially a list of (image, label) pairs. As usual, we will feed the neural network the image and train it against the label. We will train a set of feedforward networks in increasing order of complexity, where by complexity I mean the number of neurons and the number of layers.
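Before moving on to the models, here is a minimal sketch (the variable names are just for illustration) that pulls a single sample straight out of the MNIST dataset object rather than out of the loader, to make the (image, label) pair structure concrete:

sample_image, sample_label = datasets.MNIST('../data', train=True, download=True,
                                            transform=transforms.ToTensor())[0]
# a 1x28x28 tensor of pixel values and the digit it depicts
print(sample_image.size(), sample_label)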
class Model0(nn.Module):
def __init__(self):
super(Model0, self).__init__()
self.output_layer = nn.Linear(28*28, 10)
def forward(self, x):
x = self.output_layer(x)
return F.log_softmax(x)
class Model1(nn.Module):
def __init__(self):
super(Model1, self).__init__()
self.input_layer = nn.Linear(28*28, 5)
self.output_layer = nn.Linear(5, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model2(nn.Module):
def __init__(self):
super(Model2, self).__init__()
self.input_layer = nn.Linear(28*28, 6)
self.output_layer = nn.Linear(6, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model3(nn.Module):
def __init__(self):
super(Model3, self).__init__()
self.input_layer = nn.Linear(28*28, 7)
self.output_layer = nn.Linear(7, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model4(nn.Module):
def __init__(self):
super(Model4, self).__init__()
self.input_layer = nn.Linear(28*28, 8)
self.output_layer = nn.Linear(8, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model5(nn.Module):
def __init__(self):
super(Model5, self).__init__()
self.input_layer = nn.Linear(28*28, 9)
self.output_layer = nn.Linear(9, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model6(nn.Module):
def __init__(self):
super(Model6, self).__init__()
self.input_layer = nn.Linear(28*28, 10)
self.output_layer = nn.Linear(10, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model7(nn.Module):
def __init__(self):
super(Model7, self).__init__()
self.input_layer = nn.Linear(28*28, 100)
self.output_layer = nn.Linear(100, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model8(nn.Module):
def __init__(self):
super(Model8, self).__init__()
self.input_layer = nn.Linear(28*28, 100)
self.hidden_layer = nn.Linear(100, 100)
self.output_layer = nn.Linear(100, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.hidden_layer(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model9(nn.Module):
def __init__(self):
super(Model9, self).__init__()
self.input_layer = nn.Linear(28*28, 100)
self.hidden_layer = nn.Linear(100, 100)
self.hidden_layer1 = nn.Linear(100, 100)
self.output_layer = nn.Linear(100, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.hidden_layer(x)
x = self.hidden_layer1(x)
x = self.output_layer(x)
return F.log_softmax(x)
class Model10(nn.Module):
def __init__(self):
super(Model10, self).__init__()
self.input_layer = nn.Linear(28*28, 100)
self.hidden_layer = nn.Linear(100, 100)
self.hidden_layer1 = nn.Linear(100, 100)
self.hidden_layer2 = nn.Linear(100, 100)
self.output_layer = nn.Linear(100, 10)
def forward(self, x):
x = self.input_layer(x)
x = self.hidden_layer(x)
x = self.hidden_layer1(x)
x = self.hidden_layer2(x)
x = self.output_layer(x)
return F.log_softmax(x)
Let's create the model instances. If you have a GPU, this is how you make use of it: call .cuda() on the models and on the tensors.
models = Model0(), Model1(), Model2(), Model3(), Model4(), Model5(), Model6(), Model7(), Model8(), Model9(), Model10()
if args.cuda:
for model in models:
model.cuda()
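Since the whole point is to compare models of increasing complexity, it helps to make "complexity" concrete. Here is a quick sketch that counts trainable parameters per model; the count_parameters helper is mine, not part of the original code:

def count_parameters(model):
    # total number of trainable parameter elements (weights and biases)
    return sum(p.data.numel() for p in model.parameters())

for i, model in enumerate(models):
    print('Model{}: {} parameters'.format(i, count_parameters(model)))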
def train(epochs, model):
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    for i in range(epochs):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data = data.view(data.size(0), -1)  # flatten the 28x28 images into 784-long vectors
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
        # report progress once at the end of every epoch
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            i, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.data[0]))
This is where the actual training starts. It will take a while; I trained each model for 100 epochs over the entire training dataset.
for model in models:
train(100, model)
Train Epoch: 0 [59988/60000 (100%)] Loss: 0.061506
...
Train Epoch: 98 [59988/60000 (100%)] Loss: 0.018422
Train Epoch: 99 [59988/60000 (100%)] Loss: 0.336890
Now we save the model weights to files. Ideally this would live inside the training function so that a checkpoint is written every epoch, but let's keep it simple here.
for i, model in enumerate(models):
torch.save(model.state_dict(), 'mnist_mlp_multiple_model{}.pth'.format(i))
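If you did want a checkpoint after every epoch as suggested above, here is a hedged sketch of what that could look like; the function name and the file-name pattern are assumptions for illustration, not what was actually run:

def train_with_checkpoints(epochs, model, name):
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    for i in range(epochs):
        model.train()
        for data, target in train_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data.view(data.size(0), -1)), Variable(target)
            optimizer.zero_grad()
            loss = F.nll_loss(model(data), target)
            loss.backward()
            optimizer.step()
        # write a snapshot of the weights at the end of every epoch
        torch.save(model.state_dict(), '{}_epoch{}.pth'.format(name, i))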
For the sake of completeness, this is how you load the saved models
models = Model0(), Model1(), Model2(), Model3(), Model4(), Model5(), Model6(), Model7(), Model8(), Model9(), Model10()
if args.cuda:
for model in models:
model.cuda()
for i, model in enumerate(models):
model.load_state_dict(torch.load('mnist_mlp_multiple_model{}.pth'.format(i)))
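One practical note: if the weights were saved on a GPU machine and you are loading them on a CPU-only one, torch.load accepts a map_location argument to remap the storages. A small sketch, reusing the file names above:

# remap GPU-saved tensors onto the CPU while loading
state = torch.load('mnist_mlp_multiple_model0.pth',
                   map_location=lambda storage, loc: storage)
models[0].load_state_dict(state)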
Before we run the models over the whole test dataset, let's take a peek at how one of them performs.
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from PIL import Image
import pprint
import numpy
num_of_samples = 5
fig = plt.figure(1, (8., 8.))
grid = ImageGrid(fig, 111,                                      # similar to subplot(111)
                 nrows_ncols=(num_of_samples, num_of_samples),  # 5x5 grid of axes
                 axes_pad=0.1)                                  # pad between axes in inches
output = numpy.zeros(num_of_samples ** 2)
# as before, batches of 12 are too small for a 5x5 grid, so collect a few batches
collected = []
for data, target in test_loader:
    collected.append(data)
    if sum(d.size(0) for d in collected) >= num_of_samples ** 2:
        break
data = torch.cat(collected)[:num_of_samples ** 2]
data1 = data.cuda() if args.cuda else data
data1 = data1.view(data.size(0), -1)
out = models[9](Variable(data1))
for j in range(num_of_samples ** 2):
    grid[j].matshow(data[j][0].numpy())
    output[j] = out.data.max(1)[1][j].cpu().numpy()[0]
output = output.reshape(num_of_samples, num_of_samples)
print(output)
plt.show()
[[ 6. 2. 9. 1. 8.]
[ 5. 6. 5. 7. 5.]
[ 4. 8. 6. 3. 0.]
[ 6. 1. 0. 9. 3.]
[ 7. 2. 8. 4. 4.]]
As you can see, the results are not so bad. Let's test all our models.
def test(model):
model.eval()
test_loss = 0
correct = 0
for data, target in test_loader:
if args.cuda:
data, target = data.cuda(), target.cuda()
data = data.view(data.size()[0], -1)
data, target = Variable(data, volatile=True), Variable(target)
output = model(data)
test_loss += F.nll_loss(output, target).data[0]
pred = output.data.max(1)[1] # get the index of the max log-probability
correct += pred.eq(target.data).cpu().sum()
test_loss /= len(test_loader) # loss function already averages over batch size
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
return 100. * correct / len(test_loader.dataset)
accuracy = []
for model in models:
accuracy.append(test(model))
pprint.pprint(accuracy)
Test set: Average loss: 0.2764, Accuracy: 9250/10000 (92%)
Test set: Average loss: 0.3591, Accuracy: 9010/10000 (90%)
Test set: Average loss: 0.3204, Accuracy: 9121/10000 (91%)
Test set: Average loss: 0.2954, Accuracy: 9189/10000 (92%)
Test set: Average loss: 0.2767, Accuracy: 9237/10000 (92%)
Test set: Average loss: 0.2699, Accuracy: 9267/10000 (93%)
Test set: Average loss: 0.2700, Accuracy: 9251/10000 (93%)
Test set: Average loss: 0.2690, Accuracy: 9244/10000 (92%)
Test set: Average loss: 0.2755, Accuracy: 9240/10000 (92%)
Test set: Average loss: 0.2745, Accuracy: 9253/10000 (93%)
Test set: Average loss: 0.2789, Accuracy: 9232/10000 (92%)
[92.5, 90.1, 91.21, 91.89, 92.37, 92.67, 92.51, 92.44, 92.4, 92.53, 92.32]
plt.plot(range(len(accuracy)), accuracy, linewidth=1.0)
plt.axis([0, 10, 0, 100])
plt.show()
plt.plot(range(len(accuracy)), accuracy, linewidth=1.0)
plt.axis([0, 10, 90, 93])
plt.show()
The second plot is a zoomed-in version of the first. A little disappointing, isn't it? The more complex models don't perform any better than the simple ones. So performance is not simply proportional to the number of layers in a neural network; it depends on how the layers interact with each other. In fact, none of the models above puts a non-linearity between its layers, so stacking extra nn.Linear layers only composes linear maps into another linear map: more parameters, but no extra expressive power.
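To see what "how they interact" could mean in practice, a non-linear activation between the layers is exactly what lets extra depth pay off. A sketch (not something this post trains) of Model9 with F.relu between the layers:

class Model9Relu(nn.Module):
    def __init__(self):
        super(Model9Relu, self).__init__()
        self.input_layer = nn.Linear(28*28, 100)
        self.hidden_layer = nn.Linear(100, 100)
        self.hidden_layer1 = nn.Linear(100, 100)
        self.output_layer = nn.Linear(100, 10)
    def forward(self, x):
        # a ReLU after each hidden layer stops the stack from collapsing into one linear map
        x = F.relu(self.input_layer(x))
        x = F.relu(self.hidden_layer(x))
        x = F.relu(self.hidden_layer1(x))
        x = self.output_layer(x)
        return F.log_softmax(x)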