sshuair's note

Comparing different learning rates

2017-07-21

When optimizing with SGD you usually have to pick a learning rate (lr). The lr should be neither too large nor too small: if it is too large, the updates overshoot the minimum and the loss cannot decrease; if it is too small, convergence is very slow.
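To make this concrete, here is a minimal sketch (not part of the original experiment) that runs plain SGD on a 1-D quadratic loss f(w) = w^2, whose gradient is 2w; a large lr makes the iterate diverge, while a tiny lr barely moves it.

# Minimal illustration (not from the original experiment): SGD on f(w) = w^2.
# The update is w <- w - lr * 2*w = (1 - 2*lr) * w, so
# |1 - 2*lr| > 1  (e.g. lr = 1.5)   -> the iterate diverges,
# |1 - 2*lr| ~ 1  (e.g. lr = 1e-5)  -> it barely moves.
def sgd_on_quadratic(lr, steps=20, w0=1.0):
    w = w0
    for _ in range(steps):
        grad = 2 * w          # gradient of f(w) = w^2
        w = w - lr * grad     # SGD update
    return w

for lr in [1.5, 0.1, 1e-5]:
    print(lr, sgd_on_quadratic(lr))
# lr=1.5  -> |w| blows up (overshoots the minimum on every step)
# lr=0.1  -> w approaches 0 quickly
# lr=1e-5 -> w is still almost 1.0 after 20 steps (far too slow)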

On the cifar10 dataset I tried the following lr values: [10, 5, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]. Each lr was trained for 1000 epochs, using a fairly simple LeNet-style network.

Judging from the training results, with extremely large lr such as lr=10 and lr=5, the loss explodes within the first few dozen batches of the first epoch, overflows the representable range, and eventually becomes NaN.

import pandas as pd
df = pd.read_csv('learning_rate.csv')
df['train_lr_10'][15:35]
15    2.540980e+00
16    2.426146e+00
17    2.351841e+00
18    1.845384e+01
19    2.158724e+01
20    5.895906e+00
21    6.195771e+00
22    7.015497e+00
23    2.514503e+01
24    6.082247e+00
25    4.786431e+00
26    5.093921e+00
27    5.260998e+00
28    4.673571e+00
29    5.651251e+00
30    5.188484e+00
31    4.449006e+00
32    1.332958e+01
33    3.456380e+12
34             NaN
Name: train_lr_10, dtype: float64

lr=1 is also a very large setting, and the loss barely changes at all. lr=0.00001 is the opposite extreme: over the 1000 epochs the loss converges very slowly. In contrast, lr=0.01, lr=0.001 and lr=0.0001 show a clear downward trend compared with the extreme settings.

from bokeh.plotting import figure, output_notebook, show
output_notebook()
p1 = figure(tools="pan,box_zoom,reset,save",
            title="different learning rate (train)", x_axis_label='epoch', y_axis_label='loss',
            plot_width=900,  # y_range=[0, 3],
            )
p1.line(df['epoch'], df['train_lr_1'], line_color='blue', line_width=2, legend="lr=1")
p1.line(df['epoch'], df['train_lr_0.1'], line_color="yellow",line_width=2, legend="lr=0.1")
p1.line(df['epoch'], df['train_lr_0.01'], line_color="red",line_width=2, legend="lr=0.01")
p1.line(df['epoch'], df['train_lr_0.001'], line_color="green",line_width=2, legend="lr=0.001")
p1.line(df['epoch'], df['train_lr_0.0001'], line_color="brown", line_width=2, legend="lr=0.0001")
p1.line(df['epoch'], df['train_lr_0.00001'], line_color="black", line_width=2, legend="lr=0.00001")
p1.legend.location = "top_right"
p1.legend.click_policy="hide"
show(p1)

Next, let's compare the loss on the training set (train) and the test set (val). Since a fairly simple network is used, with no dropout and no data augmentation, overfitting shows up.
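As a rough sketch of how the overfitting might be reduced (not something done in this experiment), standard torchvision transforms can add simple data augmentation, and dropout can be applied between the fully connected layers:

# Sketch only (not used in this experiment): possible ways to reduce overfitting.
from torchvision import transforms

# 1) Data augmentation on the training set.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),   # random crop with 4-pixel padding
    transforms.RandomHorizontalFlip(),      # random horizontal flip
    transforms.ToTensor(),
])
# 2) Dropout in the LeNet-style Net below, e.g. inside forward():
#    x = F.dropout(x, p=0.5, training=self.training) after F.relu(self.fc1(x)).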

p = figure(tools="pan,box_zoom,reset,save",
           title="different learning rate (train)", x_axis_label='epoch', y_axis_label='loss',
           plot_width=900
           )
p.circle(df['epoch'], df['train_lr_0.01'],color='red', size=2, legend="lr=0.01(train)")
p.circle(df['epoch'], df['val_lr_0.01'], color='blue', size=2, legend="lr=0.01(val)")
p.square(df['epoch'], df['train_lr_0.001'], color='red',size=2, legend="lr=0.001(train)")
p.square(df['epoch'], df['val_lr_0.001'], color='blue', size=2, legend="lr=0.001(val)")
p.triangle(df['epoch'], df['train_lr_0.0001'], color='red',size=2, legend="lr=0.0001(train)")
p.triangle(df['epoch'], df['val_lr_0.0001'], color='blue', size=2, legend="lr=0.0001(val)")
p.legend.location = "top_right"
p.legend.click_policy="hide"
show(p)


Finally, let's look at dynamically adjusting the lr during training; here a simple step decay is used (see the adaptive_learning_rate function in the training code below, which resets the lr to half of the initial value every 5 epochs).
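As a hedged sketch (the call site is not shown in the training code below), one way to drive such a step decay from the training loop is to write directly into the optimizer's param_groups; note this version halves the current lr each time, rather than resetting to half of the initial lr as the post's function does:

# Sketch only: manual step decay, halving the current lr every 5 epochs.
# Assumes `optimizer` was built as in the training code below.
def step_decay(optimizer, epoch, every=5, factor=0.5):
    if epoch % every == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * factor  # decay in place

# inside the epoch loop:
# for epoch in range(1, 1000):
#     step_decay(optimizer, epoch)
#     train(epoch, args)
#     val(epoch, args)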

p = figure(tools="pan,box_zoom,reset,save",
           title="different learning rate (train)", x_axis_label='epoch', y_axis_label='loss',
           plot_width=900,  # y_range=[0, 3],
           )
p.line(df['epoch'], df['train_lr_1'], line_color='blue', line_width=2, legend="lr=1")
p.line(df['epoch'], df['train_lr_0.01'], line_color="red",line_width=2, legend="lr=0.01")
p.line(df['epoch'], df['train_lr_0.001'], line_color="green",line_width=2, legend="lr=0.001")
p.line(df['epoch'], df['train_lr_0.0001'], line_color="brown", line_width=2, legend="lr=0.0001")
p.line(df['epoch'], df['train_lr_0.00001'], line_color="black", line_width=2, legend="lr=0.00001")
p.legend.location = "top_right"
p.legend.click_policy="hide"
show(p)

Training and testing code:

# coding=utf-8
# Test how different learning rates affect overfitting and training time
import argparse
import logging

import torch
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torchvision import transforms, datasets, models

import model


def logger(filename='trainning'):
    # log to file and mirror everything to the console
    logging.basicConfig(filename=filename, level=logging.INFO,
                        filemode='w', format='%(asctime)s - %(levelname)s: %(message)s')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)


def phare_args():
    pharer = argparse.ArgumentParser()
    pharer.add_argument('--gpu', default=-1, type=int,
                        help='whether to use a gpu device')
    pharer.add_argument('--lr', required=False,
                        type=float, help='learning rate')
    pharer.add_argument('--log', required=False,
                        default='train_log', help='log file path')
    pharer.add_argument('--batch_size', default=200, type=int, help='batch_size')
    args = pharer.parse_args()
    args.cuda = True if args.gpu > -1 else False
    return args


class Net(nn.Module):
    # simple LeNet-style network for cifar10
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def train(epoch, args):
    net.train()
    avg_loss = 0
    avg_acc = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = Variable(data).cuda(), Variable(target).cuda()
        else:
            data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = net(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # calculate the evaluation. accuracy, precision, recall, f1, mape...
        # in this case, we use accuracy
        pred = output.data.max(1)[1]
        correct = pred.cpu().eq(target.data.cpu()).sum()
        acc = float(correct) / len(data)
        logging.info('Train epoch [{}]: Batch[{}/{}] ({:.2%}) loss: {:.6f} acc: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset), batch_idx / len(train_loader),
            loss.data[0], acc
        ))
        avg_loss += loss.data[0]
        avg_acc += acc
    logging.info('Train epoch [{}] train avg loss: {:.6f} train avg acc: {:.6f}'.format(
        epoch, avg_loss / len(train_loader), avg_acc / len(train_loader)
    ))


def val(epoch, args):
    net.eval()
    avg_loss = 0
    avg_acc = 0
    for batch_size, (data, target) in enumerate(val_loader):
        if args.cuda:
            data, target = Variable(data).cuda(), Variable(target).cuda()
        else:
            data, target = Variable(data), Variable(target)
        output = net(data)
        loss = criterion(output, target)
        pred = output.data.max(1)[1]
        correct = pred.cpu().eq(target.data.cpu()).sum()
        acc = float(correct) / len(data)
        avg_loss += loss.data[0]
        avg_acc += acc
    logging.info('Val epoch [{}] val avg loss: {:.6f} val avg acc: {:.6f}\n'.format(
        epoch, avg_loss / len(val_loader), avg_acc / len(val_loader)
    ))


# cs231n: http://cs231n.github.io/neural-networks-3/
def adaptive_learning_rate(optimizer, lr0, epoch, method='step'):
    # lr = param_group['lr']
    # step decay: every 5 epochs, set the lr to half of the initial lr0
    if method == 'step':
        if epoch % 5 == 0:
            lr = lr0 / 2.0
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr


if __name__ == '__main__':
    args = phare_args()
    logger(args.log)
    if args.cuda:
        torch.cuda.set_device(args.gpu)
    train_loader = DataLoader(
        datasets.CIFAR10('../data/cifar', train=True,
                         transform=transforms.Compose([transforms.ToTensor()])),
        batch_size=args.batch_size,
        shuffle=True
    )
    val_loader = DataLoader(
        datasets.CIFAR10('../data/cifar', train=False,
                         transform=transforms.Compose([transforms.ToTensor()])),
        batch_size=args.batch_size,
        shuffle=True
    )
    # net = Net()
    net = model.cifar10(n_channel=128)
    if args.cuda:
        net.cuda()
    optimizer = optim.SGD(params=net.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.005)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(1, 1000):
        train(epoch, args)
        val(epoch, args)
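Assuming the script above is saved as, say, train_lr.py (the filename is not given in the post and is hypothetical), one run per learning rate could be launched with a small sweep like this:

# Hypothetical launcher (train_lr.py is an assumed filename, not from the post):
# run one training job per learning rate on GPU 0, logging each to its own file.
import subprocess

for lr in [10, 5, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:
    subprocess.call([
        'python', 'train_lr.py',
        '--gpu', '0',
        '--lr', str(lr),
        '--log', 'train_lr_{}.log'.format(lr),
    ])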
