import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import datetime
import math
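
# Train a ResNet-34 from scratch on CIFAR-10 (images upsampled to 224x224).
# ReduceLROnPlateau cuts the LR based on the test loss; each cut also halves
# the batch size and lowers SGD momentum ("sqrt annealing"). Per-class test
# accuracy is reported every 10 epochs.
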
if __name__ == '__main__':
    assert torch.cuda.is_available()
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
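    # The Resize/CenterCrop pair upsamples CIFAR-10's native 32x32 images to
    # the conventional 224x224 input size of the torchvision ResNets; the
    # Normalize constants are the standard ImageNet channel statistics.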
    train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    test_set = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet34', pretrained=False)
    model.fc = nn.Linear(model.fc.in_features, 10)  # swap the 1000-way ImageNet head for CIFAR-10's 10 classes
    model = model.to('cuda')
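    # Note: the hub call pins torchvision v0.10.0, where `pretrained=False` is
    # the expected argument; newer torchvision releases use `weights=` instead.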
    start_datetime = datetime.datetime.now()
    step_lr_decay = 0       # number of LR reductions applied so far
    lr_now = 0.1
    sqrt_annealing = True   # couple batch size and momentum to each LR reduction
    start_batchsize = 256
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr_now, weight_decay=1e-3, momentum=0.9)
    # verbose=True is deprecated in recent PyTorch; LR changes are printed below instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.125, patience=1)
    train_log = []
    test_log = []
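
    # Main loop: train one epoch, evaluate on the test set, and let
    # ReduceLROnPlateau react to the mean test loss. Whenever the scheduler
    # cuts the LR, the next epoch halves the batch size and lowers the
    # momentum ("sqrt annealing", handled further below).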
    for i in range(300):
        # Rebuild the train loader each epoch: the batch size halves once per
        # LR reduction (with a floor of 1) when sqrt annealing is enabled.
        batchsize = start_batchsize // 2 ** step_lr_decay if sqrt_annealing else start_batchsize
        batchsize = max(batchsize, 1)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=batchsize, shuffle=True)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=start_batchsize * 2, shuffle=False)
        model.train()
        loss_sum, cnt, n = 0, 0, len(train_loader.dataset)
        for data, target in train_loader:
            data, target = data.to('cuda'), target.to('cuda')
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 10.0)
            optimizer.step()
            loss_sum += loss.item() * len(data)
            cnt += len(data)
            print('[{}]{:3}: {:.6f} {:5}/{:5}({:4.0%})'.format(
                str(datetime.datetime.now() - start_datetime).split('.')[0],
                i + 1, loss_sum / cnt, cnt, n, cnt / n
            ), end='\r', flush=True)
        print('[{}]{:3}: train loss = {:.12f}'.format(
            str(datetime.datetime.now() - start_datetime).split('.')[0],
            i + 1, loss_sum / cnt
        ), flush=True)
        train_log.append(loss_sum / cnt)
        model.eval()
        test_loss, cnt, n = 0, 0, len(test_loader.dataset)
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to('cuda'), target.to('cuda')
                output = model(data)
                loss = criterion(output, target)
                test_loss += loss.item() * len(data)
                cnt += len(data)
                print('[{}]{:3}: {:.6f} {:5}/{:5}({:4.0%})'.format(
                    str(datetime.datetime.now() - start_datetime).split('.')[0],
                    i + 1, test_loss / cnt, cnt, n, cnt / n
                ), end='\r', flush=True)
        print('[{}]{:3}: test loss = {:.12f}'.format(
            str(datetime.datetime.now() - start_datetime).split('.')[0],
            i + 1, test_loss / cnt
        ), flush=True)
        scheduler.step(test_loss / cnt)  # plateau detection on the mean test loss
        test_log.append(test_loss / cnt)
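        # Detect an LR reduction by comparing the optimizer's current LR with
        # the last value we saw (avoids poking the scheduler's private
        # _last_lr). On a drop: the next epoch's batch size halves (see the
        # top of the loop) and momentum shrinks so that (1 - momentum) grows
        # by sqrt(2), floored at 0.1.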
        assert len(optimizer.param_groups) == 1
        if optimizer.param_groups[0]['lr'] < lr_now:
            print('[{}]{:3}: lr {:.12f} -> {:.12f}'.format(
                str(datetime.datetime.now() - start_datetime).split('.')[0],
                i + 1, lr_now, optimizer.param_groups[0]['lr']
            ), flush=True)
            lr_now = optimizer.param_groups[0]['lr']
            step_lr_decay += 1
            if sqrt_annealing:
                m_old = optimizer.param_groups[0]['momentum']
                x = 2  # matches the factor by which the batch size is halved per LR drop
                m_new = max(1 - math.sqrt(x) * (1 - m_old), 0.1)
                optimizer.param_groups[0]['momentum'] = m_new
                print('[{}]{:3}: momentum {:.6f} -> {:.6f}'.format(
                    str(datetime.datetime.now() - start_datetime).split('.')[0],
                    i + 1, m_old, m_new
                ), flush=True)
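
        # Every 10 epochs: per-class accuracy on the test set, with the
        # running count of LR decays folded into the header line.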
        if (i + 1) % 10 == 0:
            model.eval()
            correct, cnt = [0] * 10, [0] * 10
            with torch.no_grad():
                for data, target in test_loader:
                    data, target = data.to('cuda'), target.to('cuda')
                    output = model(data)
                    _, pred = torch.max(output, 1)
                    hit = pred == target
                    for j in range(10):
                        cnt[j] += (target == j).sum().item()
                        correct[j] += hit[target == j].sum().item()
            print('[{}]{:3}: ({} lr decays so far)'.format(
                str(datetime.datetime.now() - start_datetime).split('.')[0],
                i + 1, step_lr_decay
            ), flush=True)
            for j in range(10):
                print('{:10} : {:5}/{:5}({:4.0%})'.format(
                    train_set.classes[j], correct[j], cnt[j], correct[j] / cnt[j]
                ), flush=True)
            print('{:10} : {:5}/{:5}({:4.0%})'.format(
                'all', sum(correct), sum(cnt), sum(correct) / sum(cnt)
            ), flush=True)
    print('train loss history:', train_log)
    print('test loss history:', test_log)