import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import datetime
import math

if __name__=='__main__':
    assert torch.cuda.is_available()

    # Upsample CIFAR-10's 32x32 images to the 224x224 ImageNet resolution
    # that ResNet-34 expects, then apply the standard ImageNet normalization.
    transform=transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485,0.456,0.406),std=(0.229,0.224,0.225)),
    ])
    train_set=torchvision.datasets.CIFAR10(root='./data',train=True,download=True,transform=transform)
    test_set=torchvision.datasets.CIFAR10(root='./data',train=False,download=True,transform=transform)

    # ResNet-34 trained from scratch; swap the 1000-way ImageNet head
    # for a 10-way CIFAR-10 classifier.
    model=torch.hub.load('pytorch/vision:v0.10.0','resnet34',pretrained=False)
    model.fc=nn.Linear(model.fc.in_features,10)
    model=model.to('cuda')

    start_datetime=datetime.datetime.now()
    step_lr_decay=0      # how many times the scheduler has cut the lr
    lr_now=0.1
    sqrtannealing=True   # halve the batch size and shrink momentum on each lr cut
    start_batchsize=256

    criterion=nn.CrossEntropyLoss()
    optimizer=optim.SGD(model.parameters(),lr=lr_now,weight_decay=1e-3,momentum=0.9)
    scheduler=optim.lr_scheduler.ReduceLROnPlateau(optimizer,mode='min',factor=0.125,patience=1,verbose=True)

    train_log=[]
    test_log=[]
    for i in range(300):
        # With sqrtannealing, the batch size is halved once per lr decay,
        # so the loaders are rebuilt at the start of every epoch.
        batchsize=start_batchsize//2**step_lr_decay if sqrtannealing else start_batchsize
        batchsize=max(batchsize,1)
        train_loader=torch.utils.data.DataLoader(train_set,batch_size=batchsize,shuffle=True)
        test_loader=torch.utils.data.DataLoader(test_set,batch_size=start_batchsize*2,shuffle=False)

        # ---- training epoch ----
        model.train()
        loss_sum,cnt,n=0,0,len(train_loader.dataset)
        for data,target in train_loader:
            data,target=data.to('cuda'),target.to('cuda')
            optimizer.zero_grad()
            output=model(data)
            loss=criterion(output,target)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(),10.0)
            optimizer.step()
            loss_sum+=loss.item()*len(data)
            cnt+=len(data)
            print('[{}]{:3}: {:.6f} {:3}/{:3}({:4.0%})'.format(
                str(datetime.datetime.now()-start_datetime).split('.')[0],
                i+1,loss_sum/cnt,cnt,n,cnt/n
            ),end='\r',flush=True)
        print('[{}]{:3}: train loss = {:.12f}'.format(
            str(datetime.datetime.now()-start_datetime).split('.')[0],
            i+1,loss_sum/cnt
        ),flush=True)
        train_log.append(loss_sum/cnt)

        # ---- evaluation on the test set ----
        model.eval()
        test_loss,cnt,n=0,0,len(test_loader.dataset)
        with torch.no_grad():
            for data,target in test_loader:
                data,target=data.to('cuda'),target.to('cuda')
                output=model(data)
                loss=criterion(output,target)
                test_loss+=loss.item()*len(data)
                cnt+=len(data)
                print('[{}]{:3}: {:.6f} {:3}/{:3}({:4.0%})'.format(
                    str(datetime.datetime.now()-start_datetime).split('.')[0],
                    i+1,test_loss/cnt,cnt,n,cnt/n
                ),end='\r',flush=True)
        print('[{}]{:3}: test loss = {:.12f}'.format(
            str(datetime.datetime.now()-start_datetime).split('.')[0],
            i+1,test_loss/cnt
        ),flush=True)
        scheduler.step(test_loss/cnt)  # step on the mean loss, matching the logged value
        test_log.append(test_loss/cnt)

        # Detect an lr cut by reading the lr back from the optimizer instead
        # of relying on the scheduler's private _last_lr attribute.
        assert len(optimizer.param_groups)==1
        lr_new=optimizer.param_groups[0]['lr']
        if lr_new<lr_now:
            print('[{}]{:3}: lr {:.12f} -> {:.12f}'.format(
                str(datetime.datetime.now()-start_datetime).split('.')[0],
                i+1,lr_now,lr_new
            ),flush=True)
            lr_now=lr_new
            step_lr_decay+=1
            #print('new batchsize=',batchsize)
            if sqrtannealing:
                # Shrink momentum along with the lr: grow (1 - momentum) by
                # sqrt(2) on each cut, with a floor at 0.1.
                m_old=optimizer.param_groups[0]['momentum']
                x=2
                #m_new=(m_old-1)/math.sqrt(x)+1
                m_new=max(1-math.sqrt(x)*(1-m_old),0.1)
                optimizer.param_groups[0]['momentum']=m_new
                print('momentum {:.6f} -> {:.6f}'.format(m_old,m_new),flush=True)

        # Every 10 epochs, compute per-class accuracy on the test set.
        if (i+1)%10==0:
            model.eval()
            print('lr decays so far:',step_lr_decay,flush=True)
            correct,cnt=[0]*10,[0]*10
            with torch.no_grad():
                for data,target in test_loader:
                    data,target=data.to('cuda'),target.to('cuda')
                    output=model(data)
                    _,pred=torch.max(output,1)
                    is_correct=pred==target
                    for j in range(10):
                        cnt[j]+=(target==j).sum().item()
                        correct[j]+=is_correct[target==j].sum().item()
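            # Report the per-class breakdown followed by the overall accuracy.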
            print('[{}]{:3}:'.format(
                str(datetime.datetime.now()-start_datetime).split('.')[0],i+1
            ),flush=True)
            for j in range(10):
                print('{:10} : {:5}/{:5}({:4.0%})'.format(
                    train_set.classes[j],correct[j],cnt[j],correct[j]/cnt[j]
                ),flush=True)
            print('{:10} : {:5}/{:5}({:4.0%})'.format(
                'all',sum(correct),sum(cnt),sum(correct)/sum(cnt)
            ),flush=True)
            # Dump the loss history collected so far.
            print(train_log)
            print(test_log)
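    # Optional sketch, not in the original script: persist the final weights
    # and the collected loss curves once all 300 epochs finish. The file
    # names below are illustrative assumptions.
    torch.save(model.state_dict(),'resnet34_cifar10.pt')
    torch.save({'train_log':train_log,'test_log':test_log},'loss_logs.pt')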