# Engineer Note ## huggingface's accelerate ### 1. ```accelerate.utils.find_executable_batch_size``` - Usage: ```python!= @accelerate.utils.find_executable_batch_size(starting_batch_size=1024) def launch_a_iter(batch_size, model, image): pred = 0 dis_batch = torch.split(image.cpu(), batch_size) batch_bar = tqdm(dis_batch, total=len(dis_batch), desc=f'Batch({batch_size}) process', leave=False, ncols=100) for cimg in batch_bar: reg_x = torch.cat([cimg] * 3, dim=1).to('cuda:0') shape: torch.Size = reg_x.size() # shape. pred_ = torch.sum(model(reg_x)) # Poor cuda memory pred += pred_ del reg_x torch.cuda.empty_cache() gc.collect() batch_bar.set_postfix({'Batch Agatston': pred_.item(), 'Size': list(shape)}) return pred ``` - Issue - `batch_size` 無限遞減,最終觸發 No executable batch size found, reached zero. 在 accelerate.utils.memory.py 中用來遞減`batch_size`到適當的大小 ```python!=132 while True: if batch_size == 0: raise RuntimeError("No executable batch size found, reached zero.") try: return function(batch_size, *args, **kwargs) except Exception as e: if should_reduce_batch_size(e): gc.collect() if is_xpu_available(): torch.xpu.empty_cache() elif is_npu_available(): torch.npu.empty_cache() else: torch.cuda.empty_cache() batch_size //= 2 else: raise ``` 然而當`batch_size<=1`依然觸發`CUDAOutOfMemory`時,由於 while 迴圈是否繼續縮小 batch size 的判斷條件為: ```python!=65 def should_reduce_batch_size(exception: Exception) -> bool: """ Checks if `exception` relates to CUDA out-of-memory, CUDNN not supported, or CPU out-of-memory Args: exception (`Exception`): An exception """ _statements = [ "CUDA out of memory.", # CUDA OOM "cuDNN error: CUDNN_STATUS_NOT_SUPPORTED.", # CUDNN SNAFU "DefaultCPUAllocator: can't allocate memory", # CPU OOM ] if isinstance(exception, RuntimeError) and len(exception.args) == 1: return any(err in exception.args[0] for err in _statements) return False ``` 因此儘管`batch_size`已經降到1或0,但是因為判斷條件是裝飾器包裹的method是否拋出與CUDA相關的`RuntimeError`,導致`batch_size`持續減半到0,最後拋出`RuntimeError("No executable batch size found, reached zero.")`。 - Solution 在被裝飾器包裹住的method中自行拋出例外狀況 ```python!= # Example 
@accelerate.utils.find_executable_batch_size(starting_batch_size=1024) def launch_a_iter(batch_size, model, image): pred = 0 dis_batch = torch.split(image.cpu(), batch_size) batch_bar = tqdm(dis_batch, total=len(dis_batch), desc=f'Batch({batch_size}) process', leave=False, ncols=100) try: for cimg in batch_bar: reg_x = torch.cat([cimg] * 3, dim=1).to('cuda:0') shape: torch.Size = reg_x.size() # shape. pred_ = torch.sum(model(reg_x)) # Poor cuda memory pred += pred_ accelerate.utils.release_memory(pred_, reg_x, cimg) batch_bar.set_postfix({'Batch Agatston': pred_.item(), 'Size': list(shape)}) except RuntimeError: raise Exception("Escaping this sample.") # 不拋出RuntimeError使should_reduce_batch_size為False return pred ``` # Pytorch ## Build cuda usage snapshot ```python!=0 import torch torch.cuda.memory._record_memory_history(True) """ Training statement """ torch.cuda.memory._dump_snapshot('<save path end with .pickle>') torch.cuda.memory._record_memory_history(enabled=None) # record end ``` ## BatchNorm2d got NaN - Solve.1 將`momentum`設定得更大 - Solve.2 將`track_running_stats`設定為`False`