From dc6ecfa049ceecae6e36151dbc542c2e2930d211 Mon Sep 17 00:00:00 2001
From: T Phil
Date: Tue, 14 Mar 2023 14:53:20 +0100
Subject: [PATCH] Initial work to get nnUNet running on Intel ARC / XPU

---
 nnunet/backends/__init__.py                   |   3 +
 nnunet/backends/backends.py                   | 322 ++++++++++++++++++
 .../generic_modular_UNet.py                   |  10 +-
 .../generic_modular_preact_residual_UNet.py   |  41 +--
 .../generic_modular_residual_UNet.py          |  31 +-
 nnunet/network_architecture/neural_network.py |  46 +--
 nnunet/run/run_training.py                    |   2 +
 .../BraTS2020/nnUNetTrainerV2BraTSRegions.py  |  10 +-
 .../network_training/network_trainer.py       |  65 ++--
 .../network_training/nnUNetTrainer.py         |  18 +-
 .../network_training/nnUNetTrainerV2.py       |  37 +-
 .../network_training/nnUNetTrainerV2_DDP.py   |  52 +--
 .../network_training/nnUNetTrainerV2_DP.py    |  41 ++-
 .../benchmarking/nnUNetTrainerV2_2epochs.py   |  24 +-
 .../benchmarking/nnUNetTrainerV2_dummyLoad.py |  16 +-
 nnunet/utilities/to_torch.py                  |   8 -
 16 files changed, 539 insertions(+), 187 deletions(-)
 create mode 100644 nnunet/backends/__init__.py
 create mode 100644 nnunet/backends/backends.py

diff --git a/nnunet/backends/__init__.py b/nnunet/backends/__init__.py
new file mode 100644
index 000000000..7f2c62465
--- /dev/null
+++ b/nnunet/backends/__init__.py
@@ -0,0 +1,3 @@
+from nnunet.backends.backends import AutoBackend, CudaBackend, IntelXPUBackend, is_backend_cuda, is_backend_xpu
+
+backend = AutoBackend()
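The module-level `backend` singleton above is the only handle the rest of this patch uses: every trainer swaps its direct `torch.cuda` / `cudnn` calls for calls on the adapter. As a reading aid (not part of the patch), here is a minimal device-agnostic training step written against that API; the toy network, shapes and hyperparameters are made up:

```python
import torch
from torch import nn
from nnunet.backends import backend

# AutoBackend picked CUDA, XPU or the CPU mock at import time
net = backend.to(nn.Sequential(nn.Conv2d(1, 8, 3, padding=1), nn.Conv2d(8, 2, 3, padding=1)))
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.99)
# no-op on CUDA/CPU, wraps ipex.optimize() on XPU - hence both get reassigned
net, optimizer = backend.optimizer(model=net, optimizer=optimizer)
scaler = backend.get_gradscaler()  # GradScaler on CUDA, None on XPU/CPU

data = backend.to(torch.rand(2, 1, 64, 64))
target = backend.to(torch.randint(0, 2, (2, 64, 64)))

optimizer.zero_grad()
with backend.autocast():  # cuda.amp autocast, or ipex's bf16 autocast on XPU
    loss = nn.functional.cross_entropy(net(data), target)
if scaler is not None:
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
else:  # backends without a grad scaler take the plain path
    loss.backward()
    optimizer.step()
```

This scaler-or-plain split is exactly the `if do_backprop and self.grad_scaler` / `elif do_backprop` shape the trainer changes below introduce.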
diff --git a/nnunet/backends/backends.py b/nnunet/backends/backends.py
new file mode 100644
index 000000000..8ec4bab7d
--- /dev/null
+++ b/nnunet/backends/backends.py
@@ -0,0 +1,322 @@
+# nnunet backend adapters by Thomas Phil
+# In an ideal world these device-specific
+# capabilities would be abstracted away
+# inside PyTorch itself, but XPU support
+# is not (yet) part of core PyTorch
+from abc import ABC, abstractmethod
+
+import torch
+
+# Intel XPU imports
+_intel_xpu_avail = False
+try:
+    import intel_extension_for_pytorch as ipex
+
+    if ipex.xpu.is_available() and ipex.xpu.device_count() > 0:
+        from intel_extension_for_pytorch.xpu.amp import autocast as xpu_autocast
+        _intel_xpu_avail = True
+except Exception:
+    pass
+
+# CUDA imports
+_cuda_avail = False
+if torch.cuda.is_available() and torch.cuda.device_count() > 0:
+    from torch.backends import cudnn
+    from torch.cuda.amp import autocast as cuda_amp_autocast, GradScaler as CudaAmpGradScaler
+    _cuda_avail = True
+
+
+class BackendAdapter(ABC):
+    @abstractmethod
+    def name(self):
+        pass
+
+    @abstractmethod
+    def set_deterministic(self, val):
+        pass
+
+    @abstractmethod
+    def is_deterministic(self):
+        pass
+
+    @abstractmethod
+    def set_benchmark(self, val):
+        pass
+
+    @abstractmethod
+    def is_benchmark(self):
+        pass
+
+    @abstractmethod
+    def autocast(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def to(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def is_available(self):
+        pass
+
+    @abstractmethod
+    def empty_cache(self):
+        pass
+
+    @abstractmethod
+    def manual_seed(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def manual_seed_all(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def set_device(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def optimizer(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def get_gradscaler(self, *args, **kwargs):
+        pass
+
+
+class AutoBackend(BackendAdapter):
+    def __init__(self, *args, **kwargs):
+        if _cuda_avail:
+            self._backend = CudaBackend(*args, **kwargs)
+        elif _intel_xpu_avail:
+            self._backend = IntelXPUBackend(*args, **kwargs)
+        else:
+            self._backend = MockBackend(*args, **kwargs)
+
+        print(f'Using backend: {self.name()}')
+
+    def name(self):
+        return f'autobackend.{self._backend.name()}'
+
+    def is_cuda(self):
+        return is_backend_cuda(self._backend)
+
+    def is_xpu(self):
+        return is_backend_xpu(self._backend)
+
+    def set_deterministic(self, val):
+        return self._backend.set_deterministic(val)
+
+    def is_deterministic(self):
+        return self._backend.is_deterministic()
+
+    def set_benchmark(self, val):
+        return self._backend.set_benchmark(val)
+
+    def is_benchmark(self):
+        return self._backend.is_benchmark()
+
+    def autocast(self, *args, **kwargs):
+        return self._backend.autocast(*args, **kwargs)
+
+    def to(self, *args, **kwargs):
+        return self._backend.to(*args, **kwargs)
+
+    def is_available(self):
+        return self._backend.is_available()
+
+    def empty_cache(self):
+        return self._backend.empty_cache()
+
+    def manual_seed(self, *args, **kwargs):
+        return self._backend.manual_seed(*args, **kwargs)
+
+    def manual_seed_all(self, *args, **kwargs):
+        return self._backend.manual_seed_all(*args, **kwargs)
+
+    def set_device(self, *args, **kwargs):
+        return self._backend.set_device(*args, **kwargs)
+
+    def optimizer(self, *args, **kwargs):
+        return self._backend.optimizer(*args, **kwargs)
+
+    def get_gradscaler(self, *args, **kwargs):
+        return self._backend.get_gradscaler(*args, **kwargs)
+
+
+class MockContextManager:
+    def __enter__(self):
+        return
+
+    def __exit__(self, type, value, traceback):
+        return
+
+
+class MockBackend(BackendAdapter):
+    def name(self):
+        return "mock"
+
+    def set_deterministic(self, val):
+        pass
+
+    def is_deterministic(self):
+        return False
+
+    def set_benchmark(self, val):
+        pass
+
+    def is_benchmark(self):
+        return False
+
+    def autocast(self, *args, **kwargs):
+        return MockContextManager()
+
+    def to(self, data, gpu_id=0, non_blocking=True):
+        # CPU fallback: hand the data back unchanged
+        return data
+
+    def is_available(self):
+        return False
+
+    def empty_cache(self):
+        pass
+
+    def manual_seed(self, *args, **kwargs):
+        pass
+
+    def manual_seed_all(self, *args, **kwargs):
+        pass
+
+    def set_device(self, *args, **kwargs):
+        pass
+
+    def optimizer(self, model, optimizer, *args, **kwargs):
+        return model, optimizer
+
+    def get_gradscaler(self):
+        return None
+
+
+class CudaBackend(BackendAdapter):
+    def name(self):
+        return "torch.backends.cudnn"
+
+    def set_deterministic(self, val):
+        cudnn.deterministic = val
+
+    def is_deterministic(self):
+        return cudnn.deterministic
+
+    def set_benchmark(self, val):
+        cudnn.benchmark = val
+
+    def is_benchmark(self):
+        return cudnn.benchmark
+
+    def autocast(self, *args, **kwargs):
+        return cuda_amp_autocast(*args, **kwargs)
+
+    def to(self, data, gpu_id=0, non_blocking=True):
+        if isinstance(data, list):
+            data = [i.cuda(gpu_id, non_blocking=non_blocking) for i in data]
+        elif isinstance(data, torch.nn.Module):
+            # nn.Module.cuda() moves parameters in-place and takes no non_blocking argument
+            data = data.cuda(gpu_id)
+        else:
+            data = data.cuda(gpu_id, non_blocking=non_blocking)
+        return data
+
+    def is_available(self):
+        return _cuda_avail
+
+    def empty_cache(self):
+        return torch.cuda.empty_cache()
+
+    def manual_seed(self, *args, **kwargs):
+        torch.cuda.manual_seed(*args, **kwargs)
+
+    def manual_seed_all(self, *args, **kwargs):
+        torch.cuda.manual_seed_all(*args, **kwargs)
+
+    def set_device(self, *args, **kwargs):
+        torch.cuda.set_device(*args, **kwargs)
+
+    def optimizer(self, model, optimizer, *args, **kwargs):
+        return model, optimizer
+
+    def get_gradscaler(self):
+        return CudaAmpGradScaler()
+
+
+class IntelXPUBackend(BackendAdapter):
+    def name(self):
+        return "intel_extension_for_pytorch.xpu"
+
+    def set_deterministic(self, val):
+        pass
+
+    def is_deterministic(self):
+        return False
+
+    def set_benchmark(self, val):
+        pass
+
+    def is_benchmark(self):
+        return False
+
+    def autocast(self, dtype=None, *args, **kwargs):
+        if dtype == torch.float16:
+            dtype = torch.bfloat16
+
+        # Intel ARC only supports bfloat16 at the time of writing;
+        # at some point we should add autodetection for compatibility
+        supported_dtypes = [torch.bfloat16]  # last one should be highest order
+
+        if dtype is None:
+            dtype = supported_dtypes[-1]
+        elif dtype not in supported_dtypes:
+            old = dtype
+            dtype = supported_dtypes[-1]  # last one should be highest order
+            print(f'WARN: {self.name()} autocast requested unsupported dtype {old} - autocasting to {dtype} instead')
+
+        return xpu_autocast(dtype=dtype, enabled=True, cache_enabled=False, *args, **kwargs)
+
+    def to(self, obj, gpu_id=None, non_blocking=True):
+        if isinstance(obj, list):
+            obj = [i.to('xpu', non_blocking=non_blocking) for i in obj]
+        else:
+            obj = obj.to('xpu', non_blocking=non_blocking)
+        return obj
+
+    def is_available(self):
+        return _intel_xpu_avail
+
+    def empty_cache(self):
+        return ipex.xpu.empty_cache()
+
+    def manual_seed(self, *args, **kwargs):
+        return ipex.xpu.manual_seed(*args, **kwargs)
+
+    def manual_seed_all(self, *args, **kwargs):
+        return ipex.xpu.manual_seed_all(*args, **kwargs)
+
+    def set_device(self, *args, **kwargs):
+        return ipex.xpu.set_device(*args, **kwargs)
+
+    def optimizer(self, model, optimizer, dtype=torch.bfloat16, *args, **kwargs):
+        return ipex.optimize(model, optimizer=optimizer, dtype=dtype)
+
+    def get_gradscaler(self, *args, **kwargs):
+        return None
+
+
+def is_backend_cuda(backend):
+    if isinstance(backend, AutoBackend):
+        return backend.is_cuda()
+
+    return isinstance(backend, CudaBackend)
+
+
+def is_backend_xpu(backend):
+    if isinstance(backend, AutoBackend):
+        return backend.is_xpu()
+
+    return isinstance(backend, IntelXPUBackend)
diff --git a/nnunet/network_architecture/generic_modular_UNet.py 
b/nnunet/network_architecture/generic_modular_UNet.py index da658a852..e161706d4 100644 --- a/nnunet/network_architecture/generic_modular_UNet.py +++ b/nnunet/network_architecture/generic_modular_UNet.py @@ -22,6 +22,8 @@ import numpy as np from torch.optim import SGD +from nnunet.backends import backend + """ The idea behind this modular U-net ist that we decouple encoder and decoder and thus make things a) a lot more easy to combine and b) enable easy swapping between segmentation or classification mode of the same architecture @@ -403,15 +405,15 @@ def compute_reference_for_vram_consumption_2d(): (2, 2)) patch_size = (256, 256) batch_size = 56 - unet = PlainConvUNet(4, 32, (2, 2, 2, 2, 2, 2, 2), 2, pool_op_kernel_sizes, conv_op_kernel_sizes, - get_default_network_config(2, dropout_p=None), 4, (2, 2, 2, 2, 2, 2), False, False, max_features=512).cuda() + unet = backend.to(PlainConvUNet(4, 32, (2, 2, 2, 2, 2, 2, 2), 2, pool_op_kernel_sizes, conv_op_kernel_sizes, + get_default_network_config(2, dropout_p=None), 4, (2, 2, 2, 2, 2, 2), False, False, max_features=512)) optimizer = SGD(unet.parameters(), lr=0.1, momentum=0.95) unet.compute_reference_for_vram_consumption_3d() unet.compute_reference_for_vram_consumption_2d() - dummy_input = torch.rand((batch_size, 4, *patch_size)).cuda() - dummy_gt = (torch.rand((batch_size, 1, *patch_size)) * 4).round().clamp_(0, 3).cuda().long() + dummy_input = backend.to(torch.rand((batch_size, 4, *patch_size))) + dummy_gt = backend.to((torch.rand((batch_size, 1, *patch_size)) * 4).round().clamp_(0, 3)).long() optimizer.zero_grad() skips = unet.encoder(dummy_input) diff --git a/nnunet/network_architecture/generic_modular_preact_residual_UNet.py b/nnunet/network_architecture/generic_modular_preact_residual_UNet.py index 7846f8252..e671ea98d 100644 --- a/nnunet/network_architecture/generic_modular_preact_residual_UNet.py +++ b/nnunet/network_architecture/generic_modular_preact_residual_UNet.py @@ -1,8 +1,9 @@ import numpy as np from copy import deepcopy import torch -from torch.backends import cudnn -from torch.cuda.amp import GradScaler, autocast + +from nnunet.backends import backend + from torch.nn import Identity from nnunet.network_architecture.generic_UNet import Upsample @@ -492,8 +493,8 @@ def compute_approx_vram_consumption(patch_size, base_num_features, max_num_featu def find_3d_configuration(): - cudnn.benchmark = True - cudnn.deterministic = False + backend.set_benchmark(True) + backend.set_deterministic(False) conv_op_kernel_sizes = ((3, 3, 3), (3, 3, 3), @@ -518,11 +519,11 @@ def find_3d_configuration(): max_features = 320 batch_size = 2 - unet = FabiansPreActUNet(input_modalities, base_num_features, blocks_per_stage_encoder, feat_map_mult_on_downscale, + unet = backend.to(FabiansPreActUNet(input_modalities, base_num_features, blocks_per_stage_encoder, feat_map_mult_on_downscale, pool_op_kernel_sizes, conv_op_kernel_sizes, get_default_network_config(3, dropout_p=None), num_classes, - blocks_per_stage_decoder, True, False, max_features=max_features).cuda() + blocks_per_stage_decoder, True, False, max_features=max_features)) - scaler = GradScaler() + scaler = backend.get_gradscaler() optimizer = SGD(unet.parameters(), lr=0.1, momentum=0.95) print(unet.compute_approx_vram_consumption(patch_size, base_num_features, max_features, input_modalities, @@ -531,13 +532,13 @@ def find_3d_configuration(): loss = DC_and_CE_loss({'batch_dice': True, 'smooth': 1e-5, 'do_bg': False}, {}) - dummy_input = torch.rand((batch_size, input_modalities, *patch_size)).cuda() 
- dummy_gt = (torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, num_classes-1).cuda().long() + dummy_input = backend.to(torch.rand((batch_size, input_modalities, *patch_size))) + dummy_gt = backend.to((torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, num_classes-1)).long() for i in range(10): optimizer.zero_grad() - with autocast(): + with backend.autocast(): skips = unet.encoder(dummy_input) print([i.shape for i in skips]) output = unet.decoder(skips)[0] @@ -548,15 +549,15 @@ def find_3d_configuration(): scaler.step(optimizer) scaler.update() - with autocast(): + with backend.autocast(): import hiddenlayer as hl g = hl.build_graph(unet, dummy_input, transforms=None) g.save("/home/fabian/test_arch.pdf") def find_2d_configuration(): - cudnn.benchmark = True - cudnn.deterministic = False + backend.set_benchmark(True) + backend.set_deterministic(False) conv_op_kernel_sizes = ((3, 3), (3, 3), @@ -583,11 +584,11 @@ def find_2d_configuration(): max_features = 512 batch_size = 50 - unet = FabiansPreActUNet(input_modalities, base_num_features, blocks_per_stage_encoder, feat_map_mult_on_downscale, + unet = backend.to(FabiansPreActUNet(input_modalities, base_num_features, blocks_per_stage_encoder, feat_map_mult_on_downscale, pool_op_kernel_sizes, conv_op_kernel_sizes, get_default_network_config(2, dropout_p=None), num_classes, - blocks_per_stage_decoder, True, False, max_features=max_features).cuda() + blocks_per_stage_decoder, True, False, max_features=max_features)) - scaler = GradScaler() + scaler = backend.get_gradscaler() optimizer = SGD(unet.parameters(), lr=0.1, momentum=0.95) print(unet.compute_approx_vram_consumption(patch_size, base_num_features, max_features, input_modalities, @@ -596,13 +597,13 @@ def find_2d_configuration(): loss = DC_and_CE_loss({'batch_dice': True, 'smooth': 1e-5, 'do_bg': False}, {}) - dummy_input = torch.rand((batch_size, input_modalities, *patch_size)).cuda() - dummy_gt = (torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, num_classes-1).cuda().long() + dummy_input = backend.to(torch.rand((batch_size, input_modalities, *patch_size))) + dummy_gt = backend.to((torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, num_classes-1)).long() for i in range(10): optimizer.zero_grad() - with autocast(): + with backend.autocast(): skips = unet.encoder(dummy_input) print([i.shape for i in skips]) output = unet.decoder(skips)[0] @@ -613,7 +614,7 @@ def find_2d_configuration(): scaler.step(optimizer) scaler.update() - with autocast(): + with backend.autocast(): import hiddenlayer as hl g = hl.build_graph(unet, dummy_input, transforms=None) g.save("/home/fabian/test_arch.pdf") diff --git a/nnunet/network_architecture/generic_modular_residual_UNet.py b/nnunet/network_architecture/generic_modular_residual_UNet.py index 503485bd0..0b50a0904 100644 --- a/nnunet/network_architecture/generic_modular_residual_UNet.py +++ b/nnunet/network_architecture/generic_modular_residual_UNet.py @@ -22,7 +22,8 @@ from nnunet.training.loss_functions.dice_loss import DC_and_CE_loss from torch import nn from torch.optim import SGD -from torch.backends import cudnn + +from nnunet.backends import backend class ResidualUNetEncoder(nn.Module): @@ -380,8 +381,8 @@ def find_3d_configuration(): # since this is more parameter intensive than the UNet, we will test a configuration that has a lot of parameters # herefore we copy the UNet configuration for Task005_Prostate - cudnn.deterministic = False - cudnn.benchmark = 
True
+    backend.set_deterministic(False)
+    backend.set_benchmark(True)
 
     patch_size = (20, 320, 256)
     max_num_features = 320
@@ -411,17 +412,17 @@ def find_3d_configuration():
                               [3, 3, 3],
                               [3, 3, 3]]
 
-    unet = FabiansUNet(num_modalities, initial_num_features, blocks_per_stage_encoder[:len(conv_op_kernel_sizes)], 2,
+    unet = backend.to(FabiansUNet(num_modalities, initial_num_features, blocks_per_stage_encoder[:len(conv_op_kernel_sizes)], 2,
                        pool_op_kernel_sizes, conv_op_kernel_sizes,
                        get_default_network_config(3, dropout_p=None), num_classes,
                        blocks_per_stage_decoder[:len(conv_op_kernel_sizes)-1], False, False,
-                       max_features=max_num_features).cuda()
+                       max_features=max_num_features))
 
     optimizer = SGD(unet.parameters(), lr=0.1, momentum=0.95)
 
     loss = DC_and_CE_loss({'batch_dice': True, 'smooth': 1e-5, 'do_bg': False}, {})
 
-    dummy_input = torch.rand((batch_size, num_modalities, *patch_size)).cuda()
-    dummy_gt = (torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, 2).cuda().long()
+    dummy_input = backend.to(torch.rand((batch_size, num_modalities, *patch_size)))
+    dummy_gt = backend.to((torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, 2)).long()
 
     for _ in range(20):
         optimizer.zero_grad()
@@ -434,7 +435,7 @@ def find_3d_configuration():
         optimizer.step()
 
         if _ == 0:
-            torch.cuda.empty_cache()
+            backend.empty_cache()
 
     # that should do. Now take the network hyperparameters and insert them in FabiansUNet.compute_approx_vram_consumption
     # whatever number this spits out, save it to FabiansUNet.use_this_for_batch_size_computation_3D
@@ -454,8 +455,8 @@ def find_2d_configuration():
     # since this is more parameter intensive than the UNet, we will test a configuration that has a lot of parameters
     # herefore we copy the UNet configuration for Task003_Liver
 
-    cudnn.deterministic = False
-    cudnn.benchmark = True
+    backend.set_deterministic(False)
+    backend.set_benchmark(True)
 
     patch_size = (512, 512)
     max_num_features = 512
@@ -487,17 +488,17 @@ def find_2d_configuration():
                               [3, 3],
                               [3, 3]]
 
-    unet = FabiansUNet(num_modalities, initial_num_features, blocks_per_stage_encoder[:len(conv_op_kernel_sizes)], 2,
+    unet = backend.to(FabiansUNet(num_modalities, initial_num_features, blocks_per_stage_encoder[:len(conv_op_kernel_sizes)], 2,
                        pool_op_kernel_sizes, conv_op_kernel_sizes,
                        get_default_network_config(2, dropout_p=None), num_classes,
                        blocks_per_stage_decoder[:len(conv_op_kernel_sizes)-1], False, False,
-                       max_features=max_num_features).cuda()
+                       max_features=max_num_features))
 
     optimizer = SGD(unet.parameters(), lr=0.1, momentum=0.95)
 
     loss = DC_and_CE_loss({'batch_dice': True, 'smooth': 1e-5, 'do_bg': False}, {})
 
-    dummy_input = torch.rand((batch_size, num_modalities, *patch_size)).cuda()
-    dummy_gt = (torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, 2).cuda().long()
+    dummy_input = backend.to(torch.rand((batch_size, num_modalities, *patch_size)))
+    dummy_gt = backend.to((torch.rand((batch_size, 1, *patch_size)) * num_classes).round().clamp_(0, 2)).long()
 
     for _ in range(20):
         optimizer.zero_grad()
@@ -510,7 +511,7 @@ def find_2d_configuration():
         optimizer.step()
 
         if _ == 0:
-            torch.cuda.empty_cache()
+            backend.empty_cache()
 
     # that should do. Now take the network hyperparameters and insert them in FabiansUNet.compute_approx_vram_consumption
     # whatever number this spits out, save it to FabiansUNet.use_this_for_batch_size_computation_2D
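All of the `.cuda()` call sites above collapse into `backend.to`, which accepts a single tensor, a list of tensors, or a whole `nn.Module`, with the device index as the second (`gpu_id`) argument so the old `to_cuda(x, gpu_id=...)` call sites map over one to one. A quick illustrative check (shapes and modules here are arbitrary, not from the patch):

```python
import torch
from torch import nn
from nnunet.backends import backend

t = backend.to(torch.zeros(4))                        # single tensor
ts = backend.to([torch.zeros(4), torch.ones(4)])      # list of tensors
net = backend.to(nn.Linear(4, 2))                     # whole module
print(backend.name(), t.device, [x.device for x in ts])
```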
diff --git a/nnunet/network_architecture/neural_network.py b/nnunet/network_architecture/neural_network.py
index 7cd69dbbf..1a12bb4ff 100644
--- a/nnunet/network_architecture/neural_network.py
+++ b/nnunet/network_architecture/neural_network.py
@@ -16,13 +16,13 @@
 import numpy as np
 from batchgenerators.augmentations.utils import pad_nd_image
 from nnunet.utilities.random_stuff import no_op
-from nnunet.utilities.to_torch import to_cuda, maybe_to_torch
+from nnunet.utilities.to_torch import maybe_to_torch
 from torch import nn
 import torch
 from scipy.ndimage.filters import gaussian_filter
 from typing import Union, Tuple, List
 
-from torch.cuda.amp import autocast
+from nnunet.backends import backend
 
 
 class NeuralNetwork(nn.Module):
@@ -39,7 +39,7 @@ def set_device(self, device):
         if device == "cpu":
             self.cpu()
         else:
-            self.cuda(device)
+            backend.to(self, gpu_id=device)
 
     def forward(self, x):
         raise NotImplementedError
@@ -108,7 +108,7 @@ def predict_3D(self, x: np.ndarray, do_mirroring: bool, mirror_axes: Tuple[int,
         :param mixed_precision: if True, will run inference in mixed precision with autocast()
         :return:
         """
-        torch.cuda.empty_cache()
+        backend.empty_cache()
 
         assert step_size <= 1, 'step_size must be smaller than 1. Otherwise there will be a gap between consecutive ' \
                                'predictions'
@@ -134,7 +134,7 @@ def predict_3D(self, x: np.ndarray, do_mirroring: bool, mirror_axes: Tuple[int,
         assert len(x.shape) == 4, "data must have shape (c,x,y,z)"
 
         if mixed_precision:
-            context = autocast
+            context = backend.autocast
         else:
             context = no_op
@@ -198,7 +198,7 @@ def predict_2D(self, x, do_mirroring: bool, mirror_axes: tuple = (0, 1, 2), use_
         :param verbose: Do you want a wall of text? If yes then set this to True
         :return:
         """
-        torch.cuda.empty_cache()
+        backend.empty_cache()
 
         assert step_size <= 1, 'step_size must be smaler than 1. 
Otherwise there will be a gap between consecutive ' \ 'predictions' @@ -223,7 +223,7 @@ def predict_2D(self, x, do_mirroring: bool, mirror_axes: tuple = (0, 1, 2), use_ assert len(x.shape) == 3, "data must have shape (c,x,y)" if mixed_precision: - context = autocast + context = backend.autocast else: context = no_op @@ -329,8 +329,8 @@ def _internal_predict_3D_3Dconv_tiled(self, x: np.ndarray, step_size: float, do_ gaussian_importance_map = torch.from_numpy(gaussian_importance_map) #predict on cpu if cuda not available - if torch.cuda.is_available(): - gaussian_importance_map = gaussian_importance_map.cuda(self.get_device(), non_blocking=True) + if backend.is_available(): + gaussian_importance_map = backend.to(gaussian_importance_map, self.get_device(), non_blocking=True) else: gaussian_importance_map = None @@ -357,7 +357,7 @@ def _internal_predict_3D_3Dconv_tiled(self, x: np.ndarray, step_size: float, do_ device=self.get_device()) if verbose: print("moving data to GPU") - data = torch.from_numpy(data).cuda(self.get_device(), non_blocking=True) + data = backend.to(torch.from_numpy(data), self.get_device(), non_blocking=True) if verbose: print("initializing result_numsamples (on GPU)") aggregated_nb_of_predictions = torch.zeros([self.num_classes] + list(data.shape[1:]), dtype=torch.half, @@ -511,14 +511,14 @@ def _internal_maybe_mirror_and_pred_3D(self, x: Union[np.ndarray, torch.tensor], result_torch = torch.zeros([1, self.num_classes] + list(x.shape[2:]), dtype=torch.float) - if torch.cuda.is_available(): - x = to_cuda(x, gpu_id=self.get_device()) - result_torch = result_torch.cuda(self.get_device(), non_blocking=True) + if backend.is_available(): + x = backend.to(x, gpu_id=self.get_device()) + result_torch = backend.to(result_torch, self.get_device(), non_blocking=True) if mult is not None: mult = maybe_to_torch(mult) - if torch.cuda.is_available(): - mult = to_cuda(mult, gpu_id=self.get_device()) + if backend.is_available(): + mult = backend.to(mult, gpu_id=self.get_device()) if do_mirroring: mirror_idx = 8 @@ -577,14 +577,14 @@ def _internal_maybe_mirror_and_pred_2D(self, x: Union[np.ndarray, torch.tensor], x = maybe_to_torch(x) result_torch = torch.zeros([x.shape[0], self.num_classes] + list(x.shape[2:]), dtype=torch.float) - if torch.cuda.is_available(): - x = to_cuda(x, gpu_id=self.get_device()) - result_torch = result_torch.cuda(self.get_device(), non_blocking=True) + if backend.is_available(): + x = backend.to(x, gpu_id=self.get_device()) + result_torch = backend.to(result_torch, self.get_device(), non_blocking=True) if mult is not None: mult = maybe_to_torch(mult) - if torch.cuda.is_available(): - mult = to_cuda(mult, gpu_id=self.get_device()) + if backend.is_available(): + mult = backend.to(mult, gpu_id=self.get_device()) if do_mirroring: mirror_idx = 4 @@ -657,8 +657,8 @@ def _internal_predict_2D_2Dconv_tiled(self, x: np.ndarray, step_size: float, do_ gaussian_importance_map = self._gaussian_2d gaussian_importance_map = torch.from_numpy(gaussian_importance_map) - if torch.cuda.is_available(): - gaussian_importance_map = gaussian_importance_map.cuda(self.get_device(), non_blocking=True) + if backend.is_available(): + gaussian_importance_map = backend.to(gaussian_importance_map, self.get_device(), non_blocking=True) else: gaussian_importance_map = None @@ -685,7 +685,7 @@ def _internal_predict_2D_2Dconv_tiled(self, x: np.ndarray, step_size: float, do_ device=self.get_device()) if verbose: print("moving data to GPU") - data = torch.from_numpy(data).cuda(self.get_device(), 
non_blocking=True)
+            data = backend.to(torch.from_numpy(data), self.get_device(), non_blocking=True)
 
             if verbose: print("initializing result_numsamples (on GPU)")
             aggregated_nb_of_predictions = torch.zeros([self.num_classes] + list(data.shape[1:]), dtype=torch.half,
diff --git a/nnunet/run/run_training.py b/nnunet/run/run_training.py
index f44a1e273..f8cf546ee 100644
--- a/nnunet/run/run_training.py
+++ b/nnunet/run/run_training.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# ensure that ipex extensions are loaded in the proper order
+from nnunet.backends import backend
 import argparse
 from batchgenerators.utilities.file_and_folder_operations import *
diff --git a/nnunet/training/network_training/competitions_with_custom_Trainers/BraTS2020/nnUNetTrainerV2BraTSRegions.py b/nnunet/training/network_training/competitions_with_custom_Trainers/BraTS2020/nnUNetTrainerV2BraTSRegions.py
index a462c1375..d6c32cd79 100644
--- a/nnunet/training/network_training/competitions_with_custom_Trainers/BraTS2020/nnUNetTrainerV2BraTSRegions.py
+++ b/nnunet/training/network_training/competitions_with_custom_Trainers/BraTS2020/nnUNetTrainerV2BraTSRegions.py
@@ -33,7 +33,9 @@
 from nnunet.training.network_training.nnUNetTrainerV2 import nnUNetTrainerV2
 from nnunet.training.network_training.nnUNetTrainerV2_DDP import nnUNetTrainerV2_DDP
 from nnunet.utilities.distributed import awesome_allgather_function
-from nnunet.utilities.to_torch import maybe_to_torch, to_cuda
+from nnunet.utilities.to_torch import maybe_to_torch
+
+from nnunet.backends import backend
 
 
 class nnUNetTrainerV2BraTSRegions_BN(nnUNetTrainerV2):
@@ -331,9 +333,9 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=
         data = maybe_to_torch(data)
         target = maybe_to_torch(target)
 
-        if torch.cuda.is_available():
-            data = to_cuda(data, gpu_id=None)
-            target = to_cuda(target, gpu_id=None)
+        if backend.is_available():
+            data = backend.to(data, gpu_id=None)
+            target = backend.to(target, gpu_id=None)
 
         self.optimizer.zero_grad()
diff --git a/nnunet/training/network_training/network_trainer.py b/nnunet/training/network_training/network_trainer.py
index abba3067c..2a7944309 100644
--- a/nnunet/training/network_training/network_trainer.py
+++ b/nnunet/training/network_training/network_trainer.py
@@ -21,9 +21,10 @@
 from nnunet.network_architecture.neural_network import SegmentationNetwork
 from sklearn.model_selection import KFold
 from torch import nn
-from torch.cuda.amp import GradScaler, autocast
 from torch.optim.lr_scheduler import _LRScheduler
 
+from nnunet.backends import backend
+
 matplotlib.use("agg")
 from time import time, sleep
import torch @@ -32,11 +33,10 @@ import matplotlib.pyplot as plt import sys from collections import OrderedDict -import torch.backends.cudnn as cudnn from abc import abstractmethod from datetime import datetime from tqdm import trange -from nnunet.utilities.to_torch import maybe_to_torch, to_cuda +from nnunet.utilities.to_torch import maybe_to_torch class NetworkTrainer(object): @@ -57,18 +57,18 @@ def __init__(self, deterministic=True, fp16=False): - predict_test_case """ self.fp16 = fp16 - self.amp_grad_scaler = None + self.grad_scaler = None if deterministic: np.random.seed(12345) torch.manual_seed(12345) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(12345) - cudnn.deterministic = True - torch.backends.cudnn.benchmark = False + if backend.is_available(): + backend.manual_seed_all(12345) + backend.set_deterministic(True) + backend.set_benchmark(False) else: - cudnn.deterministic = False - torch.backends.cudnn.benchmark = True + backend.set_deterministic(False) + backend.set_benchmark(True) ################# SET THESE IN self.initialize() ################################### self.network: Tuple[SegmentationNetwork, nn.DataParallel] = None @@ -281,8 +281,8 @@ def save_checkpoint(self, fname, save_optimizer=True): 'plot_stuff': (self.all_tr_losses, self.all_val_losses, self.all_val_losses_tr_mode, self.all_val_eval_metrics), 'best_stuff' : (self.best_epoch_based_on_MA_tr_loss, self.best_MA_tr_loss_for_patience, self.best_val_eval_criterion_MA)} - if self.amp_grad_scaler is not None: - save_this['amp_grad_scaler'] = self.amp_grad_scaler.state_dict() + if self.grad_scaler is not None: + save_this['amp_grad_scaler'] = self.grad_scaler.state_dict() torch.save(save_this, fname) self.print_to_log_file("done, saving took %.2f seconds" % (time() - start_time)) @@ -360,7 +360,7 @@ def load_checkpoint_ram(self, checkpoint, train=True): self._maybe_init_amp() if train: if 'amp_grad_scaler' in checkpoint.keys(): - self.amp_grad_scaler.load_state_dict(checkpoint['amp_grad_scaler']) + self.grad_scaler.load_state_dict(checkpoint['amp_grad_scaler']) self.network.load_state_dict(new_state_dict) self.epoch = checkpoint['epoch'] @@ -400,8 +400,8 @@ def load_checkpoint_ram(self, checkpoint, train=True): self._maybe_init_amp() def _maybe_init_amp(self): - if self.fp16 and self.amp_grad_scaler is None: - self.amp_grad_scaler = GradScaler() + if self.fp16 and self.grad_scaler is None: + self.grad_scaler = backend.get_gradscaler() def plot_network_architecture(self): """ @@ -412,23 +412,23 @@ def plot_network_architecture(self): pass def run_training(self): - if not torch.cuda.is_available(): - self.print_to_log_file("WARNING!!! You are attempting to run training on a CPU (torch.cuda.is_available() is False). This can be VERY slow!") + if not backend.is_available(): + self.print_to_log_file("WARNING!!! You are attempting to run training on a CPU (no CUDA/XPU accelerator found). This can be VERY slow!") _ = self.tr_gen.next() _ = self.val_gen.next() - if torch.cuda.is_available(): - torch.cuda.empty_cache() + if backend.is_available(): + backend.empty_cache() self._maybe_init_amp() maybe_mkdir_p(self.output_folder) self.plot_network_architecture() - if cudnn.benchmark and cudnn.deterministic: - warn("torch.backends.cudnn.deterministic is True indicating a deterministic training is desired. " - "But torch.backends.cudnn.benchmark is True as well and this will prevent deterministic training! 
" + if backend.is_benchmark() and backend.is_deterministic(): + warn(f"{backend.name()}.deterministic is True indicating a deterministic training is desired. " + f"But {backend.name()}.benchmark is True as well and this will prevent deterministic training! " "If you want deterministic then set benchmark=False") if not self.was_initialized: @@ -632,22 +632,27 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation= data = maybe_to_torch(data) target = maybe_to_torch(target) - if torch.cuda.is_available(): - data = to_cuda(data) - target = to_cuda(target) + if backend.is_available(): + data = backend.to(data) + target = backend.to(target) self.optimizer.zero_grad() if self.fp16: - with autocast(): + with backend.autocast(dtype=torch.float16): output = self.network(data) del data l = self.loss(output, target) - if do_backprop: - self.amp_grad_scaler.scale(l).backward() - self.amp_grad_scaler.step(self.optimizer) - self.amp_grad_scaler.update() + if do_backprop and self.grad_scaler: + # not all accelerators have a grad scaler implementation that is used in this way + # e.g. at the time of writing CUDA supports GradScaler and Intel XPU does not. + self.grad_scaler.scale(l).backward() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + elif do_backprop: + l.backward() + self.optimizer.step() else: output = self.network(data) del data diff --git a/nnunet/training/network_training/nnUNetTrainer.py b/nnunet/training/network_training/nnUNetTrainer.py index 393eaa3f8..74554f297 100644 --- a/nnunet/training/network_training/nnUNetTrainer.py +++ b/nnunet/training/network_training/nnUNetTrainer.py @@ -43,6 +43,8 @@ from nnunet.utilities.nd_softmax import softmax_helper from nnunet.utilities.tensor_utilities import sum_tensor +from nnunet.backends import backend + matplotlib.use("agg") @@ -260,13 +262,13 @@ def initialize_network(self): self.net_num_pool_op_kernel_sizes, self.net_conv_kernel_sizes, False, True, True) self.network.inference_apply_nonlin = softmax_helper - if torch.cuda.is_available(): - self.network.cuda() + if backend.is_available(): + self.network = backend.to(self.network) def initialize_optimizer_and_scheduler(self): assert self.network is not None, "self.initialize_network must be called first" - self.optimizer = torch.optim.Adam(self.network.parameters(), self.initial_lr, weight_decay=self.weight_decay, - amsgrad=True) + self.network, self.optimizer = backend.optimizer(model=self.network, optimizer=torch.optim.Adam(self.network.parameters(), self.initial_lr, weight_decay=self.weight_decay, + amsgrad=True)) self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='min', factor=0.2, patience=self.lr_scheduler_patience, verbose=True, threshold=self.lr_scheduler_eps, @@ -276,8 +278,8 @@ def plot_network_architecture(self): try: from batchgenerators.utilities.file_and_folder_operations import join import hiddenlayer as hl - if torch.cuda.is_available(): - g = hl.build_graph(self.network, torch.rand((1, self.num_input_channels, *self.patch_size)).cuda(), + if backend.is_available(): + g = hl.build_graph(self.network, backend.to(torch.rand((1, self.num_input_channels, *self.patch_size))), transforms=None) else: g = hl.build_graph(self.network, torch.rand((1, self.num_input_channels, *self.patch_size)), @@ -292,8 +294,8 @@ def plot_network_architecture(self): self.print_to_log_file(self.network) self.print_to_log_file("\n") finally: - if torch.cuda.is_available(): - torch.cuda.empty_cache() + if backend.is_available(): + 
backend.empty_cache() def save_debug_information(self): # saving some debug information diff --git a/nnunet/training/network_training/nnUNetTrainerV2.py b/nnunet/training/network_training/nnUNetTrainerV2.py index e5e77e265..ac3bdec47 100644 --- a/nnunet/training/network_training/nnUNetTrainerV2.py +++ b/nnunet/training/network_training/nnUNetTrainerV2.py @@ -20,7 +20,7 @@ import torch from nnunet.training.data_augmentation.data_augmentation_moreDA import get_moreDA_augmentation from nnunet.training.loss_functions.deep_supervision import MultipleOutputLoss2 -from nnunet.utilities.to_torch import maybe_to_torch, to_cuda +from nnunet.utilities.to_torch import maybe_to_torch from nnunet.network_architecture.generic_UNet import Generic_UNet from nnunet.network_architecture.initialization import InitWeights_He from nnunet.network_architecture.neural_network import SegmentationNetwork @@ -31,7 +31,7 @@ from nnunet.utilities.nd_softmax import softmax_helper from sklearn.model_selection import KFold from torch import nn -from torch.cuda.amp import autocast +from nnunet.backends import backend, is_backend_cuda from nnunet.training.learning_rate.poly_lr import poly_lr from batchgenerators.utilities.file_and_folder_operations import * @@ -87,6 +87,7 @@ def initialize(self, training=True, force_load_plans=False): self.ds_loss_weights = weights # now wrap the loss self.loss = MultipleOutputLoss2(self.loss, self.ds_loss_weights) + backend.to(self.loss) ################# END ################### self.folder_with_preprocessed_data = join(self.dataset_directory, self.plans['data_identifier'] + @@ -157,14 +158,16 @@ def initialize_network(self): dropout_op_kwargs, net_nonlin, net_nonlin_kwargs, True, False, lambda x: x, InitWeights_He(1e-2), self.net_num_pool_op_kernel_sizes, self.net_conv_kernel_sizes, False, True, True) - if torch.cuda.is_available(): - self.network.cuda() + + if backend.is_available(): + self.network = backend.to(self.network) + self.network.inference_apply_nonlin = softmax_helper def initialize_optimizer_and_scheduler(self): assert self.network is not None, "self.initialize_network must be called first" - self.optimizer = torch.optim.SGD(self.network.parameters(), self.initial_lr, weight_decay=self.weight_decay, - momentum=0.99, nesterov=True) + self.network, self.optimizer = backend.optimizer(model=self.network, optimizer=torch.optim.SGD(self.network.parameters(), self.initial_lr, weight_decay=self.weight_decay, + momentum=0.99, nesterov=True)) self.lr_scheduler = None def run_online_evaluation(self, output, target): @@ -236,24 +239,28 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation= data = maybe_to_torch(data) target = maybe_to_torch(target) - if torch.cuda.is_available(): - data = to_cuda(data) - target = to_cuda(target) + if backend.is_available(): + data = backend.to(data) + target = backend.to(target) self.optimizer.zero_grad() if self.fp16: - with autocast(): + with backend.autocast(): output = self.network(data) del data l = self.loss(output, target) - if do_backprop: - self.amp_grad_scaler.scale(l).backward() - self.amp_grad_scaler.unscale_(self.optimizer) + if do_backprop and self.grad_scaler: + self.grad_scaler.scale(l).backward() + self.grad_scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) - self.amp_grad_scaler.step(self.optimizer) - self.amp_grad_scaler.update() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + elif do_backprop: + l.backward() + 
torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) + self.optimizer.step() else: output = self.network(data) del data diff --git a/nnunet/training/network_training/nnUNetTrainerV2_DDP.py b/nnunet/training/network_training/nnUNetTrainerV2_DDP.py index 737bf2fb8..5aa05173d 100644 --- a/nnunet/training/network_training/nnUNetTrainerV2_DDP.py +++ b/nnunet/training/network_training/nnUNetTrainerV2_DDP.py @@ -37,14 +37,14 @@ from nnunet.utilities.distributed import awesome_allgather_function from nnunet.utilities.nd_softmax import softmax_helper from nnunet.utilities.tensor_utilities import sum_tensor -from nnunet.utilities.to_torch import to_cuda, maybe_to_torch +from nnunet.utilities.to_torch import maybe_to_torch from torch import nn, distributed -from torch.backends import cudnn -from torch.cuda.amp import autocast from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim.lr_scheduler import _LRScheduler from tqdm import trange +from nnunet.backends import backend + class nnUNetTrainerV2_DDP(nnUNetTrainerV2): def __init__(self, plans_file, fold, local_rank, output_folder=None, dataset_directory=None, batch_dice=True, @@ -58,12 +58,12 @@ def __init__(self, plans_file, fold, local_rank, output_folder=None, dataset_dir self.distribute_batch_size = distribute_batch_size np.random.seed(local_rank) torch.manual_seed(local_rank) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(local_rank) + if backend.is_available(): + backend.manual_seed_all(local_rank) self.local_rank = local_rank - if torch.cuda.is_available(): - torch.cuda.set_device(local_rank) + if backend.is_available(): + backend.set_device(local_rank) dist.init_process_group(backend='nccl', init_method='env://') self.loss = None @@ -209,24 +209,28 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation= data = maybe_to_torch(data) target = maybe_to_torch(target) - if torch.cuda.is_available(): - data = to_cuda(data, gpu_id=None) - target = to_cuda(target, gpu_id=None) + if backend.is_available(): + data = backend.to(data, gpu_id=None) + target = backend.to(target, gpu_id=None) self.optimizer.zero_grad() if self.fp16: - with autocast(): + with backend.autocast(): output = self.network(data) del data l = self.compute_loss(output, target) - if do_backprop: - self.amp_grad_scaler.scale(l).backward() - self.amp_grad_scaler.unscale_(self.optimizer) + if do_backprop and self.grad_scaler: + self.grad_scaler.scale(l).backward() + self.grad_scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) - self.amp_grad_scaler.step(self.optimizer) - self.amp_grad_scaler.update() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + elif do_backprop: + l.backward() + torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) + self.optimizer.step() else: output = self.network(data) del data @@ -323,8 +327,8 @@ def run_training(self): if self.local_rank == 0: self.save_debug_information() - if not torch.cuda.is_available(): - self.print_to_log_file("WARNING!!! You are attempting to run training on a CPU (torch.cuda.is_available() is False). This can be VERY slow!") + if not backend.is_available(): + self.print_to_log_file("WARNING!!! You are attempting to run training on a CPU (no CUDA/XPU accelerator found). 
This can be VERY slow!") self.maybe_update_lr(self.epoch) # if we dont overwrite epoch then self.epoch+1 is used which is not what we # want at the start of the training @@ -338,17 +342,17 @@ def run_training(self): _ = self.tr_gen.next() _ = self.val_gen.next() - if torch.cuda.is_available(): - torch.cuda.empty_cache() + if backend.is_available(): + backend.empty_cache() self._maybe_init_amp() maybe_mkdir_p(self.output_folder) self.plot_network_architecture() - if cudnn.benchmark and cudnn.deterministic: - warn("torch.backends.cudnn.deterministic is True indicating a deterministic training is desired. " - "But torch.backends.cudnn.benchmark is True as well and this will prevent deterministic training! " + if backend.is_benchmark() and backend.is_deterministic(): + warn(f"{backend.name()}.deterministic is True indicating a deterministic training is desired. " + f"But {backend.name()}.benchmark is True as well and this will prevent deterministic training! " "If you want deterministic then set benchmark=False") if not self.was_initialized: @@ -653,7 +657,7 @@ def load_checkpoint_ram(self, checkpoint, train=True): if self.fp16: self._maybe_init_amp() if 'amp_grad_scaler' in checkpoint.keys(): - self.amp_grad_scaler.load_state_dict(checkpoint['amp_grad_scaler']) + self.grad_scaler.load_state_dict(checkpoint['amp_grad_scaler']) self.network.load_state_dict(new_state_dict) self.epoch = checkpoint['epoch'] diff --git a/nnunet/training/network_training/nnUNetTrainerV2_DP.py b/nnunet/training/network_training/nnUNetTrainerV2_DP.py index 0af5c9867..bbd5ca739 100644 --- a/nnunet/training/network_training/nnUNetTrainerV2_DP.py +++ b/nnunet/training/network_training/nnUNetTrainerV2_DP.py @@ -19,16 +19,17 @@ from nnunet.network_architecture.generic_UNet_DP import Generic_UNet_DP from nnunet.training.data_augmentation.data_augmentation_moreDA import get_moreDA_augmentation from nnunet.training.network_training.nnUNetTrainerV2 import nnUNetTrainerV2 -from nnunet.utilities.to_torch import maybe_to_torch, to_cuda +from nnunet.utilities.to_torch import maybe_to_torch from nnunet.network_architecture.initialization import InitWeights_He from nnunet.network_architecture.neural_network import SegmentationNetwork from nnunet.training.dataloading.dataset_loading import unpack_dataset from nnunet.training.network_training.nnUNetTrainer import nnUNetTrainer from nnunet.utilities.nd_softmax import softmax_helper from torch import nn -from torch.cuda.amp import autocast from torch.nn.parallel.data_parallel import DataParallel +from nnunet.backends import backend + class nnUNetTrainerV2_DP(nnUNetTrainerV2): def __init__(self, plans_file, fold, output_folder=None, dataset_directory=None, batch_dice=True, stage=None, @@ -144,14 +145,16 @@ def initialize_network(self): self.conv_per_stage, 2, conv_op, norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs, net_nonlin, net_nonlin_kwargs, True, False, InitWeights_He(1e-2), self.net_num_pool_op_kernel_sizes, self.net_conv_kernel_sizes, False, True, True) - if torch.cuda.is_available(): - self.network.cuda() + + if backend.is_available(): + self.network = backend.to(self.network) + self.network.inference_apply_nonlin = softmax_helper def initialize_optimizer_and_scheduler(self): assert self.network is not None, "self.initialize_network must be called first" - self.optimizer = torch.optim.SGD(self.network.parameters(), self.initial_lr, weight_decay=self.weight_decay, - momentum=0.99, nesterov=True) + self.network, self.optimizer = backend.optimizer(model=self.network, 
optimizer=torch.optim.SGD(self.network.parameters(), self.initial_lr, weight_decay=self.weight_decay, + momentum=0.99, nesterov=True)) self.lr_scheduler = None def run_training(self): @@ -175,14 +178,14 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation= data = maybe_to_torch(data) target = maybe_to_torch(target) - if torch.cuda.is_available(): - data = to_cuda(data) - target = to_cuda(target) + if backend.is_available(): + data = backend.to(data) + target = backend.to(target) self.optimizer.zero_grad() if self.fp16: - with autocast(): + with backend.autocast(): ret = self.network(data, target, return_hard_tp_fp_fn=run_online_evaluation) if run_online_evaluation: ces, tps, fps, fns, tp_hard, fp_hard, fn_hard = ret @@ -192,12 +195,18 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation= del data, target l = self.compute_loss(ces, tps, fps, fns) - if do_backprop: - self.amp_grad_scaler.scale(l).backward() - self.amp_grad_scaler.unscale_(self.optimizer) + if do_backprop and self.grad_scaler: + # not all architectures have a grad scaler at the time of writing this code + # e.g. CUDA has one, but Intel XPU has not + self.grad_scaler.scale(l).backward() + self.grad_scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) - self.amp_grad_scaler.step(self.optimizer) - self.amp_grad_scaler.update() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + elif do_backprop: + l.backward() + torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) + self.optimizer.step() else: ret = self.network(data, target, return_hard_tp_fp_fn=run_online_evaluation) if run_online_evaluation: @@ -253,4 +262,4 @@ def compute_loss(self, ces, tps, fps, fns): else: loss += self.loss_weights[i] * (ces[i].mean() + dice_loss) ########### - return loss \ No newline at end of file + return loss diff --git a/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_2epochs.py b/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_2epochs.py index 296ca7f4e..60038777a 100644 --- a/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_2epochs.py +++ b/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_2epochs.py @@ -120,11 +120,11 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation= l = self.loss(output, target) if do_backprop: - self.amp_grad_scaler.scale(l).backward() - self.amp_grad_scaler.unscale_(self.optimizer) + self.grad_scaler.scale(l).backward() + self.grad_scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) - self.amp_grad_scaler.step(self.optimizer) - self.amp_grad_scaler.update() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() else: output = self.network(data) del data @@ -194,11 +194,11 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation= l = self.loss(output, target) if do_backprop: - self.amp_grad_scaler.scale(l).backward() - self.amp_grad_scaler.unscale_(self.optimizer) + self.grad_scaler.scale(l).backward() + self.grad_scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) - self.amp_grad_scaler.step(self.optimizer) - self.amp_grad_scaler.update() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() else: output = self.network(data) del data @@ -270,11 +270,11 @@ def run_iteration(self, data_generator, do_backprop=True, 
run_online_evaluation=
                 l = self.compute_loss(output, target)
 
             if do_backprop:
-                self.amp_grad_scaler.scale(l).backward()
-                self.amp_grad_scaler.unscale_(self.optimizer)
+                self.grad_scaler.scale(l).backward()
+                self.grad_scaler.unscale_(self.optimizer)
                 torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
-                self.amp_grad_scaler.step(self.optimizer)
-                self.amp_grad_scaler.update()
+                self.grad_scaler.step(self.optimizer)
+                self.grad_scaler.update()
         else:
             output = self.network(data)
             del data
diff --git a/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_dummyLoad.py b/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_dummyLoad.py
index 355857a9e..fc0d333f6 100644
--- a/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_dummyLoad.py
+++ b/nnunet/training/network_training/nnUNet_variants/benchmarking/nnUNetTrainerV2_dummyLoad.py
@@ -43,11 +43,11 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=
                 l = self.loss(output, target)
 
             if do_backprop:
-                self.amp_grad_scaler.scale(l).backward()
-                self.amp_grad_scaler.unscale_(self.optimizer)
+                self.grad_scaler.scale(l).backward()
+                self.grad_scaler.unscale_(self.optimizer)
                 torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
-                self.amp_grad_scaler.step(self.optimizer)
-                self.amp_grad_scaler.update()
+                self.grad_scaler.step(self.optimizer)
+                self.grad_scaler.update()
         else:
             output = self.network(data)
             del data
@@ -118,11 +118,11 @@ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=
                 l = self.loss(output, target)
 
             if do_backprop:
-                self.amp_grad_scaler.scale(l).backward()
-                self.amp_grad_scaler.unscale_(self.optimizer)
+                self.grad_scaler.scale(l).backward()
+                self.grad_scaler.unscale_(self.optimizer)
                 torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
-                self.amp_grad_scaler.step(self.optimizer)
-                self.amp_grad_scaler.update()
+                self.grad_scaler.step(self.optimizer)
+                self.grad_scaler.update()
         else:
             output = self.network(data)
             del data
diff --git a/nnunet/utilities/to_torch.py b/nnunet/utilities/to_torch.py
index ab68035eb..9ed07ebe4 100644
--- a/nnunet/utilities/to_torch.py
+++ b/nnunet/utilities/to_torch.py
@@ -21,11 +21,3 @@ def maybe_to_torch(d):
     elif not isinstance(d, torch.Tensor):
         d = torch.from_numpy(d).float()
     return d
-
-
-def to_cuda(data, non_blocking=True, gpu_id=0):
-    if isinstance(data, list):
-        data = [i.cuda(gpu_id, non_blocking=non_blocking) for i in data]
-    else:
-        data = data.cuda(gpu_id, non_blocking=non_blocking)
-    return data
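Before kicking off a long run it is worth confirming which adapter `AutoBackend` actually selected; a small smoke test (not part of the patch) along these lines:

```python
import torch
from nnunet.backends import backend, is_backend_xpu

print(backend.name())          # e.g. 'autobackend.torch.backends.cudnn' on a CUDA box
print(backend.is_available())  # False means the CPU mock backend is active
x = backend.to(torch.rand(2, 3))
print(x.device)                # cuda:0, xpu:0 or cpu
if is_backend_xpu(backend):
    print('Intel XPU active: float16 autocast requests are promoted to bfloat16')
```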