repro_mod_tracker.py
import os
import torch
from torch import nn
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.utils.flop_counter import FlopCounterMode
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.testing._internal.distributed.fake_pg import FakeStore
from torch.utils.viz._cycles import warn_tensor_cycles
from torch.distributed._composable.checkpoint_activation import checkpoint
from torch.distributed._tools.mod_tracker import ModTracker


class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        x = self.pool(self.relu(self.conv(x)))
        return x


class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        self.conv_block1 = ConvBlock(3, 8)
        self.conv_block2 = ConvBlock(8, 16)
        self.conv_block3 = ConvBlock(16, 32)
        self.flatten = nn.Flatten()
        # Three pooling layers halve 224 -> 112 -> 56 -> 28, so 32 * 28 * 28 = 25088 input features
        self.mlp1 = nn.Linear(25088, 128)
        self.relu = nn.ReLU()
        self.mlp2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.flatten(x)
        x = self.relu(self.mlp1(x))
        x = self.mlp2(x)
        return x


def main(rank):
    torch.cuda.set_device(rank)
    use_ac = True
    # warn_tensor_cycles()
    model = SimpleCNN().cuda()
    if use_ac:
        # Apply composable activation checkpointing to every ConvBlock
        for module in model.modules():
            if isinstance(module, ConvBlock):
                checkpoint(module)
    model = FSDP(model)
    inputs = torch.randn(512, 3, 224, 224).cuda()
    # with FlopCounterMode() as flop_counter:
    with ModTracker():
        for i in range(1000):
            if i % 100 == 0 and rank == 0:
                print(f"i={i}")
                print(f"Memory: {torch.cuda.memory_allocated() / 2**30} GiB")
            pred = model(inputs)
            pred.sum().backward()


if __name__ == "__main__":
    # Initialize the distributed process group
    # os.environ["MASTER_ADDR"] = "localhost"
    # os.environ["MASTER_PORT"] = "12355"
    # dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    world_size = 8
    rank = 0
    store = FakeStore()
    torch.distributed.init_process_group(
        "fake", rank=0, world_size=world_size, store=store
    )
    main(rank)
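
The script imports FlopCounterMode but leaves it commented out in favor of ModTracker. As a point of comparison, here is a minimal single-GPU sketch, not part of the original repro, of what that alternative instrumentation could look like; it assumes the SimpleCNN class defined above and omits FSDP and activation checkpointing:

# Hedged sketch: count FLOPs for one forward/backward step with FlopCounterMode.
# Reuses SimpleCNN and the input shape from the script above.
import torch
from torch.utils.flop_counter import FlopCounterMode

model = SimpleCNN().cuda()
inputs = torch.randn(512, 3, 224, 224).cuda()

with FlopCounterMode(display=True) as flop_counter:
    model(inputs).sum().backward()

print(f"Total FLOPs: {flop_counter.get_total_flops():,}")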