import torch

# Single GPU: pick a device and move the model onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyModel().to(device)
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Initialize the distributed environment
def setup(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
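# Because rank and world_size are passed explicitly and no init_method is given,
# init_process_group falls back to the default "env://" rendezvous, which also
# needs a master address and port. A minimal sketch for a single machine; the
# values below are placeholders, not taken from the original.
import os
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "12355")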
# Call the initialization function (each worker process passes its own rank)
setup(rank, world_size)
from torch.utils.data import DataLoader

# Single GPU: a plain DataLoader shuffles the whole dataset itself
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
from torch.utils.data.distributed import DistributedSampler

# DDP: DistributedSampler gives each process its own, non-overlapping shard of the dataset
train_sampler = DistributedSampler(dataset)
dataloader = DataLoader(
    dataset,
    batch_size=64,          # batch size per GPU (the global batch is 64 * world_size)
    sampler=train_sampler   # the sampler handles shuffling, so shuffle=True is dropped
)
# Single GPU: moving the model to the device is enough
model = MyModel().to(device)
# DDP: move the model to this process's GPU first, then wrap it
model = MyModel().to(local_rank)
model = DDP(model, device_ids=[local_rank])
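# The DDP snippets use local_rank without showing where it comes from. A minimal
# sketch, assuming the script is launched with torchrun, which sets the LOCAL_RANK
# environment variable for each worker; binding the process to its GPU up front is
# a common companion step.
import os
local_rank = int(os.environ["LOCAL_RANK"])  # this node's GPU index, set by torchrun
torch.cuda.set_device(local_rank)           # make this the process's default CUDA device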
for epoch in range(num_epochs):
    for data, target in dataloader:
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
for epoch in range(num_epochs):
    train_sampler.set_epoch(epoch)  # important for DDP: ensures a different shuffle each epoch
    for data, target in dataloader:
        data = data.to(local_rank)
        target = target.to(local_rank)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
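# A minimal sketch of checkpointing after training, assuming a hypothetical file
# name "checkpoint.pt" (not from the original): only rank 0 writes to disk, and
# model.module accesses the underlying model inside the DDP wrapper.
if dist.get_rank() == 0:
    torch.save(model.module.state_dict(), "checkpoint.pt")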
# Tear down the process group once training is finished
def cleanup():
    dist.destroy_process_group()
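# A minimal sketch of how the pieces above are typically driven together: one worker
# process per GPU, launched with torch.multiprocessing.spawn. The name train_worker
# and the spawn-based launch are assumptions for illustration, not part of the original.
import torch.multiprocessing as mp

def train_worker(rank, world_size):
    setup(rank, world_size)                 # join the process group (defined above)
    # ... build the DistributedSampler, DataLoader, DDP-wrapped model and optimizer,
    # then run the DDP training loop shown above ...
    cleanup()                               # leave the process group (defined above)

if __name__ == "__main__":
    world_size = torch.cuda.device_count()  # one worker per visible GPU
    mp.spawn(train_worker, args=(world_size,), nprocs=world_size, join=True)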