Definition: NCCL
NCCL 1.0, focused on communication between GPUs inside a single node, was released in 2015. NCCL 2.0, which added multi-node support and InfiniBand support, followed in 2016. Improvements have continued since then.
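In PyTorch, NCCL is used through the "nccl" backend of torch.distributed, and what it provides at its core are collective operations such as all-reduce. The following is a minimal stand-alone sketch, assuming a single process and one CUDA GPU (with world_size=1 the reduction is a no-op); it is illustrative only and not part of the training script below.

import os
import torch
import torch.distributed as dist

# Minimal single-process example of an NCCL collective operation
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "12355")
dist.init_process_group(backend="nccl", rank=0, world_size=1)

t = torch.ones(4, device="cuda:0")
dist.all_reduce(t, op=dist.ReduceOp.SUM)  # sum the tensor across all ranks
print(t)  # with world_size=1 the values are unchanged

dist.destroy_process_group()

The full training script below drives the same backend through DistributedDataParallel instead of calling the collectives directly.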
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import torchvision
import torchvision.transforms as transforms

# Initialize the process group for distributed training
def setup():
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Single GPU: rank=0, world_size=1
    dist.init_process_group(
        backend="nccl",
        rank=0,
        world_size=1
    )

# A simple CNN model for CIFAR-10 (10 classes)
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train():
    # The NCCL backend requires a GPU; exit if none is available
    if not torch.cuda.is_available():
        print("GPU is not available. The NCCL backend requires a GPU; exiting.")
        return
    device = torch.device("cuda:0")
    setup()  # initialize distributed training

    # Data preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Prepare the dataset
    trainset = torchvision.datasets.CIFAR10(
        root='./data',
        train=True,
        download=True,
        transform=transform
    )

    # Distributed sampler and data loader
    train_sampler = DistributedSampler(trainset, num_replicas=1, rank=0)
    trainloader = DataLoader(
        trainset,
        batch_size=64,
        sampler=train_sampler,
        num_workers=2
    )

    # Wrap the model in DistributedDataParallel
    model = SimpleModel().to(device)
    model = DDP(model, device_ids=[0])

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # Training loop
    print("Starting training...")
    num_epochs = 2
    for epoch in range(num_epochs):
        train_sampler.set_epoch(epoch)  # reshuffle the sampler每 epoch
        running_loss = 0.0
        for i, data in enumerate(trainloader):
            inputs, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 100 == 99:
                print(f'[Epoch {epoch + 1}, Batch {i + 1}] loss: {running_loss / 100:.3f}')
                running_loss = 0.0

    print("Finished Training")
    dist.destroy_process_group()


if __name__ == "__main__":
    train()
MASTER_ADDR and MASTER_PORT specify the address and port used for communication between the distributed processes. Setting backend="nccl" selects NCCL, NVIDIA's communication library for GPUs [NCCL].
rank=0 is the ID of this process; since there is only a single GPU, it is 0. world_size=1 is the total number of GPUs in use; here it is 1 because only one GPU is used.
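On multiple GPUs, rank and world_size are usually not hard-coded; a launcher such as torchrun starts one process per GPU and sets RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT for each of them. Below is a sketch of how setup() could be generalized; the name setup_multi_gpu and this exact handling of the environment variables are illustrative assumptions, not part of the script above.

import os
import torch
import torch.distributed as dist

def setup_multi_gpu():
    # torchrun exports these variables for every process it launches
    rank = int(os.environ["RANK"])              # global process ID
    world_size = int(os.environ["WORLD_SIZE"])  # total number of processes
    local_rank = int(os.environ["LOCAL_RANK"])  # GPU index on this node
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    return local_rank

In that setting, the model would be wrapped as DDP(model, device_ids=[local_rank]) and the DistributedSampler would receive num_replicas=world_size and rank=rank instead of the hard-coded values.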
Next, the model is defined in SimpleModel.
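For reference, the 16 * 5 * 5 input size of fc1 comes from tracing a CIFAR-10 image (3 x 32 x 32) through the convolution and pooling layers. A small stand-alone check, separate from the training script:

import torch
import torch.nn as nn

x = torch.zeros(1, 3, 32, 32)      # dummy CIFAR-10-sized input
conv1 = nn.Conv2d(3, 6, 5)         # 32 -> 28 (5x5 kernel, no padding)
pool = nn.MaxPool2d(2, 2)          # halves the spatial size
conv2 = nn.Conv2d(6, 16, 5)        # 14 -> 10

x = pool(conv1(x))
print(x.shape)                     # torch.Size([1, 6, 14, 14])
x = pool(conv2(x))
print(x.shape)                     # torch.Size([1, 16, 5, 5]) -> 16 * 5 * 5 = 400 features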
The output is:
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
100%|██████████| 170M/170M [00:05<00:00, 30.8MB/s]
Extracting ./data/cifar-10-python.tar.gz to ./data
Starting training...
[Epoch 1, Batch 100] loss: 2.304
[Epoch 1, Batch 200] loss: 2.304
[Epoch 1, Batch 300] loss: 2.303
[Epoch 1, Batch 400] loss: 2.303
[Epoch 1, Batch 500] loss: 2.302
[Epoch 1, Batch 600] loss: 2.301
[Epoch 1, Batch 700] loss: 2.300
[Epoch 2, Batch 100] loss: 2.298
[Epoch 2, Batch 200] loss: 2.296
[Epoch 2, Batch 300] loss: 2.294
[Epoch 2, Batch 400] loss: 2.289
[Epoch 2, Batch 500] loss: 2.283
[Epoch 2, Batch 600] loss: 2.272
[Epoch 2, Batch 700] loss: 2.251
Finished Training