This repository was archived by the owner on Jun 6, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 550
Add distributed training examples of PyTorch #4821
Merged
Merged
Changes from 23 commits
Commits
Show all changes
56 commits
Select commit
Hold shift + click to select a range
c612608
imagenet-nccl for test
vvfreesoul 9b8ce66
imagenet-nccl for test
vvfreesoul e6772d3
imagenet-nccl for test
vvfreesoul b1f5b8c
imagenet-nccl for test
vvfreesoul 31a46c8
imagenet-nccl for test
vvfreesoul 9057564
imagenet-nccl for test
vvfreesoul 610a420
imagenet-nccl for test
vvfreesoul da4b007
imagenet-nccl for test
vvfreesoul 6a5fc8c
imagenet-nccl for test
vvfreesoul f51c5aa
imagenet-nccl for test
vvfreesoul b4f03fe
imagenet-nccl for test
vvfreesoul e18c9f8
imagenet-nccl for test
vvfreesoul cf7c284
imagenet-nccl for test
vvfreesoul 3a84055
Add distributed training examples of PyTorch
vvfreesoul 4ad2f85
Add distributed training examples of PyTorch
vvfreesoul 43a11d2
Add distributed training examples of PyTorch
vvfreesoul 2e59d33
Add distributed training examples of PyTorch
vvfreesoul ed0a7c6
Add distributed training examples of PyTorch
vvfreesoul 6ac0633
Add distributed training examples of PyTorch
vvfreesoul 562c448
Add distributed training examples of PyTorch
vvfreesoul e4b5dd1
Add distributed training examples of PyTorch
vvfreesoul ce8b3ce
Add distributed training examples of PyTorch
vvfreesoul 0fd1f19
Add distributed training examples of PyTorch
vvfreesoul 7db6cbd
Add distributed training examples of PyTorch
vvfreesoul f46a663
Add distributed training examples of PyTorch
vvfreesoul 4519685
Add distributed training examples of PyTorch
vvfreesoul 326b051
Add distributed training examples of PyTorch
vvfreesoul d9f2d8d
Add distributed training examples of PyTorch
vvfreesoul 4bdb7c5
Add distributed training examples of PyTorch
vvfreesoul 4cbb352
Add distributed training examples of PyTorch
vvfreesoul 2c488f5
Add distributed training examples of PyTorch
vvfreesoul 9a93e9f
Add distributed training examples of PyTorch
vvfreesoul 1bb98ac
Add distributed training examples of PyTorch
vvfreesoul 353bfdf
Add distributed training examples of PyTorch
vvfreesoul 078d645
Add distributed training examples of PyTorch
vvfreesoul 4efc9ac
Add distributed training examples of PyTorch
vvfreesoul 6373f3a
Add distributed training examples of PyTorch
vvfreesoul 429a6e9
Add distributed training examples of PyTorch
vvfreesoul f8fa108
Add distributed training examples of PyTorch
vvfreesoul 659c48b
Add distributed training examples of PyTorch
vvfreesoul 0037ab4
Merge remote-tracking branch 'origin/master'
vvfreesoul f957c60
Add distributed training examples of PyTorch
vvfreesoul 863eda6
Add distributed training examples of PyTorch
vvfreesoul 640c193
Add distributed training examples of PyTorch
vvfreesoul 42cda8e
Add distributed training examples of PyTorch
vvfreesoul 8c2c599
Add distributed training examples of PyTorch
vvfreesoul eed7c7f
Add distributed training examples of PyTorch
vvfreesoul adeb4c6
Add distributed training examples of PyTorch
vvfreesoul 1f675a1
Add distributed training examples of PyTorch
vvfreesoul f0242c7
Add distributed training examples of PyTorch
vvfreesoul a54c606
Add distributed training examples of PyTorch
vvfreesoul f494dcf
Add distributed training examples of PyTorch
vvfreesoul c46c462
Add distributed training examples of PyTorch
vvfreesoul b18d0df
Add distributed training examples of PyTorch
vvfreesoul f585648
Add distributed training examples of PyTorch
vvfreesoul 853d112
Add distributed training examples of PyTorch
vvfreesoul File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Empty file.
117 changes: 117 additions & 0 deletions
117
examples/Distributed-example/Lite-imagenet-singal-mul-DDP-nccl-gloo.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| import os | ||
| from datetime import datetime | ||
| import argparse | ||
| import torch.multiprocessing as mp | ||
| import torch.backends.cudnn as cudnn | ||
| import torchvision | ||
| import torchvision.transforms as transforms | ||
| import torch | ||
| import torch.nn as nn | ||
| import torch.nn.functional as F | ||
| import torch.distributed as dist | ||
| from apex.parallel import DistributedDataParallel as DDP | ||
| from apex import amp | ||
|
|
||
| import torchvision.datasets as datasets | ||
| import torchvision.models as models | ||
# All lowercase, non-dunder, callable attributes of torchvision.models —
# i.e. the architecture constructors selectable via --arch.
model_names = sorted(
    name
    for name, obj in models.__dict__.items()
    if name.islower() and not name.startswith("__") and callable(obj)
)
def main():
    """Parse CLI arguments, derive the distributed topology from OpenPAI
    environment variables, and spawn one training process per local GPU.

    Expects PAI_HOST_IP_worker_0 and PAI_worker_0_SynPort_PORT in the
    environment (injected by OpenPAI) for the torch.distributed rendezvous.
    """
    print('run main')
    parser = argparse.ArgumentParser()
    parser.add_argument('data', metavar='DIR',
                        help='path to dataset')
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' +
                             ' | '.join(model_names) +
                             ' (default: resnet18)')
    # Fixed help text: this flag is the node count, not data-loading workers.
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N',
                        help='mini-batch size (default: 256), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--dist-backend', default='nccl', type=str,
                        help='distributed backend')
    args = parser.parse_args()
    # One process per GPU across every node.
    args.world_size = args.gpus * args.nodes
    print('world_size:', args.world_size)
    # Worker 0 acts as the rendezvous master for init_method='env://'.
    os.environ['MASTER_ADDR'] = os.environ['PAI_HOST_IP_worker_0']
    os.environ['MASTER_PORT'] = os.environ['PAI_worker_0_SynPort_PORT']
    print('master:', os.environ['MASTER_ADDR'], 'port:', os.environ['MASTER_PORT'])
    mp.spawn(train, nprocs=args.gpus, args=(args,))
|
|
||
def train(gpu, args):
    """Per-process training entry point (spawned once per local GPU).

    gpu: local GPU index on this node (process index from mp.spawn).
    args: parsed namespace from main(); must carry world_size, gpus,
          batch_size, epochs, arch, data, nodes and dist_backend.
    """
    print("start train")
    # Global rank = node index (from PAI) * GPUs-per-node + local GPU index.
    rank = int(os.environ['PAI_TASK_INDEX']) * args.gpus + gpu
    dist.init_process_group(backend=args.dist_backend, init_method='env://',
                            world_size=args.world_size, rank=rank)
    # Same seed on every rank so initial weights match before the first sync.
    torch.manual_seed(0)
    # Fixed: original had a redundant double assignment (model=model = ...).
    model = models.__dict__[args.arch]()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    # Define loss function (criterion) and optimizer.
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model so gradients are all-reduced across ranks each step.
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code.
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    # DistributedSampler shards the dataset across ranks without overlap.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

    # NOTE(review): num_workers=args.nodes reuses the *node count* as the
    # DataLoader worker count — presumably unintentional; confirm intent.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.nodes, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.nodes, pin_memory=True)

    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass.
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize; DDP all-reduces gradients in backward().
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                epoch + 1, args.epochs, i + 1, total_step, loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
|
|
||
|
|
||
# Script entry point: parse args and launch one process per local GPU.
if __name__ == '__main__':
    main()
122 changes: 122 additions & 0 deletions
122
examples/Distributed-example/LiteApex-imagenet-singal-mul-DDP-nccl-gloo.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| import os | ||
| from datetime import datetime | ||
| import argparse | ||
| import torch.multiprocessing as mp | ||
| import torch.backends.cudnn as cudnn | ||
| import torchvision | ||
| import torchvision.transforms as transforms | ||
| import torch | ||
| import torch.nn as nn | ||
| import torch.nn.functional as F | ||
| import torch.distributed as dist | ||
| from apex.parallel import DistributedDataParallel as DDP | ||
| from apex import amp | ||
|
|
||
| import torchvision.datasets as datasets | ||
| import torchvision.models as models | ||
# All lowercase, non-dunder, callable attributes of torchvision.models —
# i.e. the architecture constructors selectable via --arch.
model_names = sorted(
    name
    for name, obj in models.__dict__.items()
    if name.islower() and not name.startswith("__") and callable(obj)
)
def main():
    """Parse CLI arguments, derive the distributed topology from OpenPAI
    environment variables, and spawn one training process per local GPU.

    Expects PAI_HOST_IP_worker_0 and PAI_worker_0_SynPort_PORT in the
    environment (injected by OpenPAI) for the torch.distributed rendezvous.
    """
    print('run main')
    parser = argparse.ArgumentParser()
    parser.add_argument('data', metavar='DIR',
                        help='path to dataset')
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' +
                             ' | '.join(model_names) +
                             ' (default: resnet18)')
    # Fixed help text: this flag is the node count, not data-loading workers.
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N',
                        help='mini-batch size (default: 256), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--dist-backend', default='nccl', type=str,
                        help='distributed backend')
    args = parser.parse_args()
    # One process per GPU across every node.
    args.world_size = args.gpus * args.nodes
    print('world_size:', args.world_size)
    # Worker 0 acts as the rendezvous master for init_method='env://'.
    os.environ['MASTER_ADDR'] = os.environ['PAI_HOST_IP_worker_0']
    os.environ['MASTER_PORT'] = os.environ['PAI_worker_0_SynPort_PORT']
    print('master:', os.environ['MASTER_ADDR'], 'port:', os.environ['MASTER_PORT'])
    mp.spawn(train, nprocs=args.gpus, args=(args,))
|
|
||
def train(gpu, args):
    """Per-process mixed-precision (apex amp) training entry point.

    gpu: local GPU index on this node (process index from mp.spawn).
    args: parsed namespace from main(); must carry world_size, gpus,
          batch_size, epochs, arch, data, nodes and dist_backend.

    Fixes vs. the original:
      * The model was wrapped in DDP twice (torch's native DDP and then
        apex's DDP), and amp.initialize was called on the already-wrapped
        model. Per apex docs, amp.initialize must run on the bare model
        BEFORE any DDP wrapping; only apex DDP is applied here.
      * Each step ran scaled_loss.backward() AND loss.backward(),
        accumulating gradients twice per batch. The duplicate plain
        backward is removed.
    """
    print("start train")
    # Global rank = node index (from PAI) * GPUs-per-node + local GPU index.
    rank = int(os.environ['PAI_TASK_INDEX']) * args.gpus + gpu
    dist.init_process_group(backend=args.dist_backend, init_method='env://',
                            world_size=args.world_size, rank=rank)
    # Same seed on every rank so initial weights match before the first sync.
    torch.manual_seed(0)
    # Fixed: original had a redundant double assignment (model=model = ...).
    model = models.__dict__[args.arch]()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    # Define loss function (criterion) and optimizer.
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Enable mixed precision first, then wrap with apex DDP (required order).
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    model = DDP(model)
    # Data loading code.
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    # DistributedSampler shards the dataset across ranks without overlap.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

    # NOTE(review): num_workers=args.nodes reuses the *node count* as the
    # DataLoader worker count — presumably unintentional; confirm intent.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.nodes, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.nodes, pin_memory=True)

    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass.
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize. amp scales the loss to avoid fp16
            # underflow; backward runs on the scaled loss ONLY (the original
            # duplicate loss.backward() doubled every gradient).
            optimizer.zero_grad()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                epoch + 1, args.epochs, i + 1, total_step, loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
|
|
||
# Script entry point: parse args and launch one process per local GPU.
if __name__ == '__main__':
    main()
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should the `singal` in the file name be spelled `single`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done