diff --git a/distributed/tensor_parallelism/README.md b/distributed/tensor_parallelism/README.md
index ec61071e65..a3c36e3864 100644
--- a/distributed/tensor_parallelism/README.md
+++ b/distributed/tensor_parallelism/README.md
@@ -10,7 +10,25 @@ PyTorch native Tensor Parallel APIs, which include:
 More details about the PyTorch native Tensor Parallel APIs, please see PyTorch docs:
 https://pytorch.org/docs/stable/distributed.tensor.parallel.html

-```
+## Installation
+
+```bash
 pip install -r requirements.txt
-torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py
 ```
+
+## Running Examples
+
+You can run the examples using `torchrun` to launch distributed training:
+
+```bash
+# Simple Tensor Parallel example
+torchrun --nnodes=1 --nproc_per_node=4 tensor_parallel_example.py
+
+# Tensor Parallel with Sequence Parallel
+torchrun --nnodes=1 --nproc_per_node=4 sequence_parallel_example.py
+
+# FSDP + Tensor Parallel with Llama2 model
+torchrun --nnodes=1 --nproc_per_node=4 fsdp_tp_example.py
+```
+
+For more details, check the `run_examples.sh` script.
diff --git a/distributed/tensor_parallelism/fsdp_tp_example.py b/distributed/tensor_parallelism/fsdp_tp_example.py
index 87935f10f0..4ae6cb1aa2 100644
--- a/distributed/tensor_parallelism/fsdp_tp_example.py
+++ b/distributed/tensor_parallelism/fsdp_tp_example.py
@@ -1,34 +1,3 @@
-import sys
-import os
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import torch.nn.functional as F
-
-from log_utils import rank_log, get_logger, verify_min_gpu_count
-
-# ---- GPU check ------------
-_min_gpu_count = 4
-
-if not verify_min_gpu_count(min_gpus=_min_gpu_count):
-    print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.")
-    sys.exit()
-# ---------------------------
-
-from llama2_model import Transformer, ModelArgs
-
-from torch.distributed.device_mesh import init_device_mesh
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed._tensor import Shard, Replicate
-from torch.distributed.tensor.parallel import (
-    parallelize_module,
-    ColwiseParallel,
-    RowwiseParallel,
-    PrepareModuleInput,
-    SequenceParallel
-)
-
-
 """
 This is the script to test 2D Parallel which combines Tensor/Sequence
 parallel with Fully Sharded Data Parallel (TP/SP + FSDP) on a example
@@ -60,6 +29,36 @@
 https://pytorch.org/tutorials/intermediate/TP_tutorial.html
 """
+import sys
+import os
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+
+from log_utils import rank_log, get_logger, verify_min_gpu_count
+
+# ---- GPU check ------------
+_min_gpu_count = 4
+
+if not verify_min_gpu_count(min_gpus=_min_gpu_count):
+    print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.")
Exiting.") + sys.exit() +# --------------------------- + +from llama2_model import Transformer, ModelArgs + +from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed._tensor import Shard, Replicate +from torch.distributed.tensor.parallel import ( + parallelize_module, + ColwiseParallel, + RowwiseParallel, + PrepareModuleInput, + SequenceParallel +) + tp_size = 2 logger = get_logger() diff --git a/distributed/tensor_parallelism/sequence_parallel_example.py b/distributed/tensor_parallelism/sequence_parallel_example.py index b145fbc95e..988973af4b 100644 --- a/distributed/tensor_parallelism/sequence_parallel_example.py +++ b/distributed/tensor_parallelism/sequence_parallel_example.py @@ -1,5 +1,22 @@ -# The following is an example command to run this code -# torchrun --nnodes 1 --nproc-per-node 4 sequence_parallel_example.py +""" +This is the script to test Sequence Parallel(SP) on a toy model in a +Megetron-LM SPMD style. We show an E2E working flow from forward, +backward and optimization. + +We use the example of two `nn.Linear` layers with an element-wise `nn.RELU` +in between to show an example of sequence parallel, which was proposed in paper: + +https://arxiv.org/pdf/2205.05198.pdf. + +Like tensor parallel, we parallelize the first linear layer by column +and also parallelize the second linear layer by row. But the input in each rank +now is different so that we need one all-gather for input and one reduce-scatter +in the end of the second linear layer. + +The following is an example command to run this code + torchrun --nnodes 1 --nproc-per-node 4 sequence_parallel_example.py +""" + import os import sys import torch @@ -24,28 +41,8 @@ sys.exit() # --------------------------- - from torch.distributed._tensor.device_mesh import init_device_mesh - - -""" -This is the script to test Sequence Parallel(SP) on a toy model in a -Megetron-LM SPMD style. We show an E2E working flow from forward, -backward and optimization. - -We use the example of two `nn.Linear` layers with an element-wise `nn.RELU` -in between to show an example of sequence parallel, which was proposed in paper: - -https://arxiv.org/pdf/2205.05198.pdf. - -Like tensor parallel, we parallelize the first linear layer by column -and also parallelize the second linear layer by row. But the input in each rank -now is different so that we need one all-gather for input and one reduce-scatter -in the end of the second linear layer. -""" - - class ToyModel(nn.Module): """MLP based model""" diff --git a/distributed/tensor_parallelism/tensor_parallel_example.py b/distributed/tensor_parallelism/tensor_parallel_example.py index b96f982f0c..c42a952ea8 100755 --- a/distributed/tensor_parallelism/tensor_parallel_example.py +++ b/distributed/tensor_parallelism/tensor_parallel_example.py @@ -1,31 +1,3 @@ -# The following is an example command to run this code -# torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py -import os -import sys -import torch -import torch.nn as nn - -from torch.distributed.tensor.parallel import ( - parallelize_module, - ColwiseParallel, - RowwiseParallel, -) - -from log_utils import rank_log, get_logger, verify_min_gpu_count - -# ---- GPU check ------------ -_min_gpu_count = 2 - -if not verify_min_gpu_count(min_gpus=_min_gpu_count): - print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. 
Exiting.") - sys.exit() -# --------------------------- - -from torch.distributed._tensor.device_mesh import init_device_mesh - - - - """ This is the script to test Tensor Parallel(TP) on a toy model in a Megetron-LM SPMD style. We show an E2E working flow from forward, @@ -55,8 +27,33 @@ to use and our `parallelize_module` API will parse and parallelize the modules based on the given `ParallelStyle`. We are using this PyTorch native Tensor Parallelism APIs in this example to show users how to use them. + +The following is an example command to run this code + torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py """ +import os +import sys +import torch +import torch.nn as nn +import torch.distributed as dist +from torch.distributed.tensor.parallel import ( + parallelize_module, + ColwiseParallel, + RowwiseParallel, +) +from log_utils import rank_log, get_logger, verify_min_gpu_count + +# ---- GPU check ------------ +_min_gpu_count = 2 + +if not verify_min_gpu_count(min_gpus=_min_gpu_count): + print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.") + sys.exit() +# --------------------------- + +from torch.distributed._tensor.device_mesh import init_device_mesh + class ToyModel(nn.Module): """MLP based model"""