From 998e5ef17b4ea304daba2d3e905a49b61e1c33ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Melissa=20Weber=20Mendon=C3=A7a?= Date: Fri, 4 Jul 2025 16:33:20 -0300 Subject: [PATCH 1/2] Fix duplicate labels and other docs build warnings --- docs/source/conf.py | 3 ++- docs/source/contribute/cpp_debugger.md | 2 +- docs/source/contribute/plugins.md | 5 +---- docs/source/features/pallas.md | 1 + docs/source/learn/_pjrt.md | 19 ++++++++++++------- docs/source/learn/pytorch-on-xla-devices.md | 2 +- docs/source/learn/troubleshoot.md | 4 ++-- docs/source/perf/spmd_advanced.md | 4 ++-- 8 files changed, 22 insertions(+), 18 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 86307073826a..e1694ac9d88c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -27,7 +27,6 @@ "sphinx.ext.napoleon", "sphinx.ext.viewcode", "sphinxcontrib.katex", - "sphinx.ext.autosectionlabel", "sphinx_copybutton", # "sphinx_panels", # "myst_parser", # Will be activated by myst_nb @@ -38,6 +37,8 @@ extensions = pytorch_extensions + [ "myst_nb" ] +# Automatically generate section anchors for selected heading level +myst_heading_anchors = 3 # Users must manually execute their notebook cells # with the correct hardware accelerator. diff --git a/docs/source/contribute/cpp_debugger.md b/docs/source/contribute/cpp_debugger.md index 514332191298..855aeecda277 100644 --- a/docs/source/contribute/cpp_debugger.md +++ b/docs/source/contribute/cpp_debugger.md @@ -54,7 +54,7 @@ We suggest the following steps: At this point, your PyTorch is built with debugging symbols and ready to debug with GDB. However, we recommend debugging with VSCode. For more information, see -{ref}`Debug with VSCode`. +[](#debug-with-vscode) ### Verify your file is built diff --git a/docs/source/contribute/plugins.md b/docs/source/contribute/plugins.md index 42643700fd2e..f7ac5bf68641 100644 --- a/docs/source/contribute/plugins.md +++ b/docs/source/contribute/plugins.md @@ -45,7 +45,7 @@ you can test with the placeholder `LIBRARY` device type. For example: [device(type='xla', index=0), device(type='xla', index=1), device(type='xla', index=2), device(type='xla', index=3)] To register your device type automatically for users as well as to -handle extra setup for e.g. multiprocessing, you may implement the +handle extra setup for e.g. multiprocessing, you may implement the `DevicePlugin` Python API. PyTorch/XLA plugin packages contain two key components: @@ -65,9 +65,6 @@ class CpuPlugin(plugins.DevicePlugin): that identifies your `DevicePlugin`. For exmaple, to register the `EXAMPLE` device type in a `pyproject.toml`: -```{=html} - -``` [project.entry-points."torch_xla.plugins"] example = "torch_xla_cpu_plugin:CpuPlugin" diff --git a/docs/source/features/pallas.md b/docs/source/features/pallas.md index 89714ab9623a..8994fdebc28a 100644 --- a/docs/source/features/pallas.md +++ b/docs/source/features/pallas.md @@ -95,6 +95,7 @@ output = torch.ops.xla.paged_attention( ) ``` +(pallas-integration-example)= #### Integration Example The vLLM TPU integration utilizes [PagedAttention diff --git a/docs/source/learn/_pjrt.md b/docs/source/learn/_pjrt.md index edaa56ecee72..3b0c0eeb9dff 100644 --- a/docs/source/learn/_pjrt.md +++ b/docs/source/learn/_pjrt.md @@ -1,3 +1,7 @@ +--- +orphan: true +--- + # PJRT Runtime PyTorch/XLA has migrated from the TensorFlow-based XRT runtime to the @@ -39,7 +43,7 @@ the `runtime` tag. per device. 
On TPU v2 and v3 in PJRT, workloads are multiprocess and multithreaded (4 processes with 2 threads each), so your workload should be thread-safe. See [Multithreading on TPU - v2/v3](#multithreading-on-tpu-v2v3) and the [Multiprocessing section + v2/v3](multithreading-on-tpu-v2v3) and the [Multiprocessing section of the API guide](https://github.com/pytorch/xla/blob/master/API_GUIDE.md#running-on-multiple-xla-devices-with-multi-processing) for more information. Key differences to keep in mind: @@ -267,7 +271,7 @@ for more information about TPU architecture. from . - Under XRT, the server process is the only process that interacts with the TPU devices, and client processes don't have direct access - to the TPU devices. When profiling a single-host TPU (e.g. v3-8 or + to the TPU devices. When profiling a single-host TPU (e.g. v3-8 or v4-8), you would normally see 8 device traces (one for each TPU core). With PJRT, each process has one chip, and a profile from that process will show only 2 TPU cores. @@ -282,11 +286,12 @@ for more information about TPU architecture. each TPU host (`[gcloud compute tpus tpu-vm scp](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/tpus/tpu-vm/scp)`) and run the code on each host in parallel - (e.g. `[gcloud compute tpus tpu-vm ssh --workers=all --command="PJRT_DEVICE=TPU python run.py"](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/tpus/tpu-vm/ssh)`) + (e.g. `[gcloud compute tpus tpu-vm ssh --workers=all --command="PJRT_DEVICE=TPU python run.py"](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/tpus/tpu-vm/ssh)`) - `xm.rendezvous` has been reimplemented using XLA-native collective communication to enhance stability on large TPU pods. See below for more details. +(multithreading-on-tpu-v2v3)= ### Multithreading on TPU v2/v3 On TPU v2 and v3, **distributed workloads always run multithreaded**, @@ -332,7 +337,7 @@ implementation: - Because XLA does not permit collective operations to run on a subset of workers, all workers must participate in the `rendezvous`. -If you require the old behavior of `xm.rendezvous` (i.e. communicating +If you require the old behavior of `xm.rendezvous` (i.e. communicating data without altering the XLA graph and/or synchronizing a subset of workers), consider using `` `torch.distributed.barrier `` \<\>[\_\_ @@ -358,7 +363,7 @@ from the PyTorch documentation. Keep in mind these constraints: *New in PyTorch/XLA r2.0* When using PJRT with `torch.distributed` and -`[torch.nn.parallel.DistributedDataParallel](https://github.com/pytorch/xla/blob/master/docs/ddp.md)` +`[torch.nn.parallel.DistributedDataParallel](https://github.com/pytorch/xla/blob/master/docs/source/perf/ddp.md)` we strongly recommend using the new `xla://` `init_method`, which automatically finds the replica IDs, world size, and master IP by querying the runtime. For example: @@ -398,9 +403,9 @@ Note: For TPU v2/v3, you still need to import `torch.distributed` is still experimental. For more information about using `DistributedDataParallel` on -PyTorch/XLA, see [ddp.md](./ddp.md) on TPU V4. For an example that uses +PyTorch/XLA, see [ddp.md](../perf/ddp.md) on TPU V4. 
For an example that uses DDP and PJRT together, run the following [example -script](../test/test_train_mp_imagenet.py) on a TPU: +script](../../../test/test_train_mp_imagenet.py) on a TPU: ``` bash PJRT_DEVICE=TPU python xla/test/test_train_mp_mnist.py --ddp --pjrt_distributed --fake_data --num_epochs 1 diff --git a/docs/source/learn/pytorch-on-xla-devices.md b/docs/source/learn/pytorch-on-xla-devices.md index 4613ae035f2a..809bda91c2c2 100644 --- a/docs/source/learn/pytorch-on-xla-devices.md +++ b/docs/source/learn/pytorch-on-xla-devices.md @@ -103,7 +103,7 @@ XLA. The model definition, dataloader, optimizer and training loop can work on any device. The only XLA-specific code is a couple lines that acquire the XLA device and materializing the tensors. Calling `torch_xla.sync()` at the end of each training iteration causes XLA to execute its current -graph and update the model's parameters. See {ref}`XLA Tensor Deep Dive` +graph and update the model's parameters. See [](#xla-tensor-deep-dive) for more on how XLA creates graphs and runs operations. diff --git a/docs/source/learn/troubleshoot.md b/docs/source/learn/troubleshoot.md index 6a0a720498cd..b3f346af720d 100644 --- a/docs/source/learn/troubleshoot.md +++ b/docs/source/learn/troubleshoot.md @@ -164,7 +164,7 @@ disable execution analysis by `PT_XLA_DEBUG_LEVEL=1`). To use PyTorch/XLA efficiently, we expect the same models code to be run for every step and compilation only happen once for every graph. If you keep seeing `Compilation Cause`, you should try to dump the IR/HLO following -{ref}`Common Debugging Environment Variables Combinations` and +[](#common-debugging-environment-variables-combinations) and compare the graphs for each step and understand the source of the differences. @@ -313,7 +313,7 @@ If your model shows bad performance, keep in mind the following caveats: *Solution*: - For most ops we can lower them to XLA to fix it. Checkout - {ref}`Get A Metrics Report` to find out the + [](#get-a-metrics-report) to find out the missing ops and open a feature request on [GitHub](https://github.com/pytorch/xla/issues). diff --git a/docs/source/perf/spmd_advanced.md b/docs/source/perf/spmd_advanced.md index 95fcb2e4335b..7005ee5dd4c0 100644 --- a/docs/source/perf/spmd_advanced.md +++ b/docs/source/perf/spmd_advanced.md @@ -3,7 +3,7 @@ This guide covers advanced topics with SPMD. Please read the [SPMD user guide](https://github.com/pytorch/xla/blob/master/docs/spmd_basic.md) as a prerequisite. -### Sharding-Aware Host-to-Device Data Loading +## Sharding-Aware Host-to-Device Data Loading SPMD takes a single-device program, shards it, and executes it in parallel. @@ -38,7 +38,7 @@ train_loader = pl.MpDeviceLoader( ) ``` -### Virtual device optimization +## Virtual device optimization PyTorch/XLA normally transfers tensor data asynchronously from host to device once the tensor is defined. This is to overlap the data transfer with the graph tracing time. However, because SPMD allows the user to modify the tensor sharding _after _the tensor has been defined, we need an optimization to prevent unnecessary transfer of tensor data back and forth between host and device. We introduce Virtual Device Optimization, a technique to place the tensor data on a virtual device SPMD:0 first, before uploading to the physical devices when all the sharding decisions are finalized. Every tensor data in SPMD mode is placed on a virtual device, SPMD:0. 
The virtual device is exposed to the user as an XLA device XLA:0 with the actual shards on physical devices, like TPU:0, TPU:1, etc.

From ed466c0e16395c801e9b72d0873e44f01a1bfcef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Melissa=20Weber=20Mendon=C3=A7a?=
Date: Mon, 7 Jul 2025 14:50:05 -0300
Subject: [PATCH 2/2] Improve wording

---
 docs/source/contribute/plugins.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/contribute/plugins.md b/docs/source/contribute/plugins.md
index f7ac5bf68641..40ae841e8d7b 100644
--- a/docs/source/contribute/plugins.md
+++ b/docs/source/contribute/plugins.md
@@ -45,7 +45,7 @@ you can test with the placeholder `LIBRARY` device type. For example:
 [device(type='xla', index=0), device(type='xla', index=1), device(type='xla', index=2), device(type='xla', index=3)]
 
 To register your device type automatically for users as well as to
-handle extra setup for e.g. multiprocessing, you may implement the
+handle extra setup (for example, for multiprocessing), you may implement the
 `DevicePlugin` Python API.
 
 PyTorch/XLA plugin packages contain two key components:
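The label fixes in the first patch all rely on the same MyST cross-referencing pattern that replaces `sphinx.ext.autosectionlabel`: an explicit `(label)=` target placed directly above a heading wherever a stable link name is needed, with `myst_heading_anchors = 3` in `conf.py` keeping automatic anchors for headings down to level 3. Below is a minimal sketch of that pattern; the file name and labels are invented for illustration and appear in neither patch.

``` md
<!-- hypothetical docs/source/example.md, for illustration only -->

(example-stable-label)=
## Example Section

An explicit target can be referenced from anywhere in the docs, as the PJRT
page does for its multithreading section:
see [Example Section](example-stable-label) for details.

With `myst_heading_anchors = 3`, headings down to level 3 also receive
automatically generated anchors, so a link such as [](#example-section)
resolves without an explicit target, matching the style used in the
debugger and troubleshooting pages.
```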