Enabling quantization unit tests on XPU #2424

Draft: wants to merge 12 commits into main

9 changes: 9 additions & 0 deletions .github/scripts/ci_test_xpu.sh
@@ -0,0 +1,9 @@
#!/bin/bash

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
python3 setup.py install

pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

cd test/quantization
pytest -v -s *.py
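
As a quick sanity check before invoking pytest, something along these lines confirms that the nightly XPU wheel actually exposes a device (a local-use sketch, not part of this PR):

# sanity_check_xpu.py (hypothetical helper): verify PyTorch sees the XPU device.
import torch

print("torch:", torch.__version__)
print("xpu available:", torch.xpu.is_available())
if torch.xpu.is_available():
    print("device:", torch.xpu.get_device_name(0))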
156 changes: 156 additions & 0 deletions .github/workflows/pr-test-xpu.yml
@@ -0,0 +1,156 @@
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: ao-pvc
    env:
      DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3
      TEST_COMMAND: .github/scripts/ci_test_xpu.sh
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout Torchao
        uses: actions/checkout@v4

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, this step will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Pull public copy of the Docker image
        id: print-ghcr-mirror
        shell: bash
        run: |
          echo "docker pull ${DOCKER_IMAGE}"
          docker pull ${DOCKER_IMAGE}

      - name: Test
        id: test
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # detached container should get cleaned up by teardown_ec2_linux
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --user $(id -u):$(id -g) \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # jenkins user does not have write permission to the mounted workspace; work around by copying within the container to the jenkins home
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Change permissions
        if: ${{ always() && steps.test.conclusion }}
        run: |
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on the same IDC node
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, this step will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi
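
The test changes below replace hard-coded "cuda" strings with a new torchao.utils.auto_detect_device() helper. Its implementation is not shown in this diff; a minimal sketch of what such a helper could look like, assuming it returns a plain device string (the tests compare it against "xpu"):

import torch

def auto_detect_device() -> str:
    # Hypothetical sketch only; the real torchao.utils.auto_detect_device is not
    # part of this diff. Prefer CUDA, then XPU, then fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"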
25 changes: 13 additions & 12 deletions test/quantization/test_gptq.py
@@ -2,13 +2,16 @@
from pathlib import Path

import torch
from torch.testing._internal.common_utils import TestCase
from torch.testing._internal.common_utils import (
TestCase,
)

from torchao._models.llama.model import (
ModelArgs,
Transformer,
prepare_inputs_for_model,
)
from torchao.utils import auto_detect_device
from torchao._models.llama.tokenizer import get_tokenizer
from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.utils import compute_error
@@ -18,10 +21,10 @@

torch.manual_seed(0)

_DEVICE = auto_detect_device()

class TestGPTQ(TestCase):
@unittest.skip("skipping until we get checkpoints for gpt-fast")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_gptq_quantizer_int4_weight_only(self):
from torchao._models._eval import (
LMEvalInputRecorder,
@@ -30,7 +33,6 @@ def test_gptq_quantizer_int4_weight_only(self):
from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer

precision = torch.bfloat16
device = "cuda"
checkpoint_path = Path(
"../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"
)
@@ -80,7 +82,7 @@ def test_gptq_quantizer_int4_weight_only(self):
model = quantizer.quantize(model, *inputs).cuda()

model.reset_caches()
with torch.device("cuda"):
with torch.device(_DEVICE):
model.setup_caches(max_batch_size=1, max_seq_length=model.config.block_size)

limit = 1
@@ -89,7 +91,7 @@ def test_gptq_quantizer_int4_weight_only(self):
tokenizer,
model.config.block_size,
prepare_inputs_for_model,
device,
_DEVICE,
).run_eval(
["wikitext"],
limit,
@@ -102,7 +104,6 @@ def test_gptq_quantizer_int4_weight_only(self):

class TestMultiTensorFlow(TestCase):
@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_multitensor_add_tensors(self):
from torchao.quantization.GPTQ import MultiTensor

@@ -115,7 +116,6 @@ def test_multitensor_add_tensors(self):
self.assertTrue(torch.equal(mt.values[1], tensor2))

@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_multitensor_pad_unpad(self):
from torchao.quantization.GPTQ import MultiTensor

@@ -127,7 +127,6 @@ def test_multitensor_pad_unpad(self):
self.assertEqual(mt.count, 1)

@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_multitensor_inplace_operation(self):
from torchao.quantization.GPTQ import MultiTensor

@@ -138,7 +137,6 @@ def test_multitensor_inplace_operation(self):


class TestMultiTensorInputRecorder(TestCase):
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_multitensor_input_recorder(self):
from torchao.quantization.GPTQ import MultiTensor, MultiTensorInputRecorder

@@ -159,7 +157,6 @@ def test_multitensor_input_recorder(self):
self.assertTrue(isinstance(MT_input[2][2], MultiTensor))
self.assertEqual(MT_input[3], torch.float)

@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_gptq_with_input_recorder(self):
from torchao.quantization.GPTQ import (
Int4WeightOnlyGPTQQuantizer,
@@ -170,7 +167,7 @@ def test_gptq_with_input_recorder(self):

config = ModelArgs(n_layer=2)

with torch.device("cuda"):
with torch.device(_DEVICE):
model = Transformer(config)
model.setup_caches(max_batch_size=2, max_seq_length=100)
idx = torch.randint(1, 10000, (10, 2, 50)).to(torch.int32)
@@ -191,7 +188,11 @@ def test_gptq_with_input_recorder(self):

args = input_recorder.get_recorded_inputs()

quantizer = Int4WeightOnlyGPTQQuantizer()
if _DEVICE == "xpu":
from torchao.dtypes import Int4XPULayout
quantizer = Int4WeightOnlyGPTQQuantizer(device=torch.device("xpu"), layout=Int4XPULayout())
else:
quantizer = Int4WeightOnlyGPTQQuantizer()

quantizer.quantize(model, *args)

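
Outside of GPTQ, the same XPU layout can be exercised through the eager int4 weight-only path; a hedged usage sketch (the group_size and layout keywords of Int4WeightOnlyConfig are assumed from the imports above and may differ across torchao versions):

import torch
from torchao.dtypes import Int4XPULayout
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Assumed API: quantize_ applies the int4 weight-only config to the model in place.
model = torch.nn.Sequential(torch.nn.Linear(256, 256)).to(device="xpu", dtype=torch.bfloat16)
quantize_(model, Int4WeightOnlyConfig(group_size=32, layout=Int4XPULayout()))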
11 changes: 4 additions & 7 deletions test/quantization/test_moe_quant.py
@@ -31,8 +31,11 @@
TORCH_VERSION_AT_LEAST_2_5,
TORCH_VERSION_AT_LEAST_2_6,
is_sm_at_least_90,
auto_detect_device,
)

_DEVICE = auto_detect_device()

if torch.version.hip is not None:
pytest.skip(
"ROCm support for MoE quantization is under development",
@@ -52,7 +55,7 @@ def _test_impl_moe_quant(
base_class=AffineQuantizedTensor,
tensor_impl_class=None,
dtype=torch.bfloat16,
device="cuda",
device=_DEVICE,
fullgraph=False,
):
"""
@@ -114,8 +117,6 @@ def _test_impl_moe_quant(
]
)
def test_int4wo_fake_dim(self, name, num_tokens, fullgraph):
if not torch.cuda.is_available():
self.skipTest("Need CUDA available")
if not TORCH_VERSION_AT_LEAST_2_5:
self.skipTest("Test only enabled for 2.5+")

@@ -138,10 +139,6 @@ def test_int4wo_fake_dim(self, name, num_tokens, fullgraph):
]
)
def test_int4wo_base(self, name, num_tokens, fullgraph):
if not torch.cuda.is_available():
self.skipTest("Need CUDA available")
if not is_sm_at_least_90():
self.skipTest("Requires CUDA capability >= 9.0")
if not TORCH_VERSION_AT_LEAST_2_5:
self.skipTest("Test only enabled for 2.5+")

18 changes: 10 additions & 8 deletions test/quantization/test_qat.py
@@ -83,11 +83,13 @@
TORCH_VERSION_AT_LEAST_2_3,
TORCH_VERSION_AT_LEAST_2_4,
TORCH_VERSION_AT_LEAST_2_6,
auto_detect_device,
)

# TODO: put this in a common test utils file
_CUDA_IS_AVAILABLE = torch.cuda.is_available()
_GPU_IS_AVAILABLE = torch.cuda.is_available() or torch.xpu.is_available()

_DEVICE = auto_detect_device()

class Sub(torch.nn.Module):
def __init__(self):
@@ -329,7 +331,7 @@ def _set_ptq_weight(
group_size,
)
q_weight = torch.ops.aten._convert_weight_to_int4pack(
q_weight.to("cuda"),
q_weight.to(_DEVICE),
qat_linear.inner_k_tiles,
)
ptq_linear.weight = q_weight
@@ -600,13 +602,13 @@ def _assert_close_4w(self, val, ref):
print(mean_err)
self.assertTrue(mean_err < 0.05)

@unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
@unittest.skipIf(not _GPU_IS_AVAILABLE, "skipping when neither CUDA nor XPU is available")
def test_qat_4w_primitives(self):
n_bit = 4
group_size = 32
inner_k_tiles = 8
scales_precision = torch.bfloat16
device = torch.device("cuda")
device = torch.device(_DEVICE)
dtype = torch.bfloat16
torch.manual_seed(self.SEED)
x = torch.randn(100, 256, dtype=dtype, device=device)
@@ -654,13 +656,13 @@
@unittest.skipIf(
not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
)
@unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
@unittest.skipIf(not _GPU_IS_AVAILABLE, "skipping when neither CUDA nor XPU is available")
def test_qat_4w_linear(self):
from torchao.quantization.GPTQ import WeightOnlyInt4Linear
from torchao.quantization.qat.linear import Int4WeightOnlyQATLinear

group_size = 128
device = torch.device("cuda")
device = torch.device(_DEVICE)
dtype = torch.bfloat16
torch.manual_seed(self.SEED)
qat_linear = Int4WeightOnlyQATLinear(
@@ -701,14 +703,14 @@ def test_qat_4w_quantizer_gradients(self):
@unittest.skipIf(
not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
)
@unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
@unittest.skipIf(not _GPU_IS_AVAILABLE, "skipping when neither CUDA nor XPU is available")
def test_qat_4w_quantizer(self):
from torchao.quantization.GPTQ import Int4WeightOnlyQuantizer
from torchao.quantization.qat import Int4WeightOnlyQATQuantizer

group_size = 32
inner_k_tiles = 8
device = torch.device("cuda")
device = torch.device(_DEVICE)
dtype = torch.bfloat16
torch.manual_seed(self.SEED)
m = M().to(device).to(dtype)