Skip to content

Cluster lifecycle removal #865

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,362 changes: 701 additions & 661 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ cryptography = "43.0.3"
executing = "1.2.0"
pydantic = "< 2"
ipywidgets = "8.1.2"
odh-kuberay-client = {version = "0.0.0.dev40", source = "testpypi"}

[[tool.poetry.source]]
name = "pypi"

[[tool.poetry.source]]
name = "testpypi"
url = "https://test.pypi.org/simple/"

[tool.poetry.group.docs]
optional = true
Expand Down
1 change: 1 addition & 0 deletions src/codeflare_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
AWManager,
AppWrapperStatus,
RayJobClient,
RayJob,
)

from .common.widgets import view_clusters
Expand Down
4 changes: 4 additions & 0 deletions src/codeflare_sdk/ray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
RayJobClient,
)

from .rayjobs import (
RayJob,
)

from .cluster import (
Cluster,
ClusterConfiguration,
Expand Down
2 changes: 2 additions & 0 deletions src/codeflare_sdk/ray/cluster/build_ray_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
"enableIngress": False,
"rayStartParams": {
"dashboard-host": "0.0.0.0",
"dashboard-port": "8265",
"block": "true",
"num-gpus": str(head_gpu_count),
"resources": head_resources,
Expand Down Expand Up @@ -245,6 +246,7 @@ def get_labels(cluster: "codeflare_sdk.ray.cluster.Cluster"):
"""
labels = {
"controller-tools.k8s.io": "1.0",
"ray.io/cluster": cluster.config.name, # Enforced label always present
}
if cluster.config.labels != {}:
labels.update(cluster.config.labels)
Expand Down
8 changes: 6 additions & 2 deletions src/codeflare_sdk/ray/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,12 @@

from time import sleep
from typing import List, Optional, Tuple, Dict
import copy

from ray.job_submission import JobSubmissionClient
from ray.job_submission import JobSubmissionClient, JobStatus
import time
import uuid
import warnings

from ...common.kubernetes_cluster.auth import (
config_check,
Expand Down Expand Up @@ -57,7 +61,6 @@
from kubernetes.client.rest import ApiException

from kubernetes.client.rest import ApiException
import warnings

CF_SDK_FIELD_MANAGER = "codeflare-sdk"

Expand Down Expand Up @@ -760,6 +763,7 @@ def get_cluster(
head_extended_resource_requests=head_extended_resources,
worker_extended_resource_requests=worker_extended_resources,
)

# Ignore the warning here for the lack of a ClusterConfiguration
with warnings.catch_warnings():
warnings.filterwarnings(
Expand Down
10 changes: 8 additions & 2 deletions src/codeflare_sdk/ray/cluster/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,5 +758,11 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):

# Make sure to always keep this function last
def test_cleanup():
os.remove(f"{aw_dir}test-all-params.yaml")
os.remove(f"{aw_dir}aw-all-params.yaml")
# Remove files only if they exist
test_file = f"{aw_dir}test-all-params.yaml"
if os.path.exists(test_file):
os.remove(test_file)

aw_file = f"{aw_dir}aw-all-params.yaml"
if os.path.exists(aw_file):
os.remove(aw_file)
1 change: 1 addition & 0 deletions src/codeflare_sdk/ray/rayjobs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .rayjob import RayJob
111 changes: 111 additions & 0 deletions src/codeflare_sdk/ray/rayjobs/rayjob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
RayJob client for submitting and managing Ray jobs using the odh-kuberay-client.
"""

import logging
from typing import Dict, Any, Optional
from odh_kuberay_client.kuberay_job_api import RayjobApi

# Set up logging
logger = logging.getLogger(__name__)


class RayJob:
"""
A client for managing Ray jobs using the KubeRay operator.

This class provides a simplified interface for submitting and managing
Ray jobs in a Kubernetes cluster with the KubeRay operator installed.
"""

def __init__(
self,
job_name: str,
cluster_name: str,
namespace: str = "default",
entrypoint: str = "None",
runtime_env: Optional[Dict[str, Any]] = None,
):
"""
Initialize a RayJob instance.

Args:
name: The name for the Ray job
namespace: The Kubernetes namespace to submit the job to (default: "default")
cluster_name: The name of the Ray cluster to submit the job to
**kwargs: Additional configuration options
"""
self.name = job_name
self.namespace = namespace
self.cluster_name = cluster_name
self.entrypoint = entrypoint
self.runtime_env = runtime_env

# Initialize the KubeRay job API client
self._api = RayjobApi()

logger.info(f"Initialized RayJob: {self.name} in namespace: {self.namespace}")

def submit(
self,
) -> str:
"""
Submit the Ray job to the Kubernetes cluster.

Args:
entrypoint: The Python script or command to run
runtime_env: Ray runtime environment configuration (optional)

Returns:
The job ID/name if submission was successful

Raises:
RuntimeError: If the job has already been submitted or submission fails
"""
# Build the RayJob custom resource
rayjob_cr = self._build_rayjob_cr(
entrypoint=self.entrypoint,
runtime_env=self.runtime_env,
)

# Submit the job
logger.info(
f"Submitting RayJob {self.name} to RayCluster {self.cluster_name} in namespace {self.namespace}"
)
result = self._api.submit_job(k8s_namespace=self.namespace, job=rayjob_cr)

if result:
logger.info(f"Successfully submitted RayJob {self.name}")
return self.name
else:
raise RuntimeError(f"Failed to submit RayJob {self.name}")

def _build_rayjob_cr(
self,
entrypoint: str,
runtime_env: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
Build the RayJob custom resource specification.

This creates a minimal RayJob CR that can be extended later.
"""
# Basic RayJob custom resource structure
rayjob_cr = {
"apiVersion": "ray.io/v1",
"kind": "RayJob",
"metadata": {
"name": self.name,
"namespace": self.namespace,
},
"spec": {
"entrypoint": entrypoint,
"clusterSelector": {"ray.io/cluster": self.cluster_name},
},
}

# Add runtime environment if specified
if runtime_env:
rayjob_cr["spec"]["runtimeEnvYAML"] = str(runtime_env)

return rayjob_cr
88 changes: 88 additions & 0 deletions src/codeflare_sdk/ray/rayjobs/test_rayjob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright 2024 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from unittest.mock import MagicMock
from codeflare_sdk.ray.rayjobs.rayjob import RayJob


def test_rayjob_submit_success(mocker):
"""Test successful RayJob submission."""
# Mock kubernetes config loading
mocker.patch("kubernetes.config.load_kube_config")

# Mock the RayjobApi class entirely
mock_api_class = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi")
mock_api_instance = MagicMock()
mock_api_class.return_value = mock_api_instance

# Configure the mock to return success when submit is called
mock_api_instance.submit.return_value = {"metadata": {"name": "test-rayjob"}}

# Create RayJob instance
rayjob = RayJob(
job_name="test-rayjob",
cluster_name="test-ray-cluster",
namespace="test-namespace",
entrypoint="python -c 'print(\"hello world\")'",
runtime_env={"pip": ["requests"]},
)

# Submit the job
job_id = rayjob.submit()

# Assertions
assert job_id == "test-rayjob"

# Verify the API was called with correct parameters
mock_api_instance.submit_job.assert_called_once()
call_args = mock_api_instance.submit_job.call_args

# Check the namespace parameter
assert call_args.kwargs["k8s_namespace"] == "test-namespace"

# Check the job custom resource
job_cr = call_args.kwargs["job"]
assert job_cr["metadata"]["name"] == "test-rayjob"
assert job_cr["metadata"]["namespace"] == "test-namespace"
assert job_cr["spec"]["entrypoint"] == "python -c 'print(\"hello world\")'"
assert job_cr["spec"]["clusterSelector"]["ray.io/cluster"] == "test-ray-cluster"
assert job_cr["spec"]["runtimeEnvYAML"] == "{'pip': ['requests']}"


def test_rayjob_submit_failure(mocker):
"""Test RayJob submission failure."""
# Mock kubernetes config loading
mocker.patch("kubernetes.config.load_kube_config")

# Mock the RayjobApi class entirely
mock_api_class = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi")
mock_api_instance = MagicMock()
mock_api_class.return_value = mock_api_instance

# Configure the mock to return failure (False/None) when submit_job is called
mock_api_instance.submit_job.return_value = None

# Create a RayJob instance
rayjob = RayJob(
job_name="test-rayjob",
cluster_name="test-ray-cluster",
namespace="default",
entrypoint="python script.py",
runtime_env={"pip": ["numpy"]},
)

# Test that RuntimeError is raised on failure
with pytest.raises(RuntimeError, match="Failed to submit RayJob test-rayjob"):
rayjob.submit()
2 changes: 2 additions & 0 deletions tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ spec:
controller-tools.k8s.io: '1.0'
key1: value1
key2: value2
ray.io/cluster: aw-all-params
name: aw-all-params
namespace: ns
spec:
Expand All @@ -38,6 +39,7 @@ spec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
dashboard-port: '8265'
num-gpus: '1'
resources: '"{\"TPU\": 2}"'
serviceType: ClusterIP
Expand Down
2 changes: 2 additions & 0 deletions tests/test_cluster_yamls/kueue/aw_kueue.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ spec:
metadata:
labels:
controller-tools.k8s.io: '1.0'
ray.io/cluster: unit-test-aw-kueue
name: unit-test-aw-kueue
namespace: ns
spec:
Expand All @@ -32,6 +33,7 @@ spec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
dashboard-port: '8265'
num-gpus: '0'
resources: '"{}"'
serviceType: ClusterIP
Expand Down
2 changes: 2 additions & 0 deletions tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ spec:
metadata:
labels:
controller-tools.k8s.io: '1.0'
ray.io/cluster: unit-test-cluster-kueue
name: unit-test-cluster-kueue
namespace: ns
spec:
Expand All @@ -32,6 +33,7 @@ spec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
dashboard-port: '8265'
num-gpus: '0'
resources: '"{}"'
serviceType: ClusterIP
Expand Down
2 changes: 2 additions & 0 deletions tests/test_cluster_yamls/ray/default-appwrapper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ spec:
metadata:
labels:
controller-tools.k8s.io: '1.0'
ray.io/cluster: default-appwrapper
name: default-appwrapper
namespace: ns
spec:
Expand All @@ -30,6 +31,7 @@ spec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
dashboard-port: '8265'
num-gpus: '0'
resources: '"{}"'
serviceType: ClusterIP
Expand Down
2 changes: 2 additions & 0 deletions tests/test_cluster_yamls/ray/default-ray-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ kind: RayCluster
metadata:
labels:
controller-tools.k8s.io: '1.0'
ray.io/cluster: default-cluster
name: default-cluster
namespace: ns
spec:
Expand All @@ -22,6 +23,7 @@ spec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
dashboard-port: '8265'
num-gpus: '0'
resources: '"{}"'
serviceType: ClusterIP
Expand Down
2 changes: 2 additions & 0 deletions tests/test_cluster_yamls/ray/unit-test-all-params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ metadata:
key1: value1
key2: value2
kueue.x-k8s.io/queue-name: local-queue-default
ray.io/cluster: test-all-params
name: test-all-params
namespace: ns
spec:
Expand All @@ -29,6 +30,7 @@ spec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
dashboard-port: '8265'
num-gpus: '1'
resources: '"{\"TPU\": 2}"'
serviceType: ClusterIP
Expand Down
Loading