From 06324e28b5607ad22b9f646bbff15514f37dd759 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 3 Dec 2021 10:03:24 -0800 Subject: [PATCH] Add buffer location property Sycl runtime calls clEnqueueWriteBuffer before clSetKernelArgs, this cause the buffer to be allocated on the device before knowing the right place of allocating the memory. As a result, later when kernel get invoked, the memory has to be copied from device's default global memory to the buffer location specified in kernel. This is an additional memory copy operation. This extension does not interfer with the other way of setting buffer location (i.e through clSetKernelArgs). This property exist for integration with sycl runtime, not for pure opencl user to use. If opencl user wish to use this property, they have to make sure the buffer location passed into clCreateBufferWithPropertyINTEL has to match the one defined in kernel function interface, otherwise the extra memory copy issue will remain. When resizing reserved allocation, we now have the information to allocate minimum amount of space required according to the property passed in. This is done to align with opencl docs and header in; https://github.com/KhronosGroup/OpenCL-Headers/pull/193 https://github.com/KhronosGroup/OpenCL-Docs/pull/746 --- src/acl_mem.cpp | 15 ++++++++++++-- src/acl_usm.cpp | 4 ++-- test/acl_mem_test.cpp | 46 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 62258405..59481e9b 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -416,6 +416,7 @@ CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL( cl_bool context_has_device_with_physical_mem; unsigned int idevice; cl_uint bank_id = 0; + cl_uint tmp_mem_id = 0; acl_lock(); #ifdef MEM_DEBUG_MSG @@ -431,6 +432,9 @@ CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL( } bank_id = (cl_uint) * (properties + 1); } break; + case CL_MEM_ALLOC_BUFFER_LOCATION_INTEL: { + tmp_mem_id = (cl_uint) * (properties + 1); + } break; default: { UNLOCK_BAIL_INFO(CL_INVALID_DEVICE, context, "Invalid properties"); } @@ -553,6 +557,7 @@ CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL( UNLOCK_BAIL_INFO(CL_OUT_OF_HOST_MEMORY, context, "Could not allocate a cl_mem object"); } + mem->mem_id = tmp_mem_id; mem->block_allocation = new_block; mem->block_allocation->mem_obj = mem; @@ -784,7 +789,6 @@ CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL( mem->context = context; mem->flags = flags; mem->size = size; - mem->mem_id = 0; mem->bank_id = 0; if (is_SOC_device()) { @@ -1254,7 +1258,7 @@ CL_API_ENTRY cl_mem CL_API_CALL clCreateSubBufferIntelFPGA( mem->context = context; mem->flags = sub_flags; - mem->mem_id = 0; + mem->mem_id = buffer->mem_id; if (is_SOC_device()) { // HPS DDR is system managed for SoC. @@ -1372,6 +1376,9 @@ CL_API_ENTRY cl_int CL_API_CALL clGetMemObjectInfoIntelFPGA( context = mem->context; switch (param_name) { + case CL_MEM_ALLOC_BUFFER_LOCATION_INTEL: + RESULT_UINT(mem->mem_id); + break; case CL_MEM_TYPE: RESULT_ENUM(mem->mem_object_type); break; @@ -4417,6 +4424,10 @@ void acl_resize_reserved_allocations_for_device(cl_mem mem, unsigned int num_global_mem_systems = def.autodiscovery_def.num_global_mem_systems; + // When we don't know how many memory systems will exist + // Load as much as needed. + num_global_mem_systems = std::max(num_global_mem_systems, mem->mem_id + 1); + // For the simulation flow we don't know how many memory systems will exist // until we load the .aocx, which may not happen until somewhat later. // Reserving space is quite cheap, so reserve space for many memory systems. diff --git a/src/acl_usm.cpp b/src/acl_usm.cpp index df7f4b0d..23b37d9d 100644 --- a/src/acl_usm.cpp +++ b/src/acl_usm.cpp @@ -255,8 +255,8 @@ clDeviceMemAllocINTEL(cl_context context, cl_device_id device, cl_int status; // Use cl_mem for convenience - cl_mem usm_device_buffer = - clCreateBufferIntelFPGA(context, CL_MEM_READ_WRITE, size, NULL, &status); + cl_mem usm_device_buffer = clCreateBufferWithPropertiesINTEL( + context, NULL, CL_MEM_READ_WRITE, size, NULL, &status); if (status != CL_SUCCESS) { UNLOCK_BAIL_INFO(status, context, "Failed to allocate device memory"); } diff --git a/test/acl_mem_test.cpp b/test/acl_mem_test.cpp index 497e2d4c..d9df11e7 100644 --- a/test/acl_mem_test.cpp +++ b/test/acl_mem_test.cpp @@ -2799,6 +2799,52 @@ TEST(acl_mem, case_205751_overlapping_alloc) { CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(c)); } +TEST(acl_mem, buffer_location_property) { + ACL_LOCKED(acl_print_debug_msg("begin buffer_location_property\n")); + // Test assumes more than 1 global memory space + // Allocate a small buffer (a), then try to allocate two buffers (b, c) of + // size bank_size. Expect the second allocation to fail. + cl_mem a; + cl_int status = CL_SUCCESS; + size_t total_size = ACL_RANGE_SIZE( + m_device[0]->def.autodiscovery_def.global_mem_defs[0].range); + size_t bank_size = total_size / 2; + size_t small_size = bank_size / 1024; + + cl_mem_properties_intel props[] = {CL_MEM_ALLOC_BUFFER_LOCATION_INTEL, 0, 0}; + a = clCreateBufferWithPropertiesINTEL(m_context, props, 0, bank_size, 0, + &status); + ACL_LOCKED(CHECK(acl_mem_is_valid(a))); + CHECK_EQUAL(CL_SUCCESS, status); + assert(a); + CHECK_EQUAL(1, acl_ref_count(a)); + cl_uint read_mem_id = 4; + size_t size_ret; + CHECK_EQUAL(CL_SUCCESS, + clGetMemObjectInfo(a, CL_MEM_ALLOC_BUFFER_LOCATION_INTEL, + sizeof(cl_uint), &read_mem_id, &size_ret)); + CHECK_EQUAL(0, read_mem_id); + + cl_buffer_region test_region = {0, 2}; + cl_mem subbuffer = + clCreateSubBuffer(a, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, + &test_region, &status); + ACL_LOCKED(CHECK(acl_mem_is_valid(subbuffer))); + CHECK_EQUAL(CL_SUCCESS, status); + assert(subbuffer); + CHECK_EQUAL(2, acl_ref_count(a)); + read_mem_id = 4; + CHECK_EQUAL(CL_SUCCESS, + clGetMemObjectInfo(subbuffer, CL_MEM_ALLOC_BUFFER_LOCATION_INTEL, + sizeof(cl_uint), &read_mem_id, &size_ret)); + CHECK_EQUAL(0, read_mem_id); + + ACL_LOCKED(CHECK_EQUAL(acl_bind_buffer_to_device(m_cq->device, a), 1)); + + CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(subbuffer)); + CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(a)); +} + MT_TEST(acl_mem, map_buf_bad_flags) { ACL_LOCKED(acl_print_debug_msg("begin buf_bad_flags\n")); cl_int status = CL_SUCCESS;