From 3c557b57bd1f529aea97c2ca2ac5e1b048b8818f Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Thu, 14 Apr 2022 14:40:38 -0400 Subject: [PATCH 1/2] Initialize max number of global memory definition for simulator Simulator does not have any global memory interface information until the actual aocx is loaded. (Note this is only a problem for the simulator, not for hardware runs; in a hardware run, we can communicate with the BSP to query memory interface information.) Prior to loading the aocx it uses the predefined autodiscovery [1] to initialize its global memory interface, which has only 1 global memory. In the sycl runtime flow today, the USM device allocation call happens before the aocx is loaded. The aocx is loaded when clCreateProgram is called, which typically happens on the first kernel launch in the sycl runtime. The USM device allocation on a multi global memory system will fail because there is only 1 global memory in total as defined in [1] but the user is requesting more than 1 device global memory. The user could work around this issue by launching a sacrificial kernel that uses a shared allocation as a kernel argument. This will set up the correct global memory interface in the runtime. This change eliminates the need to run a sacrificial kernel. However, there are a few downsides: 1. The address range/size may not be exactly the same as the one that is in the aocx, but this is not too large of a problem because the runtime's first-fit allocation algorithm will fill the lowest address range first, unless the user requested more than what is available. 2. It potentially occupies more space than required. 3. 
will not error out when the user requests a non-existing device global memory because we are using ACL_MAX_GLOBAL_MEM for num_global_mem_systems [1] https://github.com/intel/fpga-runtime-for-opencl/blob/950f21dd079dfd55a473ba4122a4a9dca450e36f/include/acl_shipped_board_cfgs.h#L7 --- src/acl_kernel_if.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/acl_kernel_if.cpp b/src/acl_kernel_if.cpp index c0b836c5..9501716a 100644 --- a/src/acl_kernel_if.cpp +++ b/src/acl_kernel_if.cpp @@ -715,6 +715,29 @@ int acl_kernel_if_init(acl_kernel_if *kern, acl_bsp_io bsp_io, auto parse_result = acl_load_device_def_from_str( std::string(acl_shipped_board_cfgs[1]), sysdef->device[0].autodiscovery_def, err_msg); + // Fill in definition for all device global memory + // Simulator does not have any global memory interface information until the + // actual aocx is loaded. (Note this is only a problem for simulator not + // hardware run, in hardware run, we can communicate with BSP to query + // memory interface information). In the flow today, the USM device + // allocation call happens before aocx is loaded. The aocx is loaded when + // clCreateProgram is called, which typically happens on first kernel launch + // in sycl runtime. In order to prevent the USM device allocation from + // failing on multi global memory system, initialize as much global memory + // system as possible for simulation flow. However there are a few downsides: + // 1. The address range/size may not be exactly the same as the one that is + // in aocx, but this is not too large of a problem because runtime first fit + // allocation algorithm will fill the lowest address range first. Unless + // user requested more than what is available. + // 2. it potentially occupies more space than required + // 3. 
will not error out when user requested a non-existing device global + // memory because we are using ACL_MAX_GLOBAL_MEM for num_global_mem_systems + sysdef->device[0].autodiscovery_def.num_global_mem_systems = + ACL_MAX_GLOBAL_MEM; + for (int i = 0; i < ACL_MAX_GLOBAL_MEM; i++) { + sysdef->device[0].autodiscovery_def.global_mem_defs[i] = + sysdef->device[0].autodiscovery_def.global_mem_defs[0]; + } if (parse_result) sysdef->num_devices = 1; // Override the device name to the simulator. From 93696eb437f141a08e103cda9212980fc3f79a4d Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Mon, 18 Apr 2022 15:40:32 -0400 Subject: [PATCH 2/2] Revert "Fix simulation of multi-memory systems" This reverts commit d9df7a9ed68d1343342666a0466d154561599a1a. --- src/acl_mem.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 59481e9b..bb153f01 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -4424,19 +4424,6 @@ void acl_resize_reserved_allocations_for_device(cl_mem mem, unsigned int num_global_mem_systems = def.autodiscovery_def.num_global_mem_systems; - // When we don't know how many memory systems will exist - // Load as much as needed. - num_global_mem_systems = std::max(num_global_mem_systems, mem->mem_id + 1); - - // For the simulation flow we don't know how many memory systems will exist - // until we load the .aocx, which may not happen until somewhat later. - // Reserving space is quite cheap, so reserve space for many memory systems. - int offline_mode = 0; - (void)acl_get_offline_device_user_setting(&offline_mode); - if (offline_mode == ACL_CONTEXT_MPSIM) { - num_global_mem_systems = std::max(num_global_mem_systems, 128u); - } - #ifdef MEM_DEBUG_MSG printf( "resizing reserved_allocations, physical_device_id:%u, target_size:%u \n",