diff --git a/include/acl_hal.h b/include/acl_hal.h index b1ee2711..e0fb58e3 100644 --- a/include/acl_hal.h +++ b/include/acl_hal.h @@ -63,7 +63,7 @@ typedef void (*acl_device_update_callback)( unsigned physical_device_id, CL_EXCEPTION_TYPE_INTEL exception_type, void *user_private_info, size_t user_cb); typedef void (*acl_process_printf_buffer_callback)(int activation_id, int size, - int stalled); + int debug_dump_printf); ///@} typedef unsigned int mem_properties_t; diff --git a/include/acl_kernel_if.h b/include/acl_kernel_if.h index a0b00207..43e363e0 100644 --- a/include/acl_kernel_if.h +++ b/include/acl_kernel_if.h @@ -50,6 +50,9 @@ typedef struct { // Track which of the kernels is the autorun profiling kernel (-1 if none) int autorun_profiling_kernel_id; + + // Track debug printf activity + time_ns last_printf_dump = 0; } acl_kernel_if; // *********************** Public functions ************************** diff --git a/include/acl_types.h b/include/acl_types.h index 123bc690..0536838e 100644 --- a/include/acl_types.h +++ b/include/acl_types.h @@ -922,6 +922,10 @@ typedef struct _cl_kernel { // Eventually this should be an array of ACLDeviceBinaries similar // to how cl_program contains an array of dev_prog. const acl_device_binary_t *dev_bin; + + // In ACL_HAL_DEBUG mode, printf buffer could be dumped before Kernel ends + // Therefore, we need to keep track of how much data has been processed. + size_t processed_printf_buffer_size; } _cl_kernel; ACL_DECLARE_CL_OBJECT_ALLOC_FUNCTIONS(cl_kernel); @@ -1346,6 +1350,10 @@ typedef struct { // it sees the CL_COMPLETE status for the first time, but it won't // change after that. cl_uint num_printf_bytes_pending; + + // Indicate whether this operation is dumping printf buffer before the Kernel + // for debug purpose + int debug_dump_printf = 0; } acl_device_op_info_t; // An operation to be performed on a device. diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp index 0ec9b16f..6704c440 100644 --- a/src/acl_kernel.cpp +++ b/src/acl_kernel.cpp @@ -2419,6 +2419,9 @@ static int l_init_kernel(cl_kernel kernel, cl_program program, // Check if there are printfs in the kernel kernel->printf_device_buffer = 0; // Default is none. kernel->printf_device_ptr = 0; // Default is none. + // Keep track of already processed buffer size + // It will be reset when the buffer is full and dumped. + kernel->processed_printf_buffer_size = 0; if (!accel_def->printf_format_info.empty()) { auto gmem_idx = static_cast( acl_get_default_memory(kernel->dev_bin->get_devdef())); diff --git a/src/acl_kernel_if.cpp b/src/acl_kernel_if.cpp index 47934a24..c0b836c5 100644 --- a/src/acl_kernel_if.cpp +++ b/src/acl_kernel_if.cpp @@ -1369,8 +1369,9 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { ":: Calling acl_process_printf_buffer_fn with " "activation_id=%d and printf_size=%u.\n", activation_id, printf_size); - // update status, which will dump the printf buffer - acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1); + // update status, which will dump the printf buffer, set + // debug_dump_printf = 0 + acl_process_printf_buffer_fn(activation_id, (int)printf_size, 0); ACL_KERNEL_IF_DEBUG_MSG( kern, ":: Accelerator %d new csr is %x.\n", k, @@ -1432,7 +1433,6 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { kern->accel_job_ids[k][next_queue_back] = -1; #ifdef TEST_PROFILING_HARDWARE - // Test readback of fake profile data using the acl_hal_mmd function that // would be called from the acl runtime. ACL_KERNEL_IF_DEBUG_MSG( @@ -1495,10 +1495,39 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { } } +void acl_kernel_if_debug_dump_printf(acl_kernel_if *kern, unsigned k) { + acl_assert_locked(); + unsigned int printf_size = 0; + int activation_id; + unsigned int next_queue_back; + + if (kern->accel_queue_back[k] == (int)kern->accel_invoc_queue_depth[k] - 1) + next_queue_back = 0; + else + next_queue_back = kern->accel_queue_back[k] + 1; + + if (kern->accel_num_printfs[k] > 0) { + acl_kernel_cra_read(kern, k, KERNEL_OFFSET_PRINTF_BUFFER_SIZE, + &printf_size); + assert(printf_size <= ACL_PRINTF_BUFFER_TOTAL_SIZE); + ACL_KERNEL_IF_DEBUG_MSG( + kern, ":: Accelerator %d printf buffer size is %d.\n", k, printf_size); + activation_id = kern->accel_job_ids[k][next_queue_back]; + ACL_KERNEL_IF_DEBUG_MSG(kern, + ":: Calling acl_process_printf_buffer_fn with " + "activation_id=%d and printf_size=%u.\n", + activation_id, printf_size); + + // set debug_dump_printf to 1 + acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1); + } +} + void acl_kernel_if_dump_status(acl_kernel_if *kern) { int expect_kernel = 0; unsigned k, i; acl_assert_locked(); + for (k = 0; k < kern->num_accel; ++k) { for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) { if (kern->accel_job_ids[k][i] >= 0) { @@ -1510,10 +1539,6 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) { if (!expect_kernel) return; - kern->io.printf("No kernel updates in approximately 10 seconds for device %u", - kern->physical_device_id); - kern->io.printf(" ... a kernel may be hung?\n"); - for (k = 0; k < kern->num_accel; ++k) { unsigned int csr; @@ -1539,6 +1564,9 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) { if (ACL_KERNEL_READ_BIT(csr, KERNEL_CSR_LMEM_INVALID_BANK)) kern->io.printf(" lm_bank_exception"); + // Dump the printf buffer to stdout + acl_kernel_if_debug_dump_printf(kern, k); + unsigned buffered_kernel_invocation = 0; for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) { unsigned int next_queue_back; @@ -1576,14 +1604,23 @@ void acl_kernel_if_check_kernel_status(acl_kernel_if *kern) { #endif acl_assert_locked(); - // Print kernel status if it hasn't done anything in a while - // If multiple thread calls this, only one will print every 10 seconds if (kern->last_kern_update != 0 && (acl_kernel_if_get_time_us(kern) - kern->last_kern_update > 10 * 1000000)) { kern->last_kern_update = acl_kernel_if_get_time_us(kern); - if (kern->io.debug_verbosity > 0) + kern->io.printf( + "No kernel updates in approximately 10 seconds for device %u", + kern->physical_device_id); + kern->io.printf(" ... a kernel may be hung?\n"); + acl_kernel_if_dump_status(kern); + } else if (kern->io.debug_verbosity >= 3) { + // If ACL_HAL_DEBUG >= 3, the status will be printed even the server isn't + // hang every 10 seconds. + if (acl_kernel_if_get_time_us(kern) - kern->last_printf_dump > + 10 * 1000000) { + kern->last_printf_dump = acl_kernel_if_get_time_us(kern); acl_kernel_if_dump_status(kern); + } } #ifdef POLLING diff --git a/src/acl_printf.cpp b/src/acl_printf.cpp index 4069c102..fa66a8b7 100644 --- a/src/acl_printf.cpp +++ b/src/acl_printf.cpp @@ -38,8 +38,8 @@ #pragma GCC visibility push(protected) #endif -static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, - unsigned size); +static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel, + unsigned size); static void decode_string(std::string &print_data); // stores to global memory are bound by smallest global memory bandwidth we @@ -719,12 +719,13 @@ static std::string::const_iterator get_data_elem_at_offset( return end_of_string; } -static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, - unsigned size) { +static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel, + unsigned size) { unsigned global_offset; // the location in the printf buffer unsigned single_printf_offset; // the offset of a single printf void (*hal_dma_fn)(cl_event, const void *, void *, size_t) = 0; int src_on_host = 1; + size_t dumped_buffer_size = 0; #ifdef _WIN32 __declspec(align(64)) char buffer[ACL_PRINTF_BUFFER_TOTAL_SIZE]; // Aligned to 64, for dma transfers @@ -743,11 +744,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (!verify_types()) { printf("Host data types are incompatible with ACL compiler, ignoring " "printfs...\n"); - return; + return dumped_buffer_size; } if (printf_infos.empty()) - return; + return dumped_buffer_size; // Memory is on the device if all of these are true: // The memory is not SVM or the device does not support SVM. @@ -769,8 +770,28 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, hal_dma_fn = acl_get_hal()->copy_globalmem_to_hostmem; } - hal_dma_fn(NULL, kernel->printf_device_buffer->block_allocation->range.begin, - buffer, size); + // It needs the context from ACL_HAL_DEBUG instead of ACL_DEBUG + if (acl_get_hal()->get_debug_verbosity && + acl_get_hal()->get_debug_verbosity() > 0) { + printf("Previously processed buffer size is %zu \n", + kernel->processed_printf_buffer_size); + } + + // Check if we have already processed all the printf buffer + if (size > (unsigned int)kernel->processed_printf_buffer_size) { + void *unprocessed_begin = (void *)((char *)kernel->printf_device_buffer + ->block_allocation->range.begin + + kernel->processed_printf_buffer_size); + assert(size >= kernel->processed_printf_buffer_size); + dumped_buffer_size = size - kernel->processed_printf_buffer_size; + hal_dma_fn(NULL, unprocessed_begin, buffer, dumped_buffer_size); + } else { + if (acl_get_hal()->get_debug_verbosity && + acl_get_hal()->get_debug_verbosity() > 0) { + printf("All Printf() buffer has already been dumped \n"); + } + return dumped_buffer_size; + } #ifdef DEBUG if (debug_mode > 0) { @@ -789,11 +810,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, } } #endif - // always 32-byte aligned address (this may change if printf chunks can be // of different sizes ) // process all the printfs as long as there is data - for (global_offset = 0, single_printf_offset = 0; global_offset < size; + for (global_offset = 0, single_printf_offset = 0; + global_offset < dumped_buffer_size; global_offset += single_printf_offset) { // the first 4-bytes is the index of the format string @@ -820,7 +841,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (!success) { acl_print_debug_msg( "corrupt printf data, ignoring remaining printfs...\n"); - return; + return dumped_buffer_size; } #ifdef DEBUG @@ -883,7 +904,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (vector_size == -1) { acl_print_debug_msg("wrong vector specifier in printf call, ignoring " "remaining printfs...\n"); - return; + return dumped_buffer_size; } // get the length specifier @@ -904,7 +925,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (size_of_data == 0) { acl_print_debug_msg("wrong length modifier in printf call, ignoring " "remaining printfs...\n"); - return; + return dumped_buffer_size; } for (i = 0; i < vector_size; i++) { @@ -960,13 +981,14 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, #ifdef DEBUG printf("exiting acl_dump_buffer...\n"); #endif + return dumped_buffer_size; } // // Schedule enqueue read buffer to read printf buffer // The activation ID is the device op ID. void acl_schedule_printf_buffer_pickup(int activation_id, int size, - int stalled) { + int debug_dump_printf) { acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); // This function can potentially be called by a HAL that does not use the @@ -980,17 +1002,17 @@ void acl_schedule_printf_buffer_pickup(int activation_id, int size, } #ifdef DEBUG - printf("printf pickup %d %d %d\n", activation_id, size, stalled); + printf("printf pickup %d %d\n", activation_id, size); fflush(stdout); #endif if (activation_id >= 0 && activation_id < doq->max_ops) { // This address is stable, given a fixed activation_id. // So we don't run into race conditions. acl_device_op_t *op = doq->op + activation_id; - - stalled = stalled; // this argument is no longer used! - op->info.num_printf_bytes_pending = (cl_uint)size; + + // Propagate the operation info + op->info.debug_dump_printf = debug_dump_printf ? 1 : 0; } // Signal all waiters. acl_signal_device_update(); @@ -1009,7 +1031,15 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) { // Grab the printf data and emit it. cl_uint num_bytes = op->info.num_printf_bytes_pending; - l_dump_printf_buffer(event, kernel, num_bytes); + size_t dumped_buffer_size = l_dump_printf_buffer(event, kernel, num_bytes); + + if (op->info.debug_dump_printf) { + // Update the already processed buffer size + kernel->processed_printf_buffer_size += dumped_buffer_size; + } else { + // Full dump, reset this variable + kernel->processed_printf_buffer_size = 0; + } // Mark this printf work as done. Must do this *before* unstalling // the kernel, to avoid a race against the kernel filling up the @@ -1021,8 +1051,13 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) { acl_memory_barrier(); // Allow the kernel to continue running. - acl_get_hal()->unstall_kernel( - event->cmd.info.ndrange_kernel.device->def.physical_device_id, op->id); + // We don't need to unstall the kernel during the early flushing during + // debug. + if (!op->info.debug_dump_printf) { + acl_get_hal()->unstall_kernel( + event->cmd.info.ndrange_kernel.device->def.physical_device_id, + op->id); + } } } @@ -1084,4 +1119,4 @@ static void decode_string(std::string &print_data) { #ifdef __GNUC__ #pragma GCC visibility pop -#endif +#endif \ No newline at end of file diff --git a/test/acl_device_op_test.cpp b/test/acl_device_op_test.cpp index 6a608bb0..e6c116fb 100644 --- a/test/acl_device_op_test.cpp +++ b/test/acl_device_op_test.cpp @@ -363,6 +363,7 @@ TEST(device_op, submit_action) { CHECK(l_launch_kernel == the_call); CHECK_EQUAL(0, unblocked); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, op->info.debug_dump_printf); CHECK(prev < (latest = op->timestamp[CL_SUBMITTED])); prev = latest; @@ -400,6 +401,7 @@ TEST(device_op, submit_action) { op->info.type = ACL_DEVICE_OP_KERNEL; op->status = op->execution_status = CL_RUNNING; // Required op->info.num_printf_bytes_pending = 1; + CHECK_EQUAL(0, op->info.debug_dump_printf); acl_submit_device_op(&m_doq, op); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind); CHECK_EQUAL(op->info.type, submit_kind); @@ -409,6 +411,7 @@ TEST(device_op, submit_action) { CHECK_EQUAL(1, unblocked); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as // stalled on printf. + CHECK_EQUAL(0, op->info.debug_dump_printf); // We didn't change the submit time. CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]); prev = latest; @@ -419,6 +422,7 @@ TEST(device_op, submit_action) { op->info.type = ACL_DEVICE_OP_KERNEL; op->status = op->execution_status = CL_COMPLETE; // Required op->info.num_printf_bytes_pending = 1; + CHECK_EQUAL(0, op->info.debug_dump_printf); acl_submit_device_op(&m_doq, op); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind); CHECK_EQUAL(op->info.type, submit_kind); @@ -428,6 +432,7 @@ TEST(device_op, submit_action) { CHECK_EQUAL(1, unblocked); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as // stalled on printf. + CHECK_EQUAL(0, op->info.debug_dump_printf); // We didn't change the submit time. CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]); prev = latest; diff --git a/test/acl_kernel_test.cpp b/test/acl_kernel_test.cpp index 9d3ec4df..c29272f6 100644 --- a/test/acl_kernel_test.cpp +++ b/test/acl_kernel_test.cpp @@ -4483,7 +4483,9 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, op->info.debug_dump_printf); + // Testing debug_printf_dump // Pretend the kernel actually put something in the device buffer. CHECK(k->printf_device_buffer); cl_mem printf_buf = k->printf_device_buffer; @@ -4495,16 +4497,13 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { const unsigned int printf_bytes = 8; *((int *)printf_buf->block_allocation->range.begin) = printf_conversion_index; *(((int *)printf_buf->block_allocation->range.begin) + 1) = printf_data; - // Check operation of printf-pickup-scheduler call back function. // Activation id should be the device op id! - CHECK_EQUAL(0, op->info.num_printf_bytes_pending); ACL_LOCKED(acl_schedule_printf_buffer_pickup(op->id, printf_bytes, - 0 /*ignored argument*/)); + 1 /*Debug printf dump*/)); CHECK_EQUAL( printf_bytes, acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending); - ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(10, m_devlog.num_ops); @@ -4515,14 +4514,48 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(printf_bytes, op->info.num_printf_bytes_pending); + CHECK_EQUAL(1, op->info.debug_dump_printf); op = &(m_devlog.after[9]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // should have cleared it. + CHECK_EQUAL(1, op->info.debug_dump_printf); + CHECK_EQUAL(printf_bytes, k->processed_printf_buffer_size); + ACL_LOCKED(acl_idle_update(m_context)); + + // Testing normal printf dump + *(((int *)printf_buf->block_allocation->range.begin) + 2) = printf_data; + // Check operation of printf-pickup-scheduler call back function. + // Activation id should be the device op id! + // Now we have two printf_data in the buffer, therefore the size is doubled. + ACL_LOCKED(acl_schedule_printf_buffer_pickup(op->id, printf_bytes * 2, + 0 /*Not debug printf dump*/)); + CHECK_EQUAL( + printf_bytes * 2, + acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending); + ACL_LOCKED(acl_idle_update(m_context)); + + CHECK_EQUAL(11, m_devlog.num_ops); + + op = &(m_devlog.before[10]); + CHECK_EQUAL(acl_platform.device_op_queue.op + op->id, ke->current_device_op); + CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); + CHECK_EQUAL(CL_RUNNING, op->status); + CHECK_EQUAL(ke, op->info.event); + CHECK_EQUAL(printf_bytes * 2, op->info.num_printf_bytes_pending); + op = &(m_devlog.after[10]); + CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); + CHECK_EQUAL(CL_RUNNING, op->status); + CHECK_EQUAL(ke, op->info.event); + CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // should have cleared it. + CHECK_EQUAL(0, op->info.debug_dump_printf); + CHECK_EQUAL(0, k->processed_printf_buffer_size); // A full dump should reset + // this variable to 0 ACL_LOCKED(acl_idle_update(m_context)); + // Testing normal printf dump end // Say it's complete, but with some printf stuff to clean up. ACL_LOCKED(acl_receive_kernel_update(op->id, CL_COMPLETE)); @@ -4532,34 +4565,39 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { // Set printf bytes which should be picked up acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending = printf_bytes; + CHECK_EQUAL(0, op->info.debug_dump_printf); ACL_LOCKED(acl_idle_update(m_context)); // bump scheduler CHECK_EQUAL(CL_COMPLETE, l_find_op(op->id)->execution_status); CHECK_EQUAL(CL_COMPLETE, l_find_op(op->id)->status); // Now it's copied up. - CHECK_EQUAL(12, m_devlog.num_ops); + CHECK_EQUAL(13, m_devlog.num_ops); // #10 is the printf flush. - op = &(m_devlog.before[10]); + op = &(m_devlog.before[11]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(CL_COMPLETE, op->execution_status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(printf_bytes, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, op->info.debug_dump_printf); - op = &(m_devlog.after[10]); + op = &(m_devlog.after[11]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(CL_COMPLETE, op->execution_status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // should have cleared it. + CHECK_EQUAL(0, op->info.debug_dump_printf); // #11 is the completion. - op = &(m_devlog.before[11]); + op = &(m_devlog.before[12]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_COMPLETE, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, k->processed_printf_buffer_size); + CHECK_EQUAL(0, op->info.debug_dump_printf); CHECK_EQUAL(0, ke->current_device_op); // DONE!