From 94cbd654a3193fe905ca9398ffd55dfd993e3e47 Mon Sep 17 00:00:00 2001 From: "zibai.wang" Date: Wed, 2 Feb 2022 16:23:30 -0500 Subject: [PATCH] Support early print buffer flushing in debug mode Added a flow to flush printf() buffer before the Kernel ends. This will be helpful for debugging a hang Kernel. The Runtime periodically polls from the device and process the printf buffer from the last processed point. If ACL_HAL_DEBUG = 1 and the Kernel is Hang, Kernel status (existing feature) and printf buffer will be flushed to stdout every 10 seconds (approximately). The thread will run if there are spare cycles. If ACL_HAL_DEBUG = 3, Kernel status and printf buffer will flushed every 10 seconds, even the Kernel isn't hung. The behavior of printf() is not altered in non-debug mode. --- include/acl_hal.h | 2 +- include/acl_kernel_if.h | 3 ++ include/acl_types.h | 8 ++++ src/acl_kernel.cpp | 3 ++ src/acl_kernel_if.cpp | 57 +++++++++++++++++++++----- src/acl_printf.cpp | 79 ++++++++++++++++++++++++++----------- test/acl_device_op_test.cpp | 5 +++ test/acl_kernel_test.cpp | 54 +++++++++++++++++++++---- 8 files changed, 170 insertions(+), 41 deletions(-) diff --git a/include/acl_hal.h b/include/acl_hal.h index b1ee2711..e0fb58e3 100644 --- a/include/acl_hal.h +++ b/include/acl_hal.h @@ -63,7 +63,7 @@ typedef void (*acl_device_update_callback)( unsigned physical_device_id, CL_EXCEPTION_TYPE_INTEL exception_type, void *user_private_info, size_t user_cb); typedef void (*acl_process_printf_buffer_callback)(int activation_id, int size, - int stalled); + int debug_dump_printf); ///@} typedef unsigned int mem_properties_t; diff --git a/include/acl_kernel_if.h b/include/acl_kernel_if.h index a0b00207..43e363e0 100644 --- a/include/acl_kernel_if.h +++ b/include/acl_kernel_if.h @@ -50,6 +50,9 @@ typedef struct { // Track which of the kernels is the autorun profiling kernel (-1 if none) int autorun_profiling_kernel_id; + + // Track debug printf activity + time_ns last_printf_dump = 0; } acl_kernel_if; // *********************** Public functions ************************** diff --git a/include/acl_types.h b/include/acl_types.h index 123bc690..0536838e 100644 --- a/include/acl_types.h +++ b/include/acl_types.h @@ -922,6 +922,10 @@ typedef struct _cl_kernel { // Eventually this should be an array of ACLDeviceBinaries similar // to how cl_program contains an array of dev_prog. const acl_device_binary_t *dev_bin; + + // In ACL_HAL_DEBUG mode, printf buffer could be dumped before Kernel ends + // Therefore, we need to keep track of how much data has been processed. + size_t processed_printf_buffer_size; } _cl_kernel; ACL_DECLARE_CL_OBJECT_ALLOC_FUNCTIONS(cl_kernel); @@ -1346,6 +1350,10 @@ typedef struct { // it sees the CL_COMPLETE status for the first time, but it won't // change after that. cl_uint num_printf_bytes_pending; + + // Indicate whether this operation is dumping printf buffer before the Kernel + // for debug purpose + int debug_dump_printf = 0; } acl_device_op_info_t; // An operation to be performed on a device. diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp index 0ec9b16f..6704c440 100644 --- a/src/acl_kernel.cpp +++ b/src/acl_kernel.cpp @@ -2419,6 +2419,9 @@ static int l_init_kernel(cl_kernel kernel, cl_program program, // Check if there are printfs in the kernel kernel->printf_device_buffer = 0; // Default is none. kernel->printf_device_ptr = 0; // Default is none. + // Keep track of already processed buffer size + // It will be reset when the buffer is full and dumped. + kernel->processed_printf_buffer_size = 0; if (!accel_def->printf_format_info.empty()) { auto gmem_idx = static_cast( acl_get_default_memory(kernel->dev_bin->get_devdef())); diff --git a/src/acl_kernel_if.cpp b/src/acl_kernel_if.cpp index 47934a24..c0b836c5 100644 --- a/src/acl_kernel_if.cpp +++ b/src/acl_kernel_if.cpp @@ -1369,8 +1369,9 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { ":: Calling acl_process_printf_buffer_fn with " "activation_id=%d and printf_size=%u.\n", activation_id, printf_size); - // update status, which will dump the printf buffer - acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1); + // update status, which will dump the printf buffer, set + // debug_dump_printf = 0 + acl_process_printf_buffer_fn(activation_id, (int)printf_size, 0); ACL_KERNEL_IF_DEBUG_MSG( kern, ":: Accelerator %d new csr is %x.\n", k, @@ -1432,7 +1433,6 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { kern->accel_job_ids[k][next_queue_back] = -1; #ifdef TEST_PROFILING_HARDWARE - // Test readback of fake profile data using the acl_hal_mmd function that // would be called from the acl runtime. ACL_KERNEL_IF_DEBUG_MSG( @@ -1495,10 +1495,39 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { } } +void acl_kernel_if_debug_dump_printf(acl_kernel_if *kern, unsigned k) { + acl_assert_locked(); + unsigned int printf_size = 0; + int activation_id; + unsigned int next_queue_back; + + if (kern->accel_queue_back[k] == (int)kern->accel_invoc_queue_depth[k] - 1) + next_queue_back = 0; + else + next_queue_back = kern->accel_queue_back[k] + 1; + + if (kern->accel_num_printfs[k] > 0) { + acl_kernel_cra_read(kern, k, KERNEL_OFFSET_PRINTF_BUFFER_SIZE, + &printf_size); + assert(printf_size <= ACL_PRINTF_BUFFER_TOTAL_SIZE); + ACL_KERNEL_IF_DEBUG_MSG( + kern, ":: Accelerator %d printf buffer size is %d.\n", k, printf_size); + activation_id = kern->accel_job_ids[k][next_queue_back]; + ACL_KERNEL_IF_DEBUG_MSG(kern, + ":: Calling acl_process_printf_buffer_fn with " + "activation_id=%d and printf_size=%u.\n", + activation_id, printf_size); + + // set debug_dump_printf to 1 + acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1); + } +} + void acl_kernel_if_dump_status(acl_kernel_if *kern) { int expect_kernel = 0; unsigned k, i; acl_assert_locked(); + for (k = 0; k < kern->num_accel; ++k) { for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) { if (kern->accel_job_ids[k][i] >= 0) { @@ -1510,10 +1539,6 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) { if (!expect_kernel) return; - kern->io.printf("No kernel updates in approximately 10 seconds for device %u", - kern->physical_device_id); - kern->io.printf(" ... a kernel may be hung?\n"); - for (k = 0; k < kern->num_accel; ++k) { unsigned int csr; @@ -1539,6 +1564,9 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) { if (ACL_KERNEL_READ_BIT(csr, KERNEL_CSR_LMEM_INVALID_BANK)) kern->io.printf(" lm_bank_exception"); + // Dump the printf buffer to stdout + acl_kernel_if_debug_dump_printf(kern, k); + unsigned buffered_kernel_invocation = 0; for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) { unsigned int next_queue_back; @@ -1576,14 +1604,23 @@ void acl_kernel_if_check_kernel_status(acl_kernel_if *kern) { #endif acl_assert_locked(); - // Print kernel status if it hasn't done anything in a while - // If multiple thread calls this, only one will print every 10 seconds if (kern->last_kern_update != 0 && (acl_kernel_if_get_time_us(kern) - kern->last_kern_update > 10 * 1000000)) { kern->last_kern_update = acl_kernel_if_get_time_us(kern); - if (kern->io.debug_verbosity > 0) + kern->io.printf( + "No kernel updates in approximately 10 seconds for device %u", + kern->physical_device_id); + kern->io.printf(" ... a kernel may be hung?\n"); + acl_kernel_if_dump_status(kern); + } else if (kern->io.debug_verbosity >= 3) { + // If ACL_HAL_DEBUG >= 3, the status will be printed even the server isn't + // hang every 10 seconds. + if (acl_kernel_if_get_time_us(kern) - kern->last_printf_dump > + 10 * 1000000) { + kern->last_printf_dump = acl_kernel_if_get_time_us(kern); acl_kernel_if_dump_status(kern); + } } #ifdef POLLING diff --git a/src/acl_printf.cpp b/src/acl_printf.cpp index 4069c102..fa66a8b7 100644 --- a/src/acl_printf.cpp +++ b/src/acl_printf.cpp @@ -38,8 +38,8 @@ #pragma GCC visibility push(protected) #endif -static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, - unsigned size); +static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel, + unsigned size); static void decode_string(std::string &print_data); // stores to global memory are bound by smallest global memory bandwidth we @@ -719,12 +719,13 @@ static std::string::const_iterator get_data_elem_at_offset( return end_of_string; } -static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, - unsigned size) { +static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel, + unsigned size) { unsigned global_offset; // the location in the printf buffer unsigned single_printf_offset; // the offset of a single printf void (*hal_dma_fn)(cl_event, const void *, void *, size_t) = 0; int src_on_host = 1; + size_t dumped_buffer_size = 0; #ifdef _WIN32 __declspec(align(64)) char buffer[ACL_PRINTF_BUFFER_TOTAL_SIZE]; // Aligned to 64, for dma transfers @@ -743,11 +744,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (!verify_types()) { printf("Host data types are incompatible with ACL compiler, ignoring " "printfs...\n"); - return; + return dumped_buffer_size; } if (printf_infos.empty()) - return; + return dumped_buffer_size; // Memory is on the device if all of these are true: // The memory is not SVM or the device does not support SVM. @@ -769,8 +770,28 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, hal_dma_fn = acl_get_hal()->copy_globalmem_to_hostmem; } - hal_dma_fn(NULL, kernel->printf_device_buffer->block_allocation->range.begin, - buffer, size); + // It needs the context from ACL_HAL_DEBUG instead of ACL_DEBUG + if (acl_get_hal()->get_debug_verbosity && + acl_get_hal()->get_debug_verbosity() > 0) { + printf("Previously processed buffer size is %zu \n", + kernel->processed_printf_buffer_size); + } + + // Check if we have already processed all the printf buffer + if (size > (unsigned int)kernel->processed_printf_buffer_size) { + void *unprocessed_begin = (void *)((char *)kernel->printf_device_buffer + ->block_allocation->range.begin + + kernel->processed_printf_buffer_size); + assert(size >= kernel->processed_printf_buffer_size); + dumped_buffer_size = size - kernel->processed_printf_buffer_size; + hal_dma_fn(NULL, unprocessed_begin, buffer, dumped_buffer_size); + } else { + if (acl_get_hal()->get_debug_verbosity && + acl_get_hal()->get_debug_verbosity() > 0) { + printf("All Printf() buffer has already been dumped \n"); + } + return dumped_buffer_size; + } #ifdef DEBUG if (debug_mode > 0) { @@ -789,11 +810,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, } } #endif - // always 32-byte aligned address (this may change if printf chunks can be // of different sizes ) // process all the printfs as long as there is data - for (global_offset = 0, single_printf_offset = 0; global_offset < size; + for (global_offset = 0, single_printf_offset = 0; + global_offset < dumped_buffer_size; global_offset += single_printf_offset) { // the first 4-bytes is the index of the format string @@ -820,7 +841,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (!success) { acl_print_debug_msg( "corrupt printf data, ignoring remaining printfs...\n"); - return; + return dumped_buffer_size; } #ifdef DEBUG @@ -883,7 +904,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (vector_size == -1) { acl_print_debug_msg("wrong vector specifier in printf call, ignoring " "remaining printfs...\n"); - return; + return dumped_buffer_size; } // get the length specifier @@ -904,7 +925,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, if (size_of_data == 0) { acl_print_debug_msg("wrong length modifier in printf call, ignoring " "remaining printfs...\n"); - return; + return dumped_buffer_size; } for (i = 0; i < vector_size; i++) { @@ -960,13 +981,14 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel, #ifdef DEBUG printf("exiting acl_dump_buffer...\n"); #endif + return dumped_buffer_size; } // // Schedule enqueue read buffer to read printf buffer // The activation ID is the device op ID. void acl_schedule_printf_buffer_pickup(int activation_id, int size, - int stalled) { + int debug_dump_printf) { acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); // This function can potentially be called by a HAL that does not use the @@ -980,17 +1002,17 @@ void acl_schedule_printf_buffer_pickup(int activation_id, int size, } #ifdef DEBUG - printf("printf pickup %d %d %d\n", activation_id, size, stalled); + printf("printf pickup %d %d\n", activation_id, size); fflush(stdout); #endif if (activation_id >= 0 && activation_id < doq->max_ops) { // This address is stable, given a fixed activation_id. // So we don't run into race conditions. acl_device_op_t *op = doq->op + activation_id; - - stalled = stalled; // this argument is no longer used! - op->info.num_printf_bytes_pending = (cl_uint)size; + + // Propagate the operation info + op->info.debug_dump_printf = debug_dump_printf ? 1 : 0; } // Signal all waiters. acl_signal_device_update(); @@ -1009,7 +1031,15 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) { // Grab the printf data and emit it. cl_uint num_bytes = op->info.num_printf_bytes_pending; - l_dump_printf_buffer(event, kernel, num_bytes); + size_t dumped_buffer_size = l_dump_printf_buffer(event, kernel, num_bytes); + + if (op->info.debug_dump_printf) { + // Update the already processed buffer size + kernel->processed_printf_buffer_size += dumped_buffer_size; + } else { + // Full dump, reset this variable + kernel->processed_printf_buffer_size = 0; + } // Mark this printf work as done. Must do this *before* unstalling // the kernel, to avoid a race against the kernel filling up the @@ -1021,8 +1051,13 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) { acl_memory_barrier(); // Allow the kernel to continue running. - acl_get_hal()->unstall_kernel( - event->cmd.info.ndrange_kernel.device->def.physical_device_id, op->id); + // We don't need to unstall the kernel during the early flushing during + // debug. + if (!op->info.debug_dump_printf) { + acl_get_hal()->unstall_kernel( + event->cmd.info.ndrange_kernel.device->def.physical_device_id, + op->id); + } } } @@ -1084,4 +1119,4 @@ static void decode_string(std::string &print_data) { #ifdef __GNUC__ #pragma GCC visibility pop -#endif +#endif \ No newline at end of file diff --git a/test/acl_device_op_test.cpp b/test/acl_device_op_test.cpp index 6a608bb0..e6c116fb 100644 --- a/test/acl_device_op_test.cpp +++ b/test/acl_device_op_test.cpp @@ -363,6 +363,7 @@ TEST(device_op, submit_action) { CHECK(l_launch_kernel == the_call); CHECK_EQUAL(0, unblocked); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, op->info.debug_dump_printf); CHECK(prev < (latest = op->timestamp[CL_SUBMITTED])); prev = latest; @@ -400,6 +401,7 @@ TEST(device_op, submit_action) { op->info.type = ACL_DEVICE_OP_KERNEL; op->status = op->execution_status = CL_RUNNING; // Required op->info.num_printf_bytes_pending = 1; + CHECK_EQUAL(0, op->info.debug_dump_printf); acl_submit_device_op(&m_doq, op); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind); CHECK_EQUAL(op->info.type, submit_kind); @@ -409,6 +411,7 @@ TEST(device_op, submit_action) { CHECK_EQUAL(1, unblocked); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as // stalled on printf. + CHECK_EQUAL(0, op->info.debug_dump_printf); // We didn't change the submit time. CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]); prev = latest; @@ -419,6 +422,7 @@ TEST(device_op, submit_action) { op->info.type = ACL_DEVICE_OP_KERNEL; op->status = op->execution_status = CL_COMPLETE; // Required op->info.num_printf_bytes_pending = 1; + CHECK_EQUAL(0, op->info.debug_dump_printf); acl_submit_device_op(&m_doq, op); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind); CHECK_EQUAL(op->info.type, submit_kind); @@ -428,6 +432,7 @@ TEST(device_op, submit_action) { CHECK_EQUAL(1, unblocked); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as // stalled on printf. + CHECK_EQUAL(0, op->info.debug_dump_printf); // We didn't change the submit time. CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]); prev = latest; diff --git a/test/acl_kernel_test.cpp b/test/acl_kernel_test.cpp index 9d3ec4df..c29272f6 100644 --- a/test/acl_kernel_test.cpp +++ b/test/acl_kernel_test.cpp @@ -4483,7 +4483,9 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, op->info.debug_dump_printf); + // Testing debug_printf_dump // Pretend the kernel actually put something in the device buffer. CHECK(k->printf_device_buffer); cl_mem printf_buf = k->printf_device_buffer; @@ -4495,16 +4497,13 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { const unsigned int printf_bytes = 8; *((int *)printf_buf->block_allocation->range.begin) = printf_conversion_index; *(((int *)printf_buf->block_allocation->range.begin) + 1) = printf_data; - // Check operation of printf-pickup-scheduler call back function. // Activation id should be the device op id! - CHECK_EQUAL(0, op->info.num_printf_bytes_pending); ACL_LOCKED(acl_schedule_printf_buffer_pickup(op->id, printf_bytes, - 0 /*ignored argument*/)); + 1 /*Debug printf dump*/)); CHECK_EQUAL( printf_bytes, acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending); - ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(10, m_devlog.num_ops); @@ -4515,14 +4514,48 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(printf_bytes, op->info.num_printf_bytes_pending); + CHECK_EQUAL(1, op->info.debug_dump_printf); op = &(m_devlog.after[9]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // should have cleared it. + CHECK_EQUAL(1, op->info.debug_dump_printf); + CHECK_EQUAL(printf_bytes, k->processed_printf_buffer_size); + ACL_LOCKED(acl_idle_update(m_context)); + + // Testing normal printf dump + *(((int *)printf_buf->block_allocation->range.begin) + 2) = printf_data; + // Check operation of printf-pickup-scheduler call back function. + // Activation id should be the device op id! + // Now we have two printf_data in the buffer, therefore the size is doubled. + ACL_LOCKED(acl_schedule_printf_buffer_pickup(op->id, printf_bytes * 2, + 0 /*Not debug printf dump*/)); + CHECK_EQUAL( + printf_bytes * 2, + acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending); + ACL_LOCKED(acl_idle_update(m_context)); + + CHECK_EQUAL(11, m_devlog.num_ops); + + op = &(m_devlog.before[10]); + CHECK_EQUAL(acl_platform.device_op_queue.op + op->id, ke->current_device_op); + CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); + CHECK_EQUAL(CL_RUNNING, op->status); + CHECK_EQUAL(ke, op->info.event); + CHECK_EQUAL(printf_bytes * 2, op->info.num_printf_bytes_pending); + op = &(m_devlog.after[10]); + CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); + CHECK_EQUAL(CL_RUNNING, op->status); + CHECK_EQUAL(ke, op->info.event); + CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // should have cleared it. + CHECK_EQUAL(0, op->info.debug_dump_printf); + CHECK_EQUAL(0, k->processed_printf_buffer_size); // A full dump should reset + // this variable to 0 ACL_LOCKED(acl_idle_update(m_context)); + // Testing normal printf dump end // Say it's complete, but with some printf stuff to clean up. ACL_LOCKED(acl_receive_kernel_update(op->id, CL_COMPLETE)); @@ -4532,34 +4565,39 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { // Set printf bytes which should be picked up acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending = printf_bytes; + CHECK_EQUAL(0, op->info.debug_dump_printf); ACL_LOCKED(acl_idle_update(m_context)); // bump scheduler CHECK_EQUAL(CL_COMPLETE, l_find_op(op->id)->execution_status); CHECK_EQUAL(CL_COMPLETE, l_find_op(op->id)->status); // Now it's copied up. - CHECK_EQUAL(12, m_devlog.num_ops); + CHECK_EQUAL(13, m_devlog.num_ops); // #10 is the printf flush. - op = &(m_devlog.before[10]); + op = &(m_devlog.before[11]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(CL_COMPLETE, op->execution_status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(printf_bytes, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, op->info.debug_dump_printf); - op = &(m_devlog.after[10]); + op = &(m_devlog.after[11]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(CL_COMPLETE, op->execution_status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // should have cleared it. + CHECK_EQUAL(0, op->info.debug_dump_printf); // #11 is the completion. - op = &(m_devlog.before[11]); + op = &(m_devlog.before[12]); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_COMPLETE, op->status); CHECK_EQUAL(ke, op->info.event); CHECK_EQUAL(0, op->info.num_printf_bytes_pending); + CHECK_EQUAL(0, k->processed_printf_buffer_size); + CHECK_EQUAL(0, op->info.debug_dump_printf); CHECK_EQUAL(0, ke->current_device_op); // DONE!