@@ -1369,7 +1369,7 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
1369
1369
" :: Calling acl_process_printf_buffer_fn with "
1370
1370
" activation_id=%d and printf_size=%u.\n " ,
1371
1371
activation_id, printf_size);
1372
- // update status, which will dump the printf buffer, set debug_dump = 0
1372
+ // update status, which will dump the printf buffer, set debug_dump_printf = 0
1373
1373
acl_process_printf_buffer_fn (activation_id, (int )printf_size, 0 );
1374
1374
1375
1375
ACL_KERNEL_IF_DEBUG_MSG (
@@ -1378,7 +1378,6 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
1378
1378
1379
1379
acl_kernel_cra_write (kern, k, KERNEL_OFFSET_CSR, new_csr);
1380
1380
continue ;
1381
- // ZIBAI TODO FIGURE OUT: Why Doesn't printf buffer need to be cleared, after this? Probably Handled outside of runtime?
1382
1381
}
1383
1382
}
1384
1383
@@ -1433,7 +1432,6 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
1433
1432
kern->accel_job_ids [k][next_queue_back] = -1 ;
1434
1433
1435
1434
#ifdef TEST_PROFILING_HARDWARE
1436
- // ZIBAI TODO: What is this doing?
1437
1435
// Test readback of fake profile data using the acl_hal_mmd function that
1438
1436
// would be called from the acl runtime.
1439
1437
ACL_KERNEL_IF_DEBUG_MSG (
@@ -1496,12 +1494,41 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
1496
1494
}
1497
1495
}
1498
1496
1497
+ void acl_kernel_if_debug_dump_printf (acl_kernel_if *kern, unsigned k){ // Debug dump of accelerator k's printf buffer: reads the buffer size and passes it to acl_process_printf_buffer_fn with the last argument set to 1. Does nothing unless kern->accel_num_printfs[k] > 0.
1498
+ acl_assert_locked ();
1499
+ unsigned int printf_size = 0 ; // Only used inside the branch below, after its address is handed to acl_kernel_cra_read; 0 is a defensive default (presumably always overwritten by that read - TODO confirm).
1500
+ int activation_id;
1501
+ unsigned int next_queue_back;
1502
+
1503
+ if (kern->accel_queue_back [k] == (int )kern->accel_invoc_queue_depth [k] - 1 )
1504
+ next_queue_back = 0 ; // accel_queue_back[k] is the last slot, so the following slot wraps to index 0
1505
+ else
1506
+ next_queue_back = kern->accel_queue_back [k] + 1 ; // otherwise the slot right after accel_queue_back[k]
1507
+
1508
+ if (kern->accel_num_printfs [k] > 0 ) { // skip everything when this accelerator has no printfs recorded
1509
+ acl_kernel_cra_read (kern, k, KERNEL_OFFSET_PRINTF_BUFFER_SIZE,
1510
+ &printf_size);
1511
+ assert (printf_size <= ACL_PRINTF_BUFFER_TOTAL_SIZE); // the size that was read must not exceed ACL_PRINTF_BUFFER_TOTAL_SIZE
1512
+ ACL_KERNEL_IF_DEBUG_MSG (kern,
1513
+ " :: Accelerator %d printf buffer size is %d.\n " ,
1514
+ k, printf_size);
1515
+ activation_id = kern->accel_job_ids [k][next_queue_back]; // job id stored in the slot computed above
1516
+ ACL_KERNEL_IF_DEBUG_MSG (kern,
1517
+ " :: Calling acl_process_printf_buffer_fn with "
1518
+ " activation_id=%d and printf_size=%u.\n " ,
1519
+ activation_id, printf_size);
1520
+
1521
+ // Third argument is the debug_dump_printf flag; this debug-dump path passes 1.
1522
+ acl_process_printf_buffer_fn (activation_id, (int )printf_size, 1 );
1523
+ }
1524
+
1525
+ }
1526
+
1499
1527
void acl_kernel_if_dump_status (acl_kernel_if *kern) {
1500
1528
int expect_kernel = 0 ;
1501
1529
unsigned k, i;
1502
1530
acl_assert_locked ();
1503
1531
1504
- kern->io .printf (" Zibai Added, kernel may be hung?\n " );
1505
1532
for (k = 0 ; k < kern->num_accel ; ++k) {
1506
1533
for (i = 0 ; i < kern->accel_invoc_queue_depth [k]; ++i) {
1507
1534
if (kern->accel_job_ids [k][i] >= 0 ) {
@@ -1513,10 +1540,6 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
1513
1540
if (!expect_kernel)
1514
1541
return ;
1515
1542
1516
- kern->io .printf (" No kernel updates in approximately 10 seconds for device %u" ,
1517
- kern->physical_device_id );
1518
- kern->io .printf (" ... a kernel may be hung?\n " );
1519
-
1520
1543
for (k = 0 ; k < kern->num_accel ; ++k) {
1521
1544
unsigned int csr;
1522
1545
@@ -1542,37 +1565,8 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
1542
1565
if (ACL_KERNEL_READ_BIT (csr, KERNEL_CSR_LMEM_INVALID_BANK))
1543
1566
kern->io .printf (" lm_bank_exception" );
1544
1567
1545
- // Testing start Zibai
1546
- unsigned int printf_size = 0 ; // Do I have to initialize it to 0?
1547
- int activation_id;
1548
-
1549
- int next_queue_back2;
1550
-
1551
- if (kern->accel_queue_back [k] == (int )kern->accel_invoc_queue_depth [k] - 1 )
1552
- next_queue_back2 = 0 ;
1553
- else
1554
- next_queue_back2 = kern->accel_queue_back [k] + 1 ;
1555
-
1556
- if (kern->accel_num_printfs [k] > 0 ) {
1557
- acl_kernel_cra_read (kern, k, KERNEL_OFFSET_PRINTF_BUFFER_SIZE,
1558
- &printf_size);
1559
- assert (printf_size <= ACL_PRINTF_BUFFER_TOTAL_SIZE);
1560
- ACL_KERNEL_IF_DEBUG_MSG (kern,
1561
- " :: Accelerator %d printf buffer size is %d.\n " ,
1562
- k, printf_size);
1563
- activation_id = kern->accel_job_ids [k][next_queue_back2];
1564
- ACL_KERNEL_IF_DEBUG_MSG (kern,
1565
- " :: Calling acl_process_printf_buffer_fn with "
1566
- " activation_id=%d and printf_size=%u.\n " ,
1567
- activation_id, printf_size);
1568
-
1569
- // set debug_dump to 1
1570
- acl_process_printf_buffer_fn (activation_id, (int )printf_size, 1 );
1571
- // acl_process_printf_buffer_fn(activation_id, 64, 0);
1572
- }
1573
-
1574
- // Testing end Zibai
1575
- // Zibai TODO: What is the below code this doing?
1568
+ // Dump the printf buffer to stdout
1569
+ acl_kernel_if_debug_dump_printf (kern, k);
1576
1570
1577
1571
unsigned buffered_kernel_invocation = 0 ;
1578
1572
for (i = 0 ; i < kern->accel_invoc_queue_depth [k]; ++i) {
@@ -1611,15 +1605,18 @@ void acl_kernel_if_check_kernel_status(acl_kernel_if *kern) {
1611
1605
#endif
1612
1606
acl_assert_locked ();
1613
1607
1614
- // Print kernel status if it hasn't done anything in a while
1615
- // If multiple thread calls this, only one will print every 10 seconds
1616
- kern->io .printf (" Zibai Added, Are we calling acl_kernel_if_dump_status? \n " );
1617
1608
if (kern->last_kern_update != 0 &&
1618
- (acl_kernel_if_get_time_us (kern) - kern->last_kern_update >
1619
- 10 * 1000000 )) {
1620
- kern->last_kern_update = acl_kernel_if_get_time_us (kern);
1621
- if (kern->io .debug_verbosity > 0 )
1622
- kern->io .printf (" Zibai Added, calling acl_kernel_if_dump_status! \n " );
1609
+ (acl_kernel_if_get_time_us (kern) - kern->last_kern_update >
1610
+ 10 * 1000000 )) {
1611
+ kern->last_kern_update = acl_kernel_if_get_time_us (kern);
1612
+ kern->io .printf (" No kernel updates in approximately 10 seconds for device %u" ,
1613
+ kern->physical_device_id );
1614
+ kern->io .printf (" ... a kernel may be hung?\n " );
1615
+ acl_kernel_if_dump_status (kern);
1616
+ } else if (kern->io .debug_verbosity >= 3 ) {
1617
+ // If ACL_HAL_DEBUG >= 3, the status will be printed even if the server isn't hung.
1618
+ // If there are spare cycles, it will be called at most every 5 seconds to dump the status and flush the printf buffer
1619
+ // 5 seconds is configured in acl_thread->acl_wait_for_device_update()
1623
1620
acl_kernel_if_dump_status (kern);
1624
1621
}
1625
1622
0 commit comments