Skip to content

Commit b114f9c

Browse files
perheld and Martin Lindström authored
Arm backend: Enable multiple inferences in executor runner (#13177)
The ET_NUM_INFERENCES macro can now be set to specify a number of inferences to run with Arm executor runner. The StopMeasurements function is also given a new argument, int num_inferences, in order to display data per inference. Signed-off-by: [email protected] Co-authored-by: Martin Lindström <[email protected]>
1 parent 22284e5 commit b114f9c

File tree

4 files changed

+98
-46
lines changed

4 files changed

+98
-46
lines changed

examples/arm/executor_runner/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ option(ET_DUMP_OUTPUT "Dump output in log" ON)
2020
option(FETCH_ETHOS_U_CONTENT
2121
"Fetch ethos_u dependencies instead of relying on pre-downloads" ON
2222
)
23+
set(ET_NUM_INFERENCES
24+
"1"
25+
CACHE STRING "Number of inferences to run"
26+
)
2327

2428
if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
2529
message(
@@ -77,6 +81,7 @@ set(MEMORY_MODE
7781

7882
message(STATUS "SYSTEM_CONFIG is ${SYSTEM_CONFIG}")
7983
message(STATUS "MEMORY_MODE is ${MEMORY_MODE}")
84+
message(STATUS "ET_NUM_INFERENCES is ${ET_NUM_INFERENCES}")
8085

8186
get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH)
8287
get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH)
@@ -255,6 +260,12 @@ if(ET_DUMP_OUTPUT)
255260
target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT)
256261
endif()
257262

263+
if(ET_NUM_INFERENCES)
264+
target_compile_definitions(
265+
arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES}
266+
)
267+
endif()
268+
258269
# Fixup compilation of retarget.c
259270
if(SEMIHOSTING)
260271
# Remove this when MLBEDSW-8910 is closed.

examples/arm/executor_runner/arm_executor_runner.cpp

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,12 @@ const float et_rtol = 0.01;
130130

131131
#endif
132132

133+
// Number of times run_model() executes the loaded method back to back.
// Taken from the ET_NUM_INFERENCES compile definition when the build provides
// it, otherwise a single inference is run.
#if defined(ET_NUM_INFERENCES)
constexpr int num_inferences = ET_NUM_INFERENCES;
#else
constexpr int num_inferences = 1;
#endif

// A count below one would make run_model() skip execution entirely and would
// make the per-inference statistics meaningless, so reject it at build time.
static_assert(num_inferences >= 1, "ET_NUM_INFERENCES must be at least 1");
138+
133139
/**
134140
* The temp_allocation_pool is used for allocating temporary data during kernel
135141
* or delegate execution. This will be reset after each kernel or delegate call.
@@ -638,21 +644,6 @@ void runner_init(
638644
ET_LOG(Info, "Input prepared.");
639645
}
640646

641-
void run_model(RunnerContext& ctx) {
642-
ET_LOG(Info, "Starting the model execution...");
643-
644-
StartMeasurements();
645-
// Run the model.
646-
Error status = ctx.method.value()->execute();
647-
StopMeasurements();
648-
649-
ET_CHECK_MSG(
650-
status == Error::Ok,
651-
"Execution of method %s failed with status 0x%" PRIx32,
652-
ctx.method_name,
653-
status);
654-
}
655-
656647
void log_mem_status(const RunnerContext& ctx) {
657648
size_t executor_memsize =
658649
ctx.method_allocator->used_size() - ctx.executor_membase;
@@ -853,6 +844,32 @@ void verify_result(RunnerContext& ctx, const void* model_pte) {
853844
#endif
854845
}
855846

847+
/**
 * Executes the loaded method num_inferences times, reports performance
 * measurements, then prints and verifies the outputs.
 *
 * Execution stops at the first inference that does not return Error::Ok; the
 * failure is then reported through ET_CHECK_MSG after the measurements have
 * been stopped.
 *
 * @param ctx Runner context holding the method to execute.
 * @param model_pte Pointer to the model data, forwarded to verify_result().
 */
void run_model(RunnerContext& ctx, const void* model_pte) {
  // Initialised so the check below is well defined even if no inference runs.
  Error status = Error::Ok;
  ET_LOG(Info, "Starting running %d inferences...", num_inferences);

  // Counts the inferences actually started. Unlike a for-loop index that
  // overshoots by one on normal termination, this equals num_inferences after
  // a fully successful run, so the per-inference averages are correct.
  int inferences_run = 0;
  StartMeasurements();
  while (inferences_run < num_inferences) {
    // Run the model.
    status = ctx.method.value()->execute();
    ++inferences_run;
    if (status != Error::Ok) {
      break;
    }
  }
  StopMeasurements(inferences_run);

  ET_CHECK_MSG(
      status == Error::Ok,
      "Execution of method %s failed with status 0x%" PRIx32,
      ctx.method_name,
      status);

  ET_LOG(Info, "%d inferences finished", num_inferences);
  print_outputs(ctx);
  verify_result(ctx, model_pte);
}
872+
856873
} // namespace
857874

858875
int main(int argc, const char* argv[]) {
@@ -934,11 +951,9 @@ int main(int argc, const char* argv[]) {
934951
Info, "PTE in %p %c Size: %lu bytes", model_pte, model_pte[0], pte_size);
935952

936953
runner_init(ctx, input_buffers, pte_size);
937-
run_model(ctx);
954+
run_model(ctx, model_pte);
938955
log_mem_status(ctx);
939-
print_outputs(ctx);
940956
write_etdump(ctx);
941-
verify_result(ctx, model_pte);
942957

943958
ET_LOG(Info, "Program complete, exiting.");
944959
#if defined(SEMIHOSTING)

examples/arm/executor_runner/arm_perf_monitor.cpp

Lines changed: 52 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
* LICENSE file in the root directory of this source tree.
55
*/
66

7+
#include <array>
78
#include <cinttypes>
8-
#include <vector>
99

1010
#include "arm_perf_monitor.h"
1111

@@ -14,29 +14,31 @@
1414
#include <executorch/runtime/platform/log.h>
1515
#include <pmu_ethosu.h>
1616

17-
static uint32_t ethosu_inference_count = 0;
18-
static uint64_t ethosu_ArmCycleCountStart = 0;
19-
static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0;
20-
static uint64_t ethosu_ArmBackendExecuteCycleCount = 0;
21-
static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0;
22-
static uint64_t ethosu_ArmWhenNPURunCycleCount = 0;
23-
static uint64_t ethosu_pmuCycleCount = 0;
24-
static std::vector<uint64_t> ethosu_pmuEventCounts(
25-
ETHOSU_PMU_Get_NumEventCounters(),
26-
0);
17+
namespace {
2718

2819
#if defined(ETHOSU55) || defined(ETHOSU65)
29-
static const uint32_t ethosu_pmuCountersUsed = 4;
20+
const uint32_t ethosu_pmuCountersUsed = 4;
3021
#elif defined(ETHOSU85)
31-
static const uint32_t ethosu_pmuCountersUsed = 5;
22+
const uint32_t ethosu_pmuCountersUsed = 5;
3223
#else
3324
#error No NPU target defined
3425
#endif
3526

27+
uint32_t ethosu_delegation_count = 0;
28+
uint64_t ethosu_ArmCycleCountStart = 0;
29+
uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0;
30+
uint64_t ethosu_ArmBackendExecuteCycleCount = 0;
31+
uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0;
32+
uint64_t ethosu_ArmWhenNPURunCycleCount = 0;
33+
uint64_t ethosu_pmuCycleCount = 0;
34+
std::array<uint64_t, ethosu_pmuCountersUsed> ethosu_pmuEventCounts = {0};
35+
3636
// ethosu_pmuCountersUsed should match numbers of counters setup in
3737
// ethosu_inference_begin() and not be more then the HW supports
3838
static_assert(ETHOSU_PMU_NCOUNTERS >= ethosu_pmuCountersUsed);
3939

40+
} // namespace
41+
4042
extern "C" {
4143

4244
// Callback invoked at start of NPU execution
@@ -85,7 +87,7 @@ void ethosu_inference_begin(struct ethosu_driver* drv, void*) {
8587

8688
// Callback invoked at end of NPU execution
8789
void ethosu_inference_end(struct ethosu_driver* drv, void*) {
88-
ethosu_inference_count++;
90+
ethosu_delegation_count++;
8991
ethosu_pmuCycleCount += ETHOSU_PMU_Get_CCNTR(drv);
9092

9193
for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) {
@@ -113,6 +115,7 @@ void EthosUBackend_execute_end() {
113115
}
114116

115117
void StartMeasurements() {
118+
ethosu_delegation_count = 0;
116119
ethosu_ArmBackendExecuteCycleCount = 0;
117120
ethosu_ArmWhenNPURunCycleCount = 0;
118121
ethosu_pmuCycleCount = 0;
@@ -123,32 +126,43 @@ void StartMeasurements() {
123126
ethosu_ArmCycleCountStart = ARM_PMU_Get_CCNTR();
124127
}
125128

126-
void StopMeasurements() {
129+
void StopMeasurements(int num_inferences) {
127130
ARM_PMU_CNTR_Disable(
128131
PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk |
129132
PMU_CNTENCLR_CNT1_ENABLE_Msk);
130133
uint32_t cycle_count = ARM_PMU_Get_CCNTR() - ethosu_ArmCycleCountStart;
131134

132135
// Number of command streams handled by the NPU
133-
ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count);
136+
ET_LOG(Info, "NPU Inferences : %d", num_inferences);
137+
ET_LOG(
138+
Info,
139+
"NPU delegations: %d (%.2f per inference)",
140+
ethosu_delegation_count,
141+
(double)ethosu_delegation_count / num_inferences);
134142
ET_LOG(Info, "Profiler report, CPU cycles per operator:");
135143
// This is number of CPU cycles for the ethos-u operator from start to finish
136144
// in the framework. If there is more than one command stream, the time is added
137145
// together
138146
// ethosu_ArmBackendExecuteCycleCount is a uint64_t, so it must be printed with
// PRIu64; passing it to %d is undefined behavior in a varargs call.
ET_LOG(
    Info,
    "ethos-u : cycle_cnt : %" PRIu64 " cycles (%.2f per inference)",
    ethosu_ArmBackendExecuteCycleCount,
    (double)ethosu_ArmBackendExecuteCycleCount / num_inferences);
142151
// We could print a list of the cycles used by the other delegates here in the
143152
// future but now we only print ethos-u: this means that "Operator(s) total:
144153
// ..." will be the same number as ethos-u : cycle_cnt and not the sum of all
145154
// The total is a uint64_t, so it is printed with PRIu64 rather than %d to keep
// the format string and the argument types in agreement.
ET_LOG(
    Info,
    "Operator(s) total: %" PRIu64 " CPU cycles (%.2f per inference)",
    ethosu_ArmBackendExecuteCycleCount,
    (double)ethosu_ArmBackendExecuteCycleCount / num_inferences);
149159
// Total CPU cycles used in the executorch method->execute()
150160
// Other delegates and no delegates are counted in this
151-
ET_LOG(Info, "Inference runtime: %d CPU cycles total", cycle_count);
161+
ET_LOG(
162+
Info,
163+
"Inference runtime: %d CPU cycles total (%.2f per inference)",
164+
cycle_count,
165+
(double)cycle_count / num_inferences);
152166

153167
ET_LOG(
154168
Info,
@@ -174,14 +188,24 @@ void StopMeasurements() {
174188
// If there is more than one command stream, the time is added together
175189
ET_LOG(
176190
Info,
177-
"cpu_wait_for_npu_cntr : %" PRIu64 " CPU cycles",
178-
ethosu_ArmWhenNPURunCycleCount);
191+
"cpu_wait_for_npu_cntr : %" PRIu64 " CPU cycles (%.2f per inference)",
192+
ethosu_ArmWhenNPURunCycleCount,
193+
(double)ethosu_ArmWhenNPURunCycleCount / num_inferences);
179194

180195
ET_LOG(Info, "Ethos-U PMU report:");
181-
ET_LOG(Info, "ethosu_pmu_cycle_cntr : %" PRIu64, ethosu_pmuCycleCount);
196+
// Accumulated NPU cycle counter, total and averaged per inference. The
// conversion is written "%" PRIu64 with no space: a space after '%' is a
// printf flag that only applies to signed conversions.
ET_LOG(
    Info,
    "ethosu_pmu_cycle_cntr : %" PRIu64 " (%.2f per inference)",
    ethosu_pmuCycleCount,
    (double)ethosu_pmuCycleCount / num_inferences);
182201

183202
for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) {
184-
ET_LOG(Info, "ethosu_pmu_cntr%zd : %" PRIu64, i, ethosu_pmuEventCounts[i]);
203+
ET_LOG(
204+
Info,
205+
"ethosu_pmu_cntr%zd : %" PRIu64 " (%.2f per inference)",
206+
i,
207+
ethosu_pmuEventCounts[i],
208+
(double)ethosu_pmuEventCounts[i] / num_inferences);
185209
}
186210
#if defined(ETHOSU55) || defined(ETHOSU65)
187211
ET_LOG(
@@ -199,6 +223,8 @@ void StopMeasurements() {
199223
#else
200224
// No-op variant of StartMeasurements() compiled in the #else branch of this
// file's build-time switch (the switch condition is outside this view).
void StartMeasurements() {}
201225

202-
void StopMeasurements() {}
226+
// No-op variant of StopMeasurements() for the #else build branch. The
// inference count is accepted to match the declared signature and is unused.
void StopMeasurements([[maybe_unused]] int num_inferences) {}
203229

204230
#endif
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright 2024 Arm Limited and/or its affiliates.
1+
/* Copyright 2024-2025 Arm Limited and/or its affiliates.
22
*
33
* This source code is licensed under the BSD-style license found in the
44
* LICENSE file in the root directory of this source tree.
@@ -7,4 +7,4 @@
77
#pragma once
88

99
void StartMeasurements();
10-
void StopMeasurements();
10+
void StopMeasurements(int num_inferences);

0 commit comments

Comments
 (0)