Skip to content

Commit 97f508b

Browse files
authored
[DevSAN] Set maximum supported local/private shadow memory size (#19465)
If the number of work-groups is too large, local/private shadow memory consumes a lot of device memory and can easily cause out-of-resource errors, so this commit sets an upper limit on it. This commit also includes the following changes: 1. Allocate the private base/shadow per sub-group instead of per work-group on GPU devices. 2. Refine the algorithm that calculates the sub-group linear ID.
1 parent d0279af commit 97f508b

File tree

14 files changed

+111
-66
lines changed

14 files changed

+111
-66
lines changed

libdevice/include/sanitizer_utils.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ static inline size_t LocalLinearId() {
2828

2929
// For GPU device, each sub group is a hardware thread
3030
inline size_t SubGroupLinearId() {
31-
return __spirv_BuiltInGlobalLinearId / __spirv_BuiltInSubgroupSize;
31+
return WorkGroupLinearId() * __spirv_BuiltInNumSubgroups +
32+
__spirv_BuiltInSubgroupId;
3233
}
3334

3435
inline void SubGroupBarrier() {

libdevice/sanitizer/asan_rtl.cpp

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ static const __SYCL_CONSTANT__ char __local_shadow_out_of_bound[] =
2626
"[kernel] Local shadow memory out-of-bound (ptr: %p -> %p, wid: %llu, "
2727
"base: %p)\n";
2828
static const __SYCL_CONSTANT__ char __private_shadow_out_of_bound[] =
29-
"[kernel] Private shadow memory out-of-bound (ptr: %p -> %p, wid: %llu, "
30-
"sid: %llu, base: %p)\n";
29+
"[kernel] Private shadow memory out-of-bound (ptr: %p -> %p, sid: %llu, "
30+
"base: %p)\n";
3131

3232
static const __SYCL_CONSTANT__ char __asan_print_unsupport_device_type[] =
3333
"[kernel] Unsupport device type: %d\n";
@@ -126,13 +126,13 @@ inline uptr MemToShadow_DG2(uptr addr, uint32_t as,
126126
return shadow_ptr;
127127
} else if (as == ADDRESS_SPACE_LOCAL) { // local
128128
const auto shadow_offset = launch_info->LocalShadowOffset;
129-
if (shadow_offset == 0) {
129+
const size_t wid = WorkGroupLinearId();
130+
if (shadow_offset == 0 || wid >= ASAN_MAX_WG_LOCAL) {
130131
return 0;
131132
}
132133

133134
// The size of SLM is 64KB on DG2
134135
constexpr unsigned slm_size = 64 * 1024;
135-
const size_t wid = WorkGroupLinearId();
136136

137137
auto shadow_ptr = shadow_offset + ((wid * slm_size) >> ASAN_SHADOW_SCALE) +
138138
((addr & (slm_size - 1)) >> ASAN_SHADOW_SCALE);
@@ -146,12 +146,11 @@ inline uptr MemToShadow_DG2(uptr addr, uint32_t as,
146146
return shadow_ptr;
147147
} else if (as == ADDRESS_SPACE_PRIVATE) { // private
148148
const auto shadow_offset = launch_info->PrivateShadowOffset;
149-
if (shadow_offset == 0) {
149+
const size_t sid = SubGroupLinearId();
150+
if (shadow_offset == 0 || sid >= ASAN_MAX_SG_PRIVATE) {
150151
return 0;
151152
}
152153

153-
const auto wid = WorkGroupLinearId();
154-
const size_t sid = SubGroupLinearId();
155154
const uptr private_base = launch_info->PrivateBase[sid];
156155

157156
// FIXME: The recorded private_base may not be the most bottom one,
@@ -161,13 +160,13 @@ inline uptr MemToShadow_DG2(uptr addr, uint32_t as,
161160
}
162161

163162
uptr shadow_ptr = shadow_offset +
164-
((wid * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE) +
163+
((sid * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE) +
165164
((addr - private_base) >> ASAN_SHADOW_SCALE);
166165

167166
const auto shadow_offset_end = launch_info->PrivateShadowOffsetEnd;
168167
if (shadow_ptr > shadow_offset_end) {
169-
__spirv_ocl_printf(__private_shadow_out_of_bound, addr, shadow_ptr, wid,
170-
sid, private_base);
168+
__spirv_ocl_printf(__private_shadow_out_of_bound, addr, shadow_ptr, sid,
169+
private_base);
171170
return 0;
172171
};
173172

@@ -224,12 +223,11 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as,
224223
return shadow_ptr;
225224
} else if (as == ADDRESS_SPACE_PRIVATE) { // private
226225
const auto shadow_offset = launch_info->PrivateShadowOffset;
227-
if (shadow_offset == 0) {
226+
const size_t sid = SubGroupLinearId();
227+
if (shadow_offset == 0 || sid >= ASAN_MAX_SG_PRIVATE) {
228228
return 0;
229229
}
230230

231-
const size_t wid = WorkGroupLinearId();
232-
const size_t sid = SubGroupLinearId();
233231
const uptr private_base = launch_info->PrivateBase[sid];
234232

235233
// FIXME: The recorded private_base may not be the most bottom one,
@@ -239,13 +237,13 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as,
239237
}
240238

241239
uptr shadow_ptr = shadow_offset +
242-
((wid * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE) +
240+
((sid * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE) +
243241
((addr - private_base) >> ASAN_SHADOW_SCALE);
244242

245243
const auto shadow_offset_end = launch_info->PrivateShadowOffsetEnd;
246244
if (shadow_ptr > shadow_offset_end) {
247-
__spirv_ocl_printf(__private_shadow_out_of_bound, addr, shadow_ptr, wid,
248-
sid, private_base);
245+
__spirv_ocl_printf(__private_shadow_out_of_bound, addr, shadow_ptr, sid,
246+
private_base);
249247
return 0;
250248
};
251249

@@ -881,12 +879,12 @@ static __SYCL_CONSTANT__ const char __asan_print_private_base[] =
881879
DEVICE_EXTERN_C_NOINLINE void
882880
__asan_set_private_base(__SYCL_PRIVATE__ void *ptr) {
883881
auto launch_info = (__SYCL_GLOBAL__ const AsanRuntimeData *)__AsanLaunchInfo;
884-
if (!launch_info || launch_info->PrivateShadowOffset == 0 ||
885-
launch_info->PrivateBase == 0)
882+
const size_t sid = SubGroupLinearId();
883+
if (!launch_info || sid >= ASAN_MAX_SG_PRIVATE ||
884+
launch_info->PrivateShadowOffset == 0 || launch_info->PrivateBase == 0)
886885
return;
887886
// Only set on the first sub-group item
888887
if (__spirv_BuiltInSubgroupLocalInvocationId == 0) {
889-
const size_t sid = SubGroupLinearId();
890888
launch_info->PrivateBase[sid] = (uptr)ptr;
891889
ASAN_DEBUG(__spirv_ocl_printf(__asan_print_private_base, sid, ptr));
892890
}

libdevice/sanitizer/msan_rtl.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ const __SYCL_CONSTANT__ char __msan_print_func_end[] =
4646
"[kernel] ===== END %s()\n";
4747

4848
const __SYCL_CONSTANT__ char __msan_print_private_shadow_out_of_bound[] =
49-
"[kernel] Private shadow memory out-of-bound(ptr: %p -> %p, wid: %llu, "
49+
"[kernel] Private shadow memory out-of-bound(ptr: %p -> %p, "
5050
"sid: %llu, base: "
5151
"%p)\n";
5252

@@ -168,17 +168,16 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as) {
168168
shadow_base;
169169
} else if (as == ADDRESS_SPACE_LOCAL) {
170170
const auto shadow_offset = GetMsanLaunchInfo->LocalShadowOffset;
171-
if (shadow_offset != 0) {
171+
const size_t wid = WorkGroupLinearId();
172+
if (shadow_offset != 0 && wid < MSAN_MAX_WG_LOCAL) {
172173
// The size of SLM is 128KB on PVC
173174
constexpr unsigned SLM_SIZE = 128 * 1024;
174-
const size_t wid = WorkGroupLinearId();
175175
return shadow_offset + (wid * SLM_SIZE) + (addr & (SLM_SIZE - 1));
176176
}
177177
} else if (as == ADDRESS_SPACE_PRIVATE) {
178178
const auto shadow_offset = GetMsanLaunchInfo->PrivateShadowOffset;
179-
if (shadow_offset != 0) {
180-
const size_t wid = WorkGroupLinearId();
181-
const size_t sid = SubGroupLinearId();
179+
const size_t sid = SubGroupLinearId();
180+
if (shadow_offset != 0 && sid < MSAN_MAX_SG_PRIVATE) {
182181
const uptr private_base = GetMsanLaunchInfo->PrivateBase[sid];
183182

184183
// FIXME: The recorded private_base may not be the most bottom one,
@@ -188,12 +187,12 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as) {
188187
}
189188

190189
uptr shadow_ptr =
191-
shadow_offset + (wid * MSAN_PRIVATE_SIZE) + (addr - private_base);
190+
shadow_offset + (sid * MSAN_PRIVATE_SIZE) + (addr - private_base);
192191

193192
const auto shadow_offset_end = GetMsanLaunchInfo->PrivateShadowOffsetEnd;
194193
if (shadow_ptr > shadow_offset_end) {
195194
__spirv_ocl_printf(__msan_print_private_shadow_out_of_bound, addr,
196-
shadow_ptr, wid, sid, private_base);
195+
shadow_ptr, sid, private_base);
197196
return GetMsanLaunchInfo->CleanShadow;
198197
};
199198

@@ -717,12 +716,13 @@ static __SYCL_CONSTANT__ const char __msan_print_private_base[] =
717716

718717
DEVICE_EXTERN_C_NOINLINE void
719718
__msan_set_private_base(__SYCL_PRIVATE__ void *ptr) {
720-
if (!GetMsanLaunchInfo || GetMsanLaunchInfo->PrivateShadowOffset == 0 ||
719+
const size_t sid = SubGroupLinearId();
720+
if (!GetMsanLaunchInfo || sid >= MSAN_MAX_SG_PRIVATE ||
721+
GetMsanLaunchInfo->PrivateShadowOffset == 0 ||
721722
GetMsanLaunchInfo->PrivateBase == 0)
722723
return;
723724
// Only set on the first sub-group item
724725
if (__spirv_BuiltInSubgroupLocalInvocationId == 0) {
725-
const size_t sid = SubGroupLinearId();
726726
GetMsanLaunchInfo->PrivateBase[sid] = (uptr)ptr;
727727
MSAN_DEBUG(__spirv_ocl_printf(__msan_print_private_base, sid, ptr));
728728
}

libdevice/spirv_vars.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ __SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInNumWorkgroups;
3939
__SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInWorkgroupId;
4040
__SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInWorkgroupSize;
4141

42+
__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInNumSubgroups;
43+
__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupId;
4244
__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
4345

4446
__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;

unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -807,11 +807,14 @@ ur_result_t AsanInterceptor::prepareLaunch(
807807
LocalWorkSize[Dim];
808808
}
809809

810-
uint64_t NumWI = 1;
810+
uint64_t NumWILocal = 1;
811811
for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
812-
NumWI *= LaunchInfo.GlobalWorkSize[Dim];
812+
NumWILocal *= LocalWorkSize[Dim];
813813
}
814814

815+
size_t SGSize = GetSubGroupSize(Kernel, DeviceInfo->Handle);
816+
uint32_t NumSG = ((NumWILocal + SGSize - 1) / SGSize) * NumWG;
817+
815818
// Prepare asan runtime data
816819
LaunchInfo.Data.Host.GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin;
817820
LaunchInfo.Data.Host.GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd;
@@ -841,20 +844,20 @@ ur_result_t AsanInterceptor::prepareLaunch(
841844
// Write shadow memory offset for private memory
842845
if (getContext()->Options.DetectPrivates) {
843846
if (DeviceInfo->Shadow->AllocPrivateShadow(
844-
Queue, NumWI, NumWG, LaunchInfo.Data.Host.PrivateBase,
847+
Queue, NumSG, LaunchInfo.Data.Host.PrivateBase,
845848
LaunchInfo.Data.Host.PrivateShadowOffset,
846849
LaunchInfo.Data.Host.PrivateShadowOffsetEnd) != UR_RESULT_SUCCESS) {
847850
UR_LOG_L(getContext()->logger, WARN,
848851
"Failed to allocate shadow memory for private memory, "
849-
"maybe the number of workgroup ({}) is too large",
850-
NumWG);
852+
"maybe the number of subgroup ({}) is too large",
853+
NumSG);
851854
UR_LOG_L(getContext()->logger, WARN,
852855
"Skip checking private memory of kernel <{}>",
853856
GetKernelName(Kernel));
854857
LaunchInfo.Data.Host.PrivateShadowOffset = 0;
855858
} else {
856859
UR_LOG_L(getContext()->logger, INFO,
857-
"ShadowMemory(Private, WorkGroup={}, {} - {})", NumWG,
860+
"ShadowMemory(Private, SubGroup={}, {} - {})", NumSG,
858861
(void *)LaunchInfo.Data.Host.PrivateShadowOffset,
859862
(void *)LaunchInfo.Data.Host.PrivateShadowOffsetEnd);
860863
}

unified-runtime/source/loader/layers/sanitizer/asan/asan_libdevice.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ struct LocalArgsInfo {
4949
uint64_t SizeWithRedZone = 0;
5050
};
5151

52+
constexpr uint32_t ASAN_MAX_WG_LOCAL = 8192;
53+
54+
constexpr uint32_t ASAN_MAX_SG_PRIVATE = 256;
55+
5256
constexpr uint64_t ASAN_MAX_NUM_REPORTS = 10;
5357

5458
struct AsanRuntimeData {

unified-runtime/source/loader/layers/sanitizer/asan/asan_shadow.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,8 @@ ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue,
247247
uptr &End) {
248248
const size_t LocalMemorySize = GetDeviceLocalMemorySize(Device);
249249
const size_t RequiredShadowSize =
250-
(NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE;
250+
(std::min(ASAN_MAX_WG_LOCAL, NumWG) * LocalMemorySize) >>
251+
ASAN_SHADOW_SCALE;
251252
static size_t LastAllocedSize = 0;
252253
if (RequiredShadowSize > LastAllocedSize) {
253254
ur_context_handle_t QueueContext = GetContext(Queue);
@@ -285,16 +286,17 @@ ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue,
285286
}
286287

287288
ur_result_t ShadowMemoryGPU::AllocPrivateShadow(ur_queue_handle_t Queue,
288-
uint64_t NumWI, uint32_t NumWG,
289-
uptr *&Base, uptr &Begin,
290-
uptr &End) {
289+
uint32_t NumSG, uptr *&Base,
290+
uptr &Begin, uptr &End) {
291291
// Trying to allocate private base array and private shadow, and any one of
292292
// them fail to allocate would be a failure
293293
static size_t LastPrivateBaseAllocedSize = 0;
294294
static size_t LastPrivateShadowAllocedSize = 0;
295295

296+
NumSG = std::min(NumSG, ASAN_MAX_SG_PRIVATE);
297+
296298
try {
297-
const size_t NewPrivateBaseSize = NumWI * sizeof(uptr);
299+
const size_t NewPrivateBaseSize = NumSG * sizeof(uptr);
298300
if (NewPrivateBaseSize > LastPrivateBaseAllocedSize) {
299301
if (PrivateBasePtr) {
300302
UR_CALL_THROWS(getContext()->urDdiTable.USM.pfnFree(
@@ -317,7 +319,7 @@ ur_result_t ShadowMemoryGPU::AllocPrivateShadow(ur_queue_handle_t Queue,
317319
}
318320

319321
const size_t NewPrivateShadowSize =
320-
(NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE;
322+
(NumSG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE;
321323
if (NewPrivateShadowSize > LastPrivateShadowAllocedSize) {
322324
ur_context_handle_t QueueContext = GetContext(Queue);
323325
auto ContextInfo = getAsanInterceptor()->getContextInfo(QueueContext);

unified-runtime/source/loader/layers/sanitizer/asan/asan_shadow.hpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,8 @@ struct ShadowMemory {
5656
uptr &Begin, uptr &End) = 0;
5757

5858
virtual ur_result_t AllocPrivateShadow(ur_queue_handle_t Queue,
59-
uint64_t NumWI, uint32_t NumWG,
60-
uptr *&Base, uptr &Begin,
61-
uptr &End) = 0;
59+
uint32_t NumSG, uptr *&Base,
60+
uptr &Begin, uptr &End) = 0;
6261

6362
ur_context_handle_t Context{};
6463

@@ -90,7 +89,7 @@ struct ShadowMemoryCPU final : public ShadowMemory {
9089
return UR_RESULT_SUCCESS;
9190
}
9291

93-
ur_result_t AllocPrivateShadow(ur_queue_handle_t, uint64_t, uint32_t, uptr *&,
92+
ur_result_t AllocPrivateShadow(ur_queue_handle_t, uint32_t, uptr *&,
9493
uptr &Begin, uptr &End) override {
9594
Begin = ShadowBegin;
9695
End = ShadowEnd;
@@ -110,8 +109,8 @@ struct ShadowMemoryGPU : public ShadowMemory {
110109
ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG,
111110
uptr &Begin, uptr &End) override final;
112111

113-
ur_result_t AllocPrivateShadow(ur_queue_handle_t Queue, uint64_t NumWI,
114-
uint32_t NumWG, uptr *&Base, uptr &Begin,
112+
ur_result_t AllocPrivateShadow(ur_queue_handle_t Queue, uint32_t NumSG,
113+
uptr *&Base, uptr &Begin,
115114
uptr &End) override final;
116115

117116
ur_mutex VirtualMemMapsMutex;

unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -530,11 +530,14 @@ ur_result_t MsanInterceptor::prepareLaunch(
530530
LocalWorkSize[Dim];
531531
}
532532

533-
uint64_t NumWI = 1;
533+
uint64_t NumWILocal = 1;
534534
for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
535-
NumWI *= LaunchInfo.GlobalWorkSize[Dim];
535+
NumWILocal *= LocalWorkSize[Dim];
536536
}
537537

538+
size_t SGSize = GetSubGroupSize(Kernel, DeviceInfo->Handle);
539+
uint32_t NumSG = ((NumWILocal + SGSize - 1) / SGSize) * NumWG;
540+
538541
// Write shadow memory offset for local memory
539542
if (KernelInfo.IsCheckLocals) {
540543
if (DeviceInfo->Shadow->AllocLocalShadow(
@@ -558,22 +561,22 @@ ur_result_t MsanInterceptor::prepareLaunch(
558561
// Write shadow memory offset for private memory
559562
if (KernelInfo.IsCheckPrivates) {
560563
if (DeviceInfo->Shadow->AllocPrivateShadow(
561-
Queue, NumWI, NumWG, LaunchInfo.Data.Host.PrivateBase,
564+
Queue, NumSG, LaunchInfo.Data.Host.PrivateBase,
562565
LaunchInfo.Data.Host.PrivateShadowOffset,
563566
LaunchInfo.Data.Host.PrivateShadowOffsetEnd) != UR_RESULT_SUCCESS) {
564567
UR_LOG_L(getContext()->logger, WARN,
565568
"Failed to allocate shadow memory for private memory, "
566-
"maybe the number of workgroup ({}) is too large",
567-
NumWG);
569+
"maybe the number of subgroup ({}) is too large",
570+
NumSG);
568571
UR_LOG_L(getContext()->logger, WARN,
569572
"Skip checking private memory of kernel <{}>",
570573
GetKernelName(Kernel));
571574
LaunchInfo.Data.Host.PrivateShadowOffset = 0;
572575
} else {
573576
UR_LOG_L(
574577
getContext()->logger, DEBUG,
575-
"ShadowMemory(Private, WorkGroup={}, PrivateBase={}, Shadow={} - {})",
576-
NumWG, (void *)LaunchInfo.Data.Host.PrivateBase,
578+
"ShadowMemory(Private, SubGroup={}, PrivateBase={}, Shadow={} - {})",
579+
NumSG, (void *)LaunchInfo.Data.Host.PrivateBase,
577580
(void *)LaunchInfo.Data.Host.PrivateShadowOffset,
578581
(void *)LaunchInfo.Data.Host.PrivateShadowOffsetEnd);
579582
}

unified-runtime/source/loader/layers/sanitizer/msan/msan_libdevice.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ namespace ur_sanitizer_layer {
2121

2222
constexpr unsigned MSAN_ORIGIN_GRANULARITY = 4U;
2323

24+
constexpr uint32_t MSAN_MAX_WG_LOCAL = 1024;
25+
26+
constexpr uint32_t MSAN_MAX_SG_PRIVATE = 32;
27+
2428
struct MsanErrorReport {
2529
int Flag = 0;
2630

0 commit comments

Comments
 (0)