Skip to content

Commit 0405046

Browse files
Hakon-BuggeBrian Maly
authored andcommitted
rds: ib: Add cm_id generation scheme in order to detect new ones
In rds_rdma_cm_event_handler_cmn() we have a simple address compare of the supplied cm_id and the one stored in the IB Connection struct, ic: if (ic->i_cm_id != cm_id) { But this address compare is flaky. Although the address is the same, the cm_id per se, does not need to be the same. The cm_id may be destroyed and re-created. Since we here talk about free() + kzalloc(), there is a fair probability that the new cm_id ends up at the very same address as the old one. If that happens, there is a dissonance between the RDS IB Connection state and the state of the cm_id. This bug used to manifest itself as a NULL ptr deref in rds_rdma_cm_event_handler_cmn(), but that was before commit a71a60c ("rds: ib: Remove incorrect update of the path record sl and qos_class fields"). Kernels with a71a60c may observe the following crash: BUG: kernel NULL pointer dereference, address: 0000000000000034 RIP: 0010:dma_direct_unmap_sg+0x5a/0x220 Call Trace: rds_ib_recv_cqe_handler+0xed/0x395 [rds_rdma] poll_rcq+0x72/0xa0 [rds_rdma] rds_ib_rx+0xb9/0x2b0 [rds_rdma] rds_ib_tasklet_fn_recv+0x2f/0x40 [rds_rdma] tasklet_action_common+0x153/0x240 handle_softirqs+0xe4/0x2ac __irq_exit_rcu+0xab/0xd0 common_interrupt+0x85/0xa0 Drgn analyses of the crash: >>> trace = prog.crashed_thread().stack_trace() >>> trace #0 sg_dma_is_bus_address (./include/linux/scatterlist.h:297:11) #1 dma_direct_unmap_sg (kernel/dma/direct.c:452:7) #2 ib_dma_unmap_sg_attrs (./include/rdma/ib_verbs.h:4232:3) #3 ib_dma_unmap_sg (./include/rdma/ib_verbs.h:4294:2) #4 rds_ib_recv_cqe_handler (net/rds/ib_recv.c:1397:2) #5 poll_rcq (net/rds/ib_cm.c:777:4) #6 rds_ib_rx (net/rds/ib_cm.c:870:2) #7 rds_ib_recv_cb (net/rds/ib_cm.c:916:2) #8 rds_ib_tasklet_fn_recv (net/rds/ib_cm.c:925:2) #9 tasklet_action_common (kernel/softirq.c:795:7) #10 handle_softirqs (kernel/softirq.c:561:3) #11 __do_softirq (kernel/softirq.c:595:2) #12 invoke_softirq (kernel/softirq.c:435:3) #13 __irq_exit_rcu (kernel/softirq.c:644:3) #14 common_interrupt (arch/x86/kernel/irq.c:280:1) #15 asm_common_interrupt+0x26/0x2b (./arch/x86/include/asm/idtentry.h:693) #16 cpuidle_enter_state (drivers/cpuidle/cpuidle.c:288:5) #17 cpuidle_enter (drivers/cpuidle/cpuidle.c:385:9) #18 cpuidle_idle_call (kernel/sched/idle.c:230:19) #19 do_idle (kernel/sched/idle.c:326:4) #20 cpu_startup_entry (kernel/sched/idle.c:424:3) #21 rest_init (init/main.c:747:2) #22 start_kernel (init/main.c:1105:2) #23 x86_64_start_reservations (arch/x86/kernel/head64.c:507:2) #24 x86_64_start_kernel (arch/x86/kernel/head64.c:488:2) #25 secondary_startup_64+0x15d/0x15f (arch/x86/kernel/head_64.S:413) >>> trace[4]["ic"].i_recvs[0] (struct rds_ib_recv_work){ .r_ibinc = (struct rds_ib_incoming *)0x0, .r_frag = (struct rds_page_frag *)0x0, .r_wr = (struct ib_recv_wr){ .next = (struct ib_recv_wr *)0x0, .wr_id = (u64)0, .wr_cqe = (struct ib_cqe *)0x0, .sg_list = (struct ib_sge *)0xffffb91c1a5b5030, .num_sge = (int)5, }, .r_sge = (struct ib_sge [8]){ { .addr = (u64)33819643904, .length = (u32)48, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, }, .r_ic = (struct rds_ib_connection *)0x0, .r_posted = (int)0, } The prosaic version: We do get an event and processes a recv CQE, but when performing DMA unmap, the addresses for SGE entry 1-4 are all zero. That explains the crash. If you look at rds_ib_recv_init_ring(), the state will be exactly what we see above when this function has been called. So, how can this happen? We do get an RDMA_CM_EVENT_ROUTE_RESOLVED event with a new cm_id which happens to have the same address as the one stored in the RDS IB Connection (ic). Now, without a71a60c, we do not crash in rds_rdma_cm_event_handler_cmn() but calls: trans->cm_initiate_connect() rds_ib_cm_initiate_connect() rds_ib_recv_init_ring() and the latter does: sge = recv->r_sge; sge->addr = ic->i_recv_hdrs_dma[i]; sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; for (j = 1; j < num_send_sge; j++) { sge = recv->r_sge + j; sge->addr = 0; <==== Note sge->length = PAGE_SIZE; sge->lkey = ic->i_mr->lkey; } This fits 100% with the r_sge pasted above. We fix this bug by introducing a generation number and embed that in the last three bits of cm_id->context, since these bits are zero due to alignment requirement. When we use cm_id->context to find the RDS Connection (conn), we mask away the three lower bits. When we create the cm_id, we insert the next generation and make a copy of it in the ic. In rds_rdma_cm_event_handler_cmn(), when we have established that the two addresses are equal, we add an additional check to see if the cm_id generation is the same between the rdma_cm supplied cm_id and the generation stored in ic. If the generations differ, we increment the s_ib_cm_id_resurrected statistics counter and return. When handling an incoming connect, we set the cm_id generation count stored in ic to zero. Orabug: 38105083 Fixes: 3ae09d7 ("net/rds: Don't block workqueues "cma_wq" and "cm.wq"") Signed-off-by: Håkon Bugge <[email protected]> Reviewed-by: Sharath Srinivasan <[email protected]> Signed-off-by: Brian Maly <[email protected]>
1 parent 047fcbb commit 0405046

File tree

4 files changed

+31
-8
lines changed

4 files changed

+31
-8
lines changed

net/rds/ib.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@ enum rds_ib_conn_flags {
9292

9393
#define RDS_IB_CQ_FOLLOW_AFFINITY_THROTTLE 100 /* msec */
9494

95+
#define RDS_IB_CM_IB_GEN_MASK 7
96+
#define RDS_IB_CM_ID_ADD_GEN(conn, gen) ((void *)((uintptr_t)(conn) + (gen)))
97+
#define RDS_IB_CM_ID_EXTRACT_GEN(id) ((uintptr_t)(id)->context & RDS_IB_CM_IB_GEN_MASK)
98+
#define RDS_IB_CM_ID_EXTRACT_CONN(id) ((void *)((uintptr_t)(id)->context & ~RDS_IB_CM_IB_GEN_MASK))
99+
95100
enum rds_ib_preferred_cpu_options {
96101
RDS_IB_PREFER_CPU_CQ = 1 << 0,
97102
RDS_IB_PREFER_CPU_NUMA = 1 << 1,
@@ -358,6 +363,7 @@ struct rds_ib_connection {
358363
u16 i_frag_sz; /* IB fragment size */
359364
u16 i_frag_cache_sz;
360365
u8 i_frag_pages;
366+
u8 i_cm_id_gen:3;
361367
unsigned long i_flags;
362368
u16 i_frag_cache_inx;
363369
u16 i_hca_sge;
@@ -629,6 +635,7 @@ struct rds_ib_statistics {
629635
uint64_t s_ib_frag_pages_allocated;
630636
uint64_t s_ib_frag_pages_in_ib_recv_queue;
631637
uint64_t s_ib_frag_pages_in_caches;
638+
uint64_t s_ib_cm_id_resurrected;
632639
};
633640

634641
extern struct workqueue_struct *rds_ib_wq;

net/rds/ib_cm.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1836,6 +1836,7 @@ static int rds_ib_cm_accept(struct rds_connection *conn,
18361836
BUG_ON(ic->i_cm_id);
18371837

18381838
ic->i_cm_id = cm_id;
1839+
ic->i_cm_id_gen = 0;
18391840
cm_id->context = conn;
18401841

18411842
if (isv6) {
@@ -2214,7 +2215,7 @@ void rds_ib_conn_destroy_init(struct rds_connection *conn)
22142215

22152216
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
22162217
{
2217-
struct rds_connection *conn = cm_id->context;
2218+
struct rds_connection *conn = RDS_IB_CM_ID_EXTRACT_CONN(cm_id);
22182219
struct rds_ib_connection *ic = conn->c_transport_data;
22192220
struct rdma_conn_param conn_param;
22202221
union rds_ib_conn_priv dp;
@@ -2419,8 +2420,9 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
24192420
handler = rds_rdma_cm_event_handler;
24202421

24212422
WARN_ON(ic->i_cm_id);
2422-
ic->i_cm_id = rdma_create_id(rds_conn_net(conn),
2423-
handler, conn, RDMA_PS_TCP, IB_QPT_RC);
2423+
ic->i_cm_id = rdma_create_id(rds_conn_net(conn), handler,
2424+
RDS_IB_CM_ID_ADD_GEN(conn, ++ic->i_cm_id_gen),
2425+
RDMA_PS_TCP, IB_QPT_RC);
24242426

24252427
if (IS_ERR(ic->i_cm_id)) {
24262428
ret = PTR_ERR(ic->i_cm_id);

net/rds/ib_stats.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2006 Oracle. All rights reserved.
2+
* Copyright (c) 2006, 2025, Oracle and/or its affiliates.
33
*
44
* This software is available to you under a choice of one of two
55
* licenses. You may choose to be licensed under the terms of the GNU
@@ -98,6 +98,7 @@ static char *rds_ib_stat_names[] = {
9898
"ib_frag_pages_allocated",
9999
"ib_frag_pages_in_ib_recv_queue",
100100
"ib_frag_pages_in_caches",
101+
"ib_cm_id_resurrected",
101102
};
102103

103104
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,

net/rds/rdma_transport.c

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ struct rds_rdma_cm_event_handler_info {
5252
struct rdma_cm_event event;
5353
struct rds_connection *conn;
5454
bool isv6;
55+
int cm_id_gen;
5556
char private_data[];
5657
};
5758

@@ -110,6 +111,7 @@ static inline bool __rds_rdma_chk_dev(struct rds_connection *conn)
110111
}
111112

112113
static void rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
114+
int cm_id_gen,
113115
struct rdma_cm_event *event,
114116
struct rds_connection *conn,
115117
bool isv6)
@@ -198,6 +200,14 @@ static void rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
198200
return;
199201
}
200202

203+
/* Although same address, could be different generation */
204+
if (cm_id_gen != ic->i_cm_id_gen) {
205+
rds_ib_stats_inc(s_ib_cm_id_resurrected);
206+
up_read(&ic->i_cm_id_free_lock);
207+
mutex_unlock(&conn->c_cm_lock);
208+
return;
209+
}
210+
201211
/* Even though this function no longer accesses "ic->i_cm_id" past this point
202212
* and "cma.c" always blocks "rdma_destroy_id" until "event_callback" is done,
203213
* we still need to hang on to the "i_cm_id_free_lock" until return,
@@ -388,7 +398,8 @@ static void rds_rdma_cm_event_handler_worker(struct work_struct *work)
388398
struct rds_rdma_cm_event_handler_info,
389399
work);
390400

391-
rds_rdma_cm_event_handler_cmn(info->cm_id, &info->event, info->conn, info->isv6);
401+
rds_rdma_cm_event_handler_cmn(info->cm_id, info->cm_id_gen, &info->event,
402+
info->conn, info->isv6);
392403

393404
kfree(info);
394405
}
@@ -397,7 +408,8 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
397408
struct rdma_cm_event *event,
398409
bool isv6)
399410
{
400-
struct rds_connection *conn = cm_id->context;
411+
int cm_id_gen = RDS_IB_CM_ID_EXTRACT_GEN(cm_id);
412+
struct rds_connection *conn = RDS_IB_CM_ID_EXTRACT_CONN(cm_id);
401413
struct rds_ib_connection *ic = conn ? conn->c_transport_data : NULL;
402414
struct workqueue_struct *wq;
403415
struct rds_rdma_cm_event_handler_info *info;
@@ -408,13 +420,13 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
408420
wq = rds_aux_wq;
409421

410422
if (!wq) {
411-
rds_rdma_cm_event_handler_cmn(cm_id, event, conn, isv6);
423+
rds_rdma_cm_event_handler_cmn(cm_id, cm_id_gen, event, conn, isv6);
412424
return;
413425
}
414426

415427
info = kmalloc(sizeof(*info) + event->param.conn.private_data_len, GFP_KERNEL);
416428
if (!info) {
417-
rds_rdma_cm_event_handler_cmn(cm_id, event, conn, isv6);
429+
rds_rdma_cm_event_handler_cmn(cm_id, cm_id_gen, event, conn, isv6);
418430
return;
419431
}
420432

@@ -424,6 +436,7 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
424436
memcpy(&info->event, event, sizeof(*event));
425437
info->conn = conn;
426438
info->isv6 = isv6;
439+
info->cm_id_gen = cm_id_gen;
427440

428441
if (event->param.conn.private_data &&
429442
event->param.conn.private_data_len) {

0 commit comments

Comments
 (0)