Viewing: efalnd_peerni.c
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025, Amazon and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
*
* EFA GID/QP Discovery via TCP
*
* For IPv4 NIDs the EFA LND automatically discovers the GID and manager
* QP data of remote EFA NIs by doing a TCP LNET ping. This allows instances to
* communicate over EFA without needing large NID support and without
* needing the GID to be provided manually.
*
* The GIDs and manager QP data for all remote NIs for a particular
* node are passed via the LNET ping REPLY packet. Since a node will
* only send the GIDs and QP data of its own NIs, a ping must be
* performed with each node in a cluster.
*
* We implement the LNET callback lnd_get_nid_metadata to enable
* LNET to query the LND for NID related metadata to send alongside
* the ping REPLY. Of course, this is implemented by the EFA LND to
* transmit the GIDs and manager QP data for local NIs.
*
* The NID format is designed to generate unique NIDs without
* needing a centralized name/number server. The NIDs are created by
* taking IP of the primary ethernet interface, discarding the
* subnet mask, and appending the PCI bus/devfn number for the device.
*
* For example, a node with TCP NID 172.86.23.4@tcp would have EFA
* NIDs such as: 23.4.0.79@efa, 23.4.0.96@efa, 23.4.0.131@efa.
*
* We define a kefa_peer_ni struct to track metadata about remote
* NIs. These kefa_peer_ni objects are kref'ed and stored in a global
* rhashtable protected by RCU. Access to each individual kefa_peer_ni
* is protected by an rwlock_t.
*
* A kefa_dev holds a reference to its own kefa_peer_ni. A
* kefa_conn holds a reference to at least the kefa_peer_ni it's
* initiating a connection to. If kefa_conn is the first connection,
* then it holds a reference on each kefa_peer_ni available on the
* remote node.
*
* Author: Timothy Day <timday@amazon.com>
* Author: Yonatan Nachum <ynachum@amazon.com>
*/
#include <linux/delay.h>
#include <linux/dmapool.h>
#include <linux/ethtool.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/smp.h>
#include <rdma/ib_verbs.h>
#include "kcompat.h"
#include "efalnd.h"
/*
 * Timeout handed to lnet_discover_nid_metadata() when pinging the remote
 * node's TCP NID. NOTE(review): presumably in seconds - confirm against
 * the LNet API.
 */
#define EFALND_TCP_PING_TIMEOUT 30
/*
 * Derive the TCP (socklnd, net 0) NID of the node that owns @efa_nid4.
 *
 * @local_ip: IPv4 address of the local interface, in network byte order.
 * @efa_nid4: remote EFA NID; the upper 16 bits of its address carry the
 *            low 16 bits of the remote node's IPv4 address.
 * @tcp_nid:  output, filled with the resulting TCP NID.
 *
 * The remote IPv4 address is rebuilt from the upper 16 bits of the local
 * address and the 16 bits embedded in the EFA NID.
 */
static void efa_nid_to_tcp_nid(__be32 local_ip, lnet_nid_t efa_nid4,
			       struct lnet_nid *tcp_nid)
{
	/*
	 * Convert with be32_to_cpu() rather than an unconditional byte swap
	 * so the result is a host-order address on any endianness.
	 */
	u32 local_ip_host = be32_to_cpu(local_ip);
	lnet_nid_t tcp_nid4;
	u32 remote_ip;

	remote_ip = (local_ip_host & ~0xffffU) |
		    (LNET_NIDADDR(efa_nid4) >> 16);

	tcp_nid4 = LNET_MKNID(LNET_MKNET(SOCKLND, 0), remote_ip);
	lnet_nid4_to_nid(tcp_nid4, tcp_nid);
}
/*
 * kref release callback for a kefa_peer_ni (see kefalnd_put_peer_ni()).
 *
 * Unlinks the object from the peer NI rhashtable unless shutdown has been
 * flagged, drops the global peer NI counter and hands the memory to
 * kfree_rcu() so the free is deferred.
 */
static void peer_ni_free(struct kref *ref)
{
	struct kefa_peer_ni *peer_ni;

	peer_ni = container_of(ref, struct kefa_peer_ni, refcount);

	rcu_read_lock();

	/* The table is only touched while shutdown has not been flagged. */
	if (!kefalnd.shutdown) {
		rhashtable_remove_fast(&kefalnd.peer_ni, &peer_ni->linkage,
				       peer_ni_params);
	}

	atomic_dec(&kefalnd.peer_ni_count);

	/* Log the free, then let kfree_rcu() release the memory. */
	LIBCFS_FREE_PRE(peer_ni, sizeof(*peer_ni), "kfreed");
	kfree_rcu(peer_ni, rcu_read);

	rcu_read_unlock();
}
/*
 * Look up a cached peer NI by NID address and take a reference on it.
 *
 * Returns the referenced peer NI, or NULL when the LND is shutting down or
 * not initialised, when no entry exists, or when the entry's refcount has
 * already dropped to zero.
 */
static struct kefa_peer_ni *get_peer_ni(u32 nid_addr)
{
	struct kefa_peer_ni *found = NULL;

	rcu_read_lock();
	if (!kefalnd.shutdown && kefalnd.init_state != EFALND_INIT_NONE) {
		found = rhashtable_lookup_fast(&kefalnd.peer_ni, &nid_addr,
					       peer_ni_params);
		/* An entry whose last reference is gone must not be revived. */
		if (found && !kref_get_unless_zero(&found->refcount))
			found = NULL;
	}
	rcu_read_unlock();

	return found;
}
/*
 * Return a referenced peer NI for @nid, inserting a new one if none exists.
 *
 * A candidate object is always allocated and initialised from @gid,
 * @cm_qpn and @cm_qkey first. If the table already holds an entry for the
 * same NID address, the candidate is discarded and the existing entry is
 * returned with an extra reference (its stored data is left untouched).
 *
 * Returns the peer NI, or an ERR_PTR(): -ENOMEM on allocation failure,
 * -ENODEV when the LND is shutting down / not initialised or the existing
 * entry is already being released, or the error from the rhashtable insert.
 */
struct kefa_peer_ni *
kefalnd_lookup_or_create_peer_ni(lnet_nid_t nid, union ib_gid *gid, u16 cm_qpn,
				 u32 cm_qkey)
{
	struct kefa_peer_ni *new_peer_ni;
	struct kefa_peer_ni *result;

	CFS_ALLOC_PTR(new_peer_ni);
	if (!new_peer_ni)
		return ERR_PTR(-ENOMEM);

	kref_init(&new_peer_ni->refcount);
	rwlock_init(&new_peer_ni->peer_ni_lock);
	new_peer_ni->remote_nid_addr = LNET_NIDADDR(nid);
	new_peer_ni->cm_qp.qp_num = cm_qpn;
	new_peer_ni->cm_qp.qkey = cm_qkey;
	new_peer_ni->gid = *gid;

	rcu_read_lock();

	if (kefalnd.shutdown || kefalnd.init_state == EFALND_INIT_NONE) {
		result = ERR_PTR(-ENODEV);
		goto out_discard;
	}

	result = rhashtable_lookup_get_insert_fast(&kefalnd.peer_ni,
						   &new_peer_ni->linkage,
						   peer_ni_params);
	if (IS_ERR(result)) {
		CDEBUG(EFALND_CD, "Failed to insert mapping for peer NI[%s]\n",
		       libcfs_nid2str(nid));
		goto out_discard;
	}

	if (result) {
		/* Somebody else got there first: share their entry. */
		CDEBUG(EFALND_CD,
		       "Found pre-existing mapping for peer NI[%s]\n",
		       libcfs_nid2str(nid));
		if (!kref_get_unless_zero(&result->refcount))
			result = ERR_PTR(-ENODEV);
		goto out_discard;
	}

	/* The candidate is now in the table and owned by the caller. */
	rcu_read_unlock();
	atomic_inc(&kefalnd.peer_ni_count);
	return new_peer_ni;

out_discard:
	rcu_read_unlock();
	CFS_FREE_PTR(new_peer_ni);
	return result;
}
/*
 * Drop one reference on @peer_ni. When the reference count reaches zero,
 * kref_put() invokes peer_ni_free() to release the object.
 */
void kefalnd_put_peer_ni(struct kefa_peer_ni *peer_ni)
{
kref_put(&peer_ni->refcount, peer_ni_free);
}
/*
 * Overwrite the GID and manager QP number/qkey stored in @peer_ni.
 *
 * The update is done under the peer NI write lock with interrupts saved.
 * Nothing is changed when the LND is shutting down or not initialised.
 */
void kefalnd_update_peer_ni(struct kefa_peer_ni *peer_ni, union ib_gid *gid,
			    u16 cm_qpn, u32 cm_qkey)
{
	unsigned long irq_flags;

	rcu_read_lock();
	if (!kefalnd.shutdown && kefalnd.init_state != EFALND_INIT_NONE) {
		write_lock_irqsave(&peer_ni->peer_ni_lock, irq_flags);
		peer_ni->gid = *gid;
		peer_ni->cm_qp.qp_num = cm_qpn;
		peer_ni->cm_qp.qkey = cm_qkey;
		write_unlock_irqrestore(&peer_ni->peer_ni_lock, irq_flags);
	}
	rcu_read_unlock();
}
/**
* kefalnd_find_remote_peer_ni() - Either get cached peer NI or ping over TCP.
* @efa_dev: EFA interface that needs the connection.
* @efa_nid: The remote NID to search.
*
* Return: peer NI if found or error.
*/
struct kefa_peer_ni *
kefalnd_find_remote_peer_ni(struct kefa_dev *efa_dev, struct lnet_nid *efa_nid)
{
int mapping_size = offsetof(struct lnet_nid_metadata,
nid_mappings[lnet_interfaces_max]);
struct kefa_peer_ni *peer_ni = NULL;
struct lnet_nid_metadata *mapping;
struct lnet_processid id;
struct lnet_nid tcp_nid;
lnet_nid_t efa_nid4;
union ib_gid gid;
u32 nid_addr;
int rc = 0;
int i = 0;
ENTRY;
LASSERTF(nid_is_nid4(efa_nid), "NID[%s] is not a small NID\n",
libcfs_nidstr(efa_nid));
EFA_DEV_DEBUG(efa_dev, "Attempting to find peer NI for NI[%s]\n",
libcfs_nidstr(efa_nid));
efa_nid4 = lnet_nid_to_nid4(efa_nid);
nid_addr = LNET_NIDADDR(efa_nid4);
peer_ni = get_peer_ni(nid_addr);
if (peer_ni) {
EFA_DEV_DEBUG(efa_dev, "Successfully found peer NI[%s]\n",
libcfs_nidstr(efa_nid));
RETURN(peer_ni);
}
LIBCFS_CPT_ALLOC(mapping, lnet_cpt_table(), efa_dev->cpt, mapping_size);
if (!mapping)
GOTO(out_error, rc = -ENOMEM);
efa_nid_to_tcp_nid(efa_dev->ifip, efa_nid4, &tcp_nid);
EFA_DEV_DEBUG(efa_dev, "Attempting to ping TCP peer NI[%s]\n",
libcfs_nidstr(&tcp_nid));
id.nid = tcp_nid;
id.pid = LNET_PID_LUSTRE;
rc = lnet_discover_nid_metadata(&id, EFALND_TCP_PING_TIMEOUT, mapping);
if (rc) {
EFA_DEV_DEBUG(efa_dev, "Failed to ping TCP peer NI[%s]\n",
libcfs_nidstr(&tcp_nid));
GOTO(out_mapping, rc);
}
EFA_DEV_DEBUG(efa_dev, "Found %i mappings from TCP peer NI[%s]\n",
mapping->num_nid_mappings,
libcfs_nidstr(&tcp_nid));
peer_ni = NULL;
for (i = 0; i < mapping->num_nid_mappings; i++) {
struct kefa_nid_md_entry *kefa_nid_md;
struct kefa_peer_ni *new_peer_ni;
if (LNET_NETTYP(LNET_NIDNET(mapping->nid_mappings[i].nid)) != EFALND)
continue;
if (LNET_NIDADDR(mapping->nid_mappings[i].nid) != nid_addr)
continue;
kefa_nid_md = (struct kefa_nid_md_entry *)&mapping->nid_mappings[i];
memcpy(gid.raw, &kefa_nid_md->gid, sizeof(kefa_nid_md->gid));
new_peer_ni = kefalnd_lookup_or_create_peer_ni(kefa_nid_md->nid,
&gid, kefa_nid_md->qp_num,
kefa_nid_md->qkey);
if (IS_ERR_OR_NULL(new_peer_ni))
GOTO(out_mapping, rc = PTR_ERR(new_peer_ni));
peer_ni = new_peer_ni;
GOTO(out_success, rc);
}
/* We couldn't find the mapping we're looking for */
if (!peer_ni)
GOTO(out_mapping, rc = -ENODEV);
out_success:
LIBCFS_FREE(mapping, mapping_size);
EFA_DEV_DEBUG(efa_dev,
"Completed ping and found GID[0x%016llx] from TCP peer NI[%s]\n",
cpu_to_be64(peer_ni->gid.global.interface_id),
libcfs_nidstr(&tcp_nid));
RETURN(peer_ni);
out_mapping:
LIBCFS_FREE(mapping, mapping_size);
out_error:
RETURN(ERR_PTR(rc));
}
/**
 * kefalnd_get_nid_metadata() - Get NIs GID and manager QP data.
 * @ni: LNET NI associated with EFA NI.
 * @md_entry: Mapping object - for EFA, contains device identifier
 *            that is needed to communicate with EFA devices and
 *            manager QP data. It is accessed as a struct
 *            kefa_nid_md_entry; the gid, qp_num and qkey fields are
 *            written here.
 *
 * Return: 0 when @md_entry has been filled in, -ENODEV when the LND is
 * shutting down or not initialised, or when @ni has no EFA NI / device
 * attached.
 */
int kefalnd_get_nid_metadata(struct lnet_ni *ni,
			     struct lnet_nid_md_entry *md_entry)
{
	struct kefa_nid_md_entry *kefa_ni_md = (struct kefa_nid_md_entry *)md_entry;
	struct kefa_dev *efa_dev;
	struct kefa_ni *efa_ni;

	if (kefalnd.shutdown || kefalnd.init_state == EFALND_INIT_NONE)
		return -ENODEV;

	/*
	 * Only touch the NI private data once the LND is known to be up,
	 * and tolerate an NI that has no EFA state attached.
	 */
	efa_ni = ni->ni_data;
	if (!efa_ni || !efa_ni->efa_dev)
		return -ENODEV;
	efa_dev = efa_ni->efa_dev;

	memcpy(&kefa_ni_md->gid, efa_dev->gid.raw, sizeof(efa_dev->gid.raw));
	kefa_ni_md->qp_num = efa_dev->cm_qp->ib_qp->qp_num;
	kefa_ni_md->qkey = efa_dev->cm_qp->qkey;

	EFA_DEV_DEBUG(efa_dev, "Mapped local NID[%s] to GID[0x%016llx]\n",
		      libcfs_nidstr(&ni->ni_nid),
		      be64_to_cpu(efa_dev->gid.global.interface_id));

	return 0;
}