Viewing: efalnd.h

/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Copyright (c) 2023-2026, Amazon and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file is part of Lustre, http://www.lustre.org/
 *
 * Author: Yehuda Yitschak <yehuday@amazon.com>
 * Author: Yonatan Nachum <ynachum@amazon.com>
 */

#ifndef _EFALND_EFALND_H_
#define _EFALND_EFALND_H_

#define DEBUG_SUBSYSTEM S_LND

#include <linux/bvec.h>
#include <linux/rhashtable.h>
#include <linux/libcfs/libcfs.h>

#include <linux/lnet/lib-lnet.h>
#include <linux/lnet/lib-types.h>
#include <linux/lnet/lnet_rdma.h>

#include <efa_verbs.h>

#include "efalnd_proto.h"

#define EFALND_MAJOR_VER        1
#define EFALND_MINOR_VER        2
#define EFALND_SUBMINOR_VER     2
#define EFALND_MAJOR_SHIFT	8

#define KEFA_IFNAME_SIZE		256
#define EFALND_CREDITS_MIN		8	/* Min # of peer_ni credits */
#define EFALND_CREDITS_MAX		255	/* Max # of peer_ni credits */

#define KEFA_THREAD_SHIFT		16
#define KEFA_THREAD_ID(cpt, tid)	((cpt) << KEFA_THREAD_SHIFT | (tid))
#define KEFA_THREAD_CPT(id)		((id) >> KEFA_THREAD_SHIFT)
#define KEFA_THREAD_TID(id)		((id) & ((1UL << KEFA_THREAD_SHIFT) - 1))

#define EFALND_MAX_MTU			(8900)
#define EFALND_MSG_SIZE			EFALND_MAX_MTU
#define EFALND_MSG_SIZE_ALIGNED		PAGE_ALIGN(EFALND_MSG_SIZE)
#define EFALND_MSG_PAGES		(EFALND_MSG_SIZE_ALIGNED / PAGE_SIZE)
#define EFALND_RX_MSGS(q)		(2 * (q)->rq_depth)

/* max # of fragments supported. + 1 for unaligned case */
#define EFALND_MAX_TX_FRAGS		(LNET_MAX_IOV + 1)

/* Max # of TXs each NI can allocate. */
#define EFALND_MAX_NI_TX_POOL		2048

/* default values in case no tunable was set */
#define EFALND_MIN_SCHED_THRS		2
#define EFALND_MAX_SCHED_THRS		4

 /* Used only for small NIDs */
#define EFALND_CM_STATIC_QKEY		(0x1111)

#define EFALND_NO_RDMA_THRESH		EFALND_MSG_SIZE

#define EFALND_CONN_HASH_BITS		7

#define EFALND_INV_CONN			((u32)~0U)

#define EFALND_MAX_PEER_QPS		256

#define EFALND_MIN_INIT_CONN_TIMEOUT	200

/*
 * NI large NID is of the following format:
 * |15            4|3       2|1         0|
 * +---------------+---------+-----------+
 * |      GID      | QP QKEY | QP Number |
 * +---------------+---------+-----------+
 * GID: EFA GID is 16 bytes with bytes 0-3 always constant.
 * QP Number: CM QP number for the remote to be able to establish connection.
 * QP QKEY: 2 byte QKEY to fit in the available area in EFA GID.
 */
#define EFALND_NID_CM_QP_NUM_OFFSET	0
#define EFALND_NID_CM_QP_NUM_SIZE	2
#define EFALND_NID_CM_QP_QKEY_OFFSET	2
#define EFALND_NID_CM_QP_QKEY_SIZE	2
#define EFALND_NID_GID_OFFSET		4
#define EFALND_NID_GID_SIZE		12

/* Define EFALND_CD so that we can easily add D_CONSOLE in test envs */
#define EFALND_CD (D_NET)

#define EFALND_FIELD_AVAIL(type, fld, sz) (offsetof(type, fld) < (sz))

#define EFA_DEV_DEBUG(dev, format, ...) CDEBUG(EFALND_CD, "Device[%s] " format, (dev)->ifname, ## __VA_ARGS__)
#define EFA_DEV_ERR(dev, format, ...) CERROR("Device[%s] " format, (dev)->ifname, ## __VA_ARGS__)
#define EFA_DEV_WARN(dev, format, ...) CWARN("Device[%s] " format, (dev)->ifname, ## __VA_ARGS__)

struct kefa_cq;
struct kefa_qp;
struct kefa_dev;
struct kefa_ni;
struct kefa_sched;
struct kefa_conn;
struct kefa_obj_pool;

extern struct kefa_tunables kefalnd_tunables;
extern struct kefa_data kefalnd;

enum efalnd_init_state {
	EFALND_INIT_NONE = 0,
	EFALND_INIT_ALL
};

struct kefa_remote_qp {
	u16 qp_num;
	u32 qkey;
};

struct kefa_peer_ni {
	struct kref refcount;             /* number of connections */
	u32 remote_nid_addr;              /* address of EFA NID */
	union ib_gid gid;
	struct kefa_remote_qp cm_qp;
	struct rhash_head linkage;
	rwlock_t peer_ni_lock;            /* protects kefa_peer_ni data */
	struct rcu_head rcu_read;         /* protects nid_gid_map lifetime */
};

static const struct rhashtable_params peer_ni_params = {
	.key_len     = sizeof(u32),
	.key_offset  = offsetof(struct kefa_peer_ni, remote_nid_addr),
	.head_offset = offsetof(struct kefa_peer_ni, linkage),
};

struct kefa_tunables {
	int *kefa_rnr_retry_count;
	/* # threads on each CPT */
	int *kefa_nscheds;
	char **kefa_ipif_name;
};

/* global singelton EFA data */
struct kefa_data {
	enum efalnd_init_state init_state;	/* init state of global data */
	struct list_head efa_ni_list;		/* list of EFA NIs */
	struct kefa_sched **scheds;		/* global schedulers */
	struct kefa_cm_deamon **cm_daemons;	/* Connection manager daemons */
	struct rhashtable peer_ni;
	atomic_t peer_ni_count;
	atomic_t nthreads;			/* # live threads */
	bool shutdown;				/* signal shutdown to threads */
};

struct kefa_obj_pool {
	struct kefa_ni *efa_ni;
	void *obj_arr;
	struct list_head free_obj;
	struct list_head free_pend_obj;	/* Objects pending to be freed */
	atomic_t pending_work;		/* Pending list have objects on */
	spinlock_t lock;		/* multithread lock */
	u32 pool_size;
	int cpt;
};

struct kefa_rx {
	struct list_head list_node;
	struct kefa_qp *qp;		/* owner QP */
	struct kefa_msg *msg;		/* message buffer (host vaddr) */
	struct ib_recv_wr wrq;		/* receive work item... */
	struct ib_sge sge;		/* ...and its memory */
	int rx_nob;			/* # bytes received (-1 while posted) */
};

struct kefa_qp {
	struct kefa_dev *efa_dev;
	struct kefa_cq *cq;
	struct ib_qp *ib_qp;
	struct kefa_rx *rx_msgs;	/* RX buffers posted to RQ */
	struct list_head posted_rx;	/* list of posted RX */
	struct list_head free_rx;	/* list of free RX */
	spinlock_t rq_lock;
	u32 rq_depth;
	u32 rq_space;
	u32 qkey;
};

struct kefa_cq {
	struct ib_cq *ib_cq;
	struct kefa_dev *efa_dev;	/* owner device */
	struct list_head sched_node;	/* node on scheduler */
	int cpt;
};

enum kefa_fmr_state {
	KEFA_FMR_INACTIVE = 0,
	KEFA_FMR_ACTIVATING,
	KEFA_FMR_ACTIVE,
	KEFA_FMR_DEACTIVATING,
};

struct kefa_fmr {
	struct ib_mr *mr;
	enum kefa_fmr_state state;
	struct list_head list_node;
	struct ib_reg_wr reg_wr;
	struct ib_send_wr inv_wr;
};

/* EFA device information */
struct kefa_dev {
	struct ib_device *ib_dev;
	char ifname[KEFA_IFNAME_SIZE];
	struct kefa_ni *efa_ni;	/* The EFA NI associated with the device */
	union ib_gid gid;
	struct ib_pd *pd;		/* PD */

	struct kefa_obj_pool fmr_pool;

	struct kefa_qp *qps;		/* QP set */
	atomic_t local_qpn;

	struct kefa_cq *cqs;		/* CQ set */

	struct kefa_qp *cm_qp;		/* Connection establishment QP */
	struct kefa_cq *cm_cq;

	__be32 ifip;			/* Eth interface IP */
	u32 nqps;
	u32 ncqs;
	int cpt;			/* CPU partition of the device */
};

/* transmit message */
struct kefa_tx {
	struct list_head list_node;	/* node on pool/conn list */
	struct kefa_obj_pool *tx_pool;	/* pool I'm from */
	struct kefa_conn *conn;		/* connection for TX */
	struct lnet_msg *lntmsg[2]; /* lnet msgs to finalize on completion */
	struct kefa_msg *msg;		/* message buffer (host vaddr) */
	dma_addr_t msgaddr;		/* message buffer (I/O addr) */
	struct ib_srd_rdma_wr wrq;	/* send work item... */
	struct ib_sge sge;		/* ...and its memory */
	u32 lkey;			/* lkey of sge buffers */
	enum lnet_msg_hstatus hstatus;	/* health status of tx */
	int status;			/* overall status */
	int nfrags;			/* # of mapped buffer fragments */
	struct scatterlist *frags;	/* mapped buffer fragments */
	struct kefa_rdma_desc rdma_desc;/* rdma descriptor to read/write */
	enum dma_data_direction dmadir;
	atomic_t ref_cnt;		/* track sends and completions */
	atomic_t waiting_resp;
	struct kefa_fmr *fmr;
	atomic64_t send_time;		/* send time of send in seconds */

	u8 type;
	bool send_sync;		/* send ctrl message after RDMA completes */
	u64 cookie;		/* opaque completion cookie for sync message */
};

/* Per Lnet network data */
struct kefa_ni {
	struct list_head lnd_node;	/* node in LND NI list */
	struct list_head cm_node;	/* node in connection manager daemon */
	struct kefa_dev *efa_dev;	/* underlying IB device */
	struct lnet_ni *lnet_ni;	/* LNet interface */
	u64 ni_epoch;			/* my epoch */
	struct kefa_obj_pool tx_pool;
	DECLARE_HASHTABLE(conns, EFALND_CONN_HASH_BITS);
	rwlock_t conn_lock;
	struct kefa_peer_ni *self_peer_ni;	/* Only valid for small NID NI*/
};

enum kefa_conn_state {
	KEFA_CONN_INACTIVE,
	KEFA_CONN_PROBE_TCP,
	KEFA_CONN_PROBE_EFA,
	KEFA_CONN_PROBE_EFA_PASSIVE,
	KEFA_CONN_ESTABLISH,
	KEFA_CONN_ACTIVE,
	KEFA_CONN_DEACTIVATING,
};

enum kefa_conn_type {
	KEFA_CONN_TYPE_LB,
	KEFA_CONN_TYPE_INITIATOR,
	KEFA_CONN_TYPE_RESPONDER,
};

struct kefa_conn {
	spinlock_t lock;
	enum kefa_conn_state state;
	struct list_head active_tx;	/* LRU list of active kefa_tx */
	struct list_head pend_tx;	/* list of pending kefa_tx */

	/* Fields that can be changed not under connection lock */
	struct ib_ah *ah;
	u64 remote_epoch;		/*The epoch of the remote connection */
	u8 proto_ver;
	u32 nqps;
	struct kefa_remote_qp *data_qps;
	atomic_t last_qp_idx;
	struct lnet_nid remote_nid;
	time64_t last_use_time;	/* last time the conn was used in seconds */
	struct hlist_node ni_node;	/* node on kefa_ni hashmap */
	struct kefa_ni *efa_ni;
	u64 hash_key;

	/* Low frequency fields */
	struct list_head abort_tx;	/* Only CM iterates this list */
	enum kefa_conn_type type;
	struct lnet_nid local_nid;
	struct kefa_peer_ni *peer_ni; /* my peer NI - only valid for small NID*/
	u64 remote_caps;
	u64 requests;
};

struct kefa_cm_deamon {
	struct mutex ni_list_lock;	/* multithread lock */
	struct list_head efa_ni_list;	/* list of EFA NIs */
	wait_queue_head_t waitq;
	bool active;
	int iter;
	int cpt;
};

struct kefa_sched {
	spinlock_t lock;		/* multithread lock */
	struct list_head pend_cqs;	/* CQs to poll */
	wait_queue_head_t waitq;
	int nthreads;			/* # of poll threads */
	int nthreads_max;		/* max # of threads */
	int cpt;			/* CPT id */
};

static inline u16
kefalnd_get_lnd_version(void)
{
	return ((EFALND_MAJOR_VER << EFALND_MAJOR_SHIFT) | EFALND_MINOR_VER);
}

static inline void
kefalnd_large_nid_create(struct lnet_nid *nid, const union ib_gid *gid,
			 u16 qp_num, u16 qp_qkey)
{
	u16 be_qp_num = cpu_to_be16(qp_num);
	u16 be_qp_qkey = cpu_to_be16(qp_qkey);

	memcpy((u8 *)nid->nid_addr + EFALND_NID_GID_OFFSET,
	       gid->raw + EFALND_NID_GID_OFFSET,
	       EFALND_NID_GID_SIZE);
	memcpy((u8 *)nid->nid_addr + EFALND_NID_CM_QP_NUM_OFFSET, &be_qp_num,
	       EFALND_NID_CM_QP_NUM_SIZE);
	memcpy((u8 *)nid->nid_addr + EFALND_NID_CM_QP_QKEY_OFFSET, &be_qp_qkey,
	       EFALND_NID_CM_QP_QKEY_SIZE);

	nid->nid_size = sizeof(union ib_gid) - 4;
}

static inline void
kefalnd_large_nid_get_gid(struct lnet_nid *nid, union ib_gid *gid)
{
	memcpy(gid->raw, nid->nid_addr, sizeof(gid->raw));
	gid->raw[0] = 0xfe;
	gid->raw[1] = 0x80;
	gid->raw[2] = 0;
	gid->raw[3] = 0;
}

static inline u16
kefalnd_large_nid_get_cm_qp_num(struct lnet_nid *nid)
{
	u16 qp_num;

	memcpy(&qp_num, (u8 *)nid->nid_addr + EFALND_NID_CM_QP_NUM_OFFSET,
	       sizeof(qp_num));

	return be16_to_cpu(qp_num);
}

static inline u16
kefalnd_large_nid_get_cm_qp_qkey(struct lnet_nid *nid)
{
	u16 qp_qkey;

	memcpy(&qp_qkey, (u8 *)nid->nid_addr + EFALND_NID_CM_QP_QKEY_OFFSET,
	       sizeof(qp_qkey));

	return be16_to_cpu(qp_qkey);
}

static inline void
kefalnd_thread_stop(void)
{
	atomic_dec(&kefalnd.nthreads);
}

static inline void
kefalnd_msg_set_epoch(struct kefa_msg *msg, u64 remote_epoch)
{
	if (msg->hdr.proto_ver != EFALND_PROTO_VER_1)
		msg->msg_v2.dst_epoch = remote_epoch;
	else
		msg->msg_v1.dst_epoch = remote_epoch;
}

int kefalnd_tunables_init(void);
int kefalnd_tunables_setup(struct lnet_ni *ni);

int kefalnd_msgtype2size(int type, u8 proto_ver);
int kefalnd_efa_status_to_errno(s16 efa_status);
s16 kefalnd_errno_to_efa_status(int status);
void kefalnd_tx_done(struct kefa_tx *tx);
void kefalnd_abort_tx(struct kefa_tx *tx, enum lnet_msg_hstatus hstatus,
		      int status);
/* Should be used only on TXs that we don't expect to get any completions for */
void kefalnd_force_cancel_tx(struct kefa_tx *tx, enum lnet_msg_hstatus hstatus,
			     int status);
void kefalnd_init_tx_protocol_msg(struct kefa_tx *tx, struct kefa_conn *conn,
				  int type, int body_nob, u8 proto_ver);
struct kefa_tx *kefalnd_get_idle_tx(struct kefa_ni *efa_ni);
void kefalnd_conn_post_tx_locked(struct kefa_conn *conn);
void kefalnd_get_srcnid_from_msg(struct kefa_msg *msg, struct lnet_nid *srcnid);
void kefalnd_get_dstnid_from_msg(struct kefa_msg *msg, struct lnet_nid *dstnid);

struct kefa_peer_ni *kefalnd_find_remote_peer_ni(struct kefa_dev *efa_dev,
						 struct lnet_nid *efa_nid);
struct kefa_peer_ni *kefalnd_lookup_or_create_peer_ni(lnet_nid_t nid,
						      union ib_gid *gid,
						      u16 cm_qpn, u32 cm_qkey);
void kefalnd_update_peer_ni(struct kefa_peer_ni *peer_ni, union ib_gid *gid,
			    u16 cm_qpn, u32 cm_qkey);
int kefalnd_get_nid_metadata(struct lnet_ni *ni,
			     struct lnet_nid_md_entry *md_entry);
void kefalnd_put_peer_ni(struct kefa_peer_ni *peer_ni);

void kefalnd_debugfs_init(void);
void kefalnd_debugfs_exit(void);

struct kefa_conn *kefalnd_lookup_conn(struct kefa_ni *efa_ni,
				      struct lnet_nid *nid,
				      enum kefa_conn_type conn_type);
struct kefa_conn *kefalnd_lookup_or_init_conn(struct kefa_ni *efa_ni,
					      struct lnet_nid *nid,
					      enum kefa_conn_type conn_type);
void kefalnd_handle_conn_establishment(struct kefa_ni *efa_ni,
				       struct kefa_msg *msg);
void kefalnd_deactivate_conn(struct kefa_conn *conn);
void kefalnd_destroy_conn(struct kefa_conn *conn, enum lnet_msg_hstatus hstatus,
			  int status);
int kefalnd_cm_daemon(void *arg);
void kefalnd_add_ni_to_cm_daemon(struct kefa_ni *efa_ni);
void kefalnd_del_ni_from_cm_daemon(struct kefa_ni *efa_ni);

#endif