Viewing: o2iblnd.h

/* SPDX-License-Identifier: GPL-2.0 */

/* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */

/* This file is part of Lustre, http://www.lustre.org/
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>

#if defined(EXTERNAL_OFED_BUILD) && !defined(HAVE_OFED_IB_DMA_MAP_SG_SANE)
#undef CONFIG_INFINIBAND_VIRT_DMA
#endif

#ifdef HAVE_OFED_COMPAT_RDMA
#include <linux/compat-2.6.h>
#endif

#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uio.h>

#include <asm/uaccess.h>
#include <asm/io.h>

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/list.h>
#include <linux/kmod.h>
#include <linux/sysctl.h>
#include <linux/pci.h>

#include <net/sock.h>
#include <linux/in.h>

#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <lustre_compat/rdma/ib_verbs.h>
#ifdef HAVE_OFED_FMR_POOL_API
#include <rdma/ib_fmr_pool.h>
#endif

#define DEBUG_SUBSYSTEM S_LND

#include <linux/libcfs/libcfs.h>
#include <linux/lnet/lib-lnet.h>
#include <linux/lnet/lnet_rdma.h>
#include "o2iblnd-idl.h"

/*
 * Attribute identifiers for the per-NI o2iblnd tunables.
 *
 * Values are assigned sequentially starting from UNSPEC = 0, so the order
 * of the enumerators defines their numeric IDs; inserting an entry in the
 * middle renumbers everything after it.
 * NOTE(review): presumably these are netlink attribute IDs shared with
 * userspace - confirm before reordering.
 */
enum kiblnd_ni_lnd_tunables_attr {
	LNET_NET_O2IBLND_TUNABLES_ATTR_UNSPEC = 0,

	LNET_NET_O2IBLND_TUNABLES_ATTR_HIW_PEER_CREDITS,
	LNET_NET_O2IBLND_TUNABLES_ATTR_CONCURRENT_SENDS,
	LNET_NET_O2IBLND_TUNABLES_ATTR_MAP_ON_DEMAND,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_POOL_SIZE,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_FLUSH_TRIGGER,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_CACHE,
	LNET_NET_O2IBLND_TUNABLES_ATTR_NTX,
	LNET_NET_O2IBLND_TUNABLES_ATTR_CONNS_PER_PEER,
	LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TIMEOUT,
	LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TOS,
	/* sentinel: one past the last real attribute; keep it last */
	__LNET_NET_O2IBLND_TUNABLES_ATTR_MAX_PLUS_ONE,
};

/* Highest valid attribute ID: the enum's trailing sentinel minus one */
#define LNET_NET_O2IBLND_TUNABLES_ATTR_MAX (__LNET_NET_O2IBLND_TUNABLES_ATTR_MAX_PLUS_ONE - 1)

/* log2 of # peer_ni lists; used as the bit count of the kib_peers hashtable */
#define IBLND_PEER_HASH_BITS		7
/* NOTE(review): presumably scheduler thread counts per CPT - confirm at users */
#define IBLND_N_SCHED			2
#define IBLND_N_SCHED_HIGH		4

/*
 * Module-wide tunables.
 *
 * Every member is a pointer to the storage of the tunable rather than the
 * value itself, so readers dereference at the point of use (for example
 * *kiblnd_tunables.kib_timeout in kiblnd_timeout()).
 */
struct kib_tunables {
	int              *kib_dev_failover;     /* HCA failover */
	unsigned int     *kib_service;          /* IB service number */
	int              *kib_cksum;            /* checksum struct kib_msg? */
	int              *kib_timeout;          /* comms timeout (seconds) */
	int              *kib_keepalive;        /* keepalive timeout (seconds) */
	char            **kib_default_ipif;     /* default IPoIB interface */
	int              *kib_retry_count;
	int              *kib_rnr_retry_count;
	int		 *kib_ib_mtu;		/* IB MTU */
	int              *kib_require_priv_port;/* accept only privileged ports */
	int              *kib_use_priv_port;    /* use privileged port for active connect */
	/* # threads on each CPT */
	int		 *kib_nscheds;
	int		 *kib_wrq_sge;		/* # sg elements per wrq */
	int		 *kib_use_fastreg_gaps; /* enable discontiguous fastreg fragment support */
};

/* The single module-wide tunables instance (defined outside this header) */
extern struct kib_tunables  kiblnd_tunables;
/* NOTE(review): presumably the defaults applied to per-NI tunables - confirm */
extern struct lnet_ioctl_config_o2iblnd_tunables kib_default_tunables;

#define IBLND_MSG_QUEUE_SIZE_V1      8          /* V1 only: # messages/RDMAs in flight */
#define IBLND_CREDIT_HIGHWATER_V1    7          /* V1 only: # outstanding credits at which to return them eagerly */

#define IBLND_CREDITS_DEFAULT        8          /* default # of peer_ni credits */
/* Max # of peer_ni credits: -1 converted to the type of kib_msg::ibm_credits,
 * i.e. the largest value that wire field can carry if it is unsigned
 */
#define IBLND_CREDITS_MAX          ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1)

#define IBLND_TIMEOUT_DEFAULT	50	/* Default o2iblnd timeout in seconds */

/* Every protocol version except V1 is out-of-band (OOB) capable.
 * 2 OOB messages suffice: 1 for a keepalive and 1 for returning credits.
 */
#define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
#define IBLND_OOB_MSGS(v)           (IBLND_OOB_CAPABLE(v) ? 2 : 0)

/* max size of queued messages (including header): 4 KiB */
#define IBLND_MSG_SIZE              (4<<10)
/* max # of fragments supported; + 1 for the unaligned case */
#define IBLND_MAX_RDMA_FRAGS        (LNET_MAX_IOV + 1)

/************************/
/* derived constants... */
/* Pools (shared by connections on each CPT) */
/* These pools can grow at runtime, so the sizes below need not be large */
#define IBLND_TX_POOL			256
#define IBLND_FMR_POOL			256
#define IBLND_FMR_POOL_FLUSH		192

/* RX messages (per connection)
 *
 * IBLND_RX_MSGS(c):      twice the connection's queue depth, plus the
 *                        out-of-band messages its protocol version allows.
 * IBLND_RX_MSG_BYTES(c): buffer bytes needed for that many messages.
 * IBLND_RX_MSG_PAGES(c): the same, rounded up to whole pages.
 *
 * @c is a pointer to a struct kib_conn. It is parenthesised at every
 * expansion so any pointer-valued expression may be passed, not just a
 * plain identifier. It is still evaluated more than once, so it must be
 * free of side effects.
 */
#define IBLND_RX_MSGS(c)	\
	(((c)->ibc_queue_depth) * 2 + IBLND_OOB_MSGS((c)->ibc_version))
#define IBLND_RX_MSG_BYTES(c)       (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
#define IBLND_RX_MSG_PAGES(c)	\
	((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)

/* WRs and CQEs (per connection) */
/* one receive work request per RX message buffer */
#define IBLND_RECV_WRS(c)            IBLND_RX_MSGS(c)

/* CQ size: every receive WR plus every send WR may complete.
 * NOTE(review): an older remark here read "2 = LNet msg + Transfer chain";
 * no literal 2 appears in this macro, so it presumably describes the
 * computation inside kiblnd_send_wrs() - confirm there.
 */
#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c))

/* forward declaration: struct kib_dev below holds a pointer to it */
struct kib_hca_dev;

/*
 * Device capability flags, stored as a bitmask in kib_dev::ibd_dev_caps.
 * The FMR flag only exists when the FMR pool API is available at build time.
 */
enum kib_dev_caps {
	IBLND_DEV_CAPS_FASTREG_ENABLED		= BIT(0),
	IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT	= BIT(1),
#ifdef HAVE_OFED_FMR_POOL_API
	IBLND_DEV_CAPS_FMR_ENABLED		= BIT(2),
#endif
};

/* Non-zero when @dev (a struct kib_dev *) has the FASTREG_ENABLED capability
 * bit set. The result is the masked bit, not a normalised 0/1.
 */
#define IS_FAST_REG_DEV(dev) \
	((dev)->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)


/*
 * State for one IPoIB interface: its address and name, failover
 * bookkeeping, the nets bound to it (ibd_nets) and the HCA currently
 * backing it (ibd_hdev).
 */
struct kib_dev {
	struct list_head	ibd_list;	/* chain on kib_devs */
	struct list_head	ibd_fail_list;	/* chain on kib_failed_devs */
	struct sockaddr_storage	ibd_addr;	/* Interface network address */
	/** IPoIB interface name */
	char			ibd_ifname[IFALIASZ];
	int			ibd_nnets;	/* # nets extant */

	time64_t		ibd_next_failover;
	/* # failover failures */
	int			ibd_failed_failover;
	/* failover in progress */
	unsigned int		ibd_failover;
	/* IPoIB interface is a bonding master */
	unsigned int		ibd_can_failover;
	/* nets on this device (linked through kib_net::ibn_list) */
	struct list_head	ibd_nets;
	struct kib_hca_dev	*ibd_hdev;
	/* bitmask of enum kib_dev_caps flags */
	enum kib_dev_caps	ibd_dev_caps;
};

/*
 * HCA-level state behind a struct kib_dev.
 *
 * Reference counted through ibh_ref: take references with
 * kiblnd_hdev_addref_locked() and drop them with kiblnd_hdev_decref(),
 * which calls kiblnd_hdev_destroy() when the count reaches zero.
 */
struct kib_hca_dev {
	struct rdma_cm_id   *ibh_cmid;          /* listener cmid */
	struct ib_device    *ibh_ibdev;         /* IB device */
	int                  ibh_page_shift;    /* page shift of current HCA */
	int                  ibh_page_size;     /* page size of current HCA */
	__u64                ibh_page_mask;     /* page mask of current HCA */
	__u64                ibh_mr_size;       /* size of MR */
	int		     ibh_max_qp_wr;     /* maximum work requests size */
	struct ib_pd        *ibh_pd;            /* PD */
	u8                   ibh_port;          /* port number */
	struct ib_event_handler
			     ibh_event_handler; /* IB event handler */
	int                  ibh_state;         /* device status, one of: */
#define IBLND_DEV_PORT_DOWN     0
#define IBLND_DEV_PORT_ACTIVE   1
#define IBLND_DEV_FATAL         2
	struct kib_dev           *ibh_dev;           /* owner */
	atomic_t             ibh_ref;           /* refcount */
};

/** # of seconds to keep a pool alive */
#define IBLND_POOL_DEADLINE     300
/** # of seconds to wait before retrying after a failed pool allocation */
#define IBLND_POOL_RETRY        1

/*
 * A counted array of page pointers. ibp_pages is a flexible array member,
 * so the structure must be allocated with room for ibp_npages entries
 * (see kiblnd_alloc_pages()).
 */
struct kib_pages {
	int                     ibp_npages;             /* # pages */
	struct page            *ibp_pages[];            /* page array */
};

struct kib_pool;
struct kib_poolset;

/*
 * Callbacks a pool-set owner installs in struct kib_poolset:
 * create/destroy a whole pool, and initialise/finalise a single node
 * (identified by its list_head) of a pool.
 */
typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
				     int inc, struct kib_pool **pp_po);
typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);

struct kib_net;

/* size of kib_poolset::ps_name, including the terminating NUL */
#define IBLND_POOL_NAME_LEN     32

/*
 * A set of pools belonging to one network on one CPT, together with the
 * callbacks used to create/destroy pools and to set up/tear down their
 * nodes.
 */
struct kib_poolset {
	/* serialize */
	spinlock_t		ps_lock;
	/* network it belongs to */
	struct kib_net		*ps_net;
	/* pool set name */
	char			ps_name[IBLND_POOL_NAME_LEN];
	/* list of pools */
	struct list_head	ps_pool_list;
	/* failed pool list */
	struct list_head	ps_failed_pool_list;
	/* time stamp for retry if failed to allocate */
	time64_t		ps_next_retry;
	/* is allocating new pool */
	int			ps_increasing;
	/* new pool size */
	int			ps_pool_size;
	/* CPT id */
	int			ps_cpt;

	/* create a new pool */
	kib_ps_pool_create_t	ps_pool_create;
	/* destroy a pool */
	kib_ps_pool_destroy_t	ps_pool_destroy;
	/* initialize new allocated node */
	kib_ps_node_init_t	ps_node_init;
	/* finalize node */
	kib_ps_node_fini_t	ps_node_fini;
};

/*
 * One pool of pre-allocated nodes inside a struct kib_poolset.
 * Type-specific pools embed this structure (see struct kib_tx_pool).
 */
struct kib_pool {
	/* chain on pool list */
	struct list_head	 po_list;
	/* pre-allocated node */
	struct list_head	 po_free_list;
	/* pool_set of this pool */
	struct kib_poolset	*po_owner;
	/* deadline of this pool */
	time64_t		 po_deadline;
	/* # of elements in use */
	int			 po_allocated;
	/* pool is created on failed HCA */
	int			 po_failed;
	/* # of pre-allocated elements */
	int			 po_size;
};

/* Pool-set specialised for TX descriptors: the generic set plus a cookie
 * counter for transmits.
 */
struct kib_tx_poolset {
	struct kib_poolset	tps_poolset;		/* pool-set */
	__u64			tps_next_tx_cookie;	/* cookie of TX */
};

/* Pool specialised for TX descriptors: the generic pool, the device it was
 * created on, and the descriptor and message-page storage.
 */
struct kib_tx_pool {
	struct kib_pool		 tpo_pool;	/* pool */
	struct kib_hca_dev	*tpo_hdev;	/* device for this pool */
	struct kib_tx		*tpo_tx_descs;	/* all the tx descriptors */
	struct kib_pages	*tpo_tx_pages;	/* premapped tx msg pages */
};

/*
 * Set of memory-registration pools (struct kib_fmr_pool) for one network on
 * one CPT. It parallels struct kib_poolset but does not embed it.
 */
struct kib_fmr_poolset {
	spinlock_t		 fps_lock;		/* serialize */
	struct kib_net		*fps_net;		/* IB network */
	struct list_head	 fps_pool_list;		/* FMR pool list */
	struct list_head	 fps_failed_pool_list;	/* failed FMR pool list */
	__u64			 fps_version;		/* validity stamp */
	int			 fps_cpt;		/* CPT id */
	int			 fps_pool_size;
	int			 fps_flush_trigger;
	int			 fps_cache;
	/* is allocating new pool */
	int			 fps_increasing;
	/* time stamp for retry if failed to allocate */
	time64_t		 fps_next_retry;
};

/*
 * Per-registration state for fast registration: the memory region plus the
 * two work requests (invalidate and register) that operate on it.
 */
struct kib_fast_reg_descriptor { /* For fast registration */
	struct list_head		 frd_list;
	struct ib_rdma_wr		 frd_inv_wr;
	struct ib_reg_wr		 frd_fastreg_wr;
	struct ib_mr			*frd_mr;
	bool				 frd_valid;
	bool				 frd_posted;
};

/*
 * One memory-registration pool.
 *
 * With HAVE_OFED_FMR_POOL_API the pool is either an IB FMR pool or a
 * fast-registration pool: the two variants share storage in an anonymous
 * union and fpo_is_fmr tells which one is live. Without that API only the
 * fast_reg member exists and there is no union or discriminator.
 */
struct kib_fmr_pool {
	struct list_head	 fpo_list;	/* chain on pool list */
	struct kib_hca_dev	*fpo_hdev;	/* device for this pool */
	struct kib_fmr_poolset	*fpo_owner;	/* owner of this pool */
#ifdef HAVE_OFED_FMR_POOL_API
	union {
		struct {
			struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
		} fmr;
#endif
		struct { /* For fast registration */
			struct list_head  fpo_pool_list;
			int		  fpo_pool_size;
		} fast_reg;
#ifdef HAVE_OFED_FMR_POOL_API
	};
	bool			fpo_is_fmr; /* True if FMR pools allocated */
#endif
	time64_t		fpo_deadline;	/* deadline of this pool */
	int			fpo_failed;	/* fmr pool is failed */
	int			fpo_map_count;	/* # of mapped FMR */
};

/*
 * Handle for one mapping obtained from a struct kib_fmr_pool; embedded in
 * struct kib_tx as tx_fmr. It is filled in by kiblnd_fmr_pool_map() and
 * released by kiblnd_fmr_pool_unmap().
 */
struct kib_fmr {
	struct kib_fmr_pool		*fmr_pool;	/* pool of FMR */
#ifdef HAVE_OFED_FMR_POOL_API
	struct ib_pool_fmr		*fmr_pfmr;	/* IB pool fmr */
#endif /* HAVE_OFED_FMR_POOL_API */
	struct kib_fast_reg_descriptor	*fmr_frd;	/* fast registration descriptor */
	u32				 fmr_key;
};

/*
 * Per-network state: ties an LNet interface (ibn_ni) to the IB device it
 * runs on (ibn_dev) and owns the network's TX and FMR pool-sets.
 */
struct kib_net {
	/* chain on struct kib_dev::ibd_nets */
	struct list_head	ibn_list;
	__u64			ibn_incarnation;/* my epoch */
	int			ibn_init;	/* initialisation state */
	int			ibn_shutdown;	/* shutting down? */

	atomic_t		ibn_npeers;	/* # peers extant */
	atomic_t		ibn_nconns;	/* # connections extant */

	/* NOTE(review): double pointers - presumably one set per CPT; confirm */
	struct kib_tx_poolset	**ibn_tx_ps;	/* tx pool-set */
	struct kib_fmr_poolset	**ibn_fmr_ps;	/* fmr pool-set */

	struct kib_dev		*ibn_dev;	/* underlying IB device */
	struct lnet_ni		*ibn_ni;	/* LNet interface */
};

/*
 * Thread IDs pack a CPT number and a per-CPT thread index into one integer:
 * the CPT occupies the bits from KIB_THREAD_SHIFT upwards, the thread index
 * the bits below it.
 */
#define KIB_THREAD_SHIFT		16
/* build an ID from its two parts */
#define KIB_THREAD_ID(cpt, tid)		(((cpt) << KIB_THREAD_SHIFT) | (tid))
/* extract the CPT number */
#define KIB_THREAD_CPT(id)		((id) >> KIB_THREAD_SHIFT)
/* extract the per-CPT thread index */
#define KIB_THREAD_TID(id)		((id) & ((1UL << KIB_THREAD_SHIFT) - 1))

/*
 * Scheduler state for one CPT: the queue of connections needing attention,
 * the wait queue the scheduler threads sleep on, and thread accounting.
 */
struct kib_sched_info {
	/* serialise */
	spinlock_t		ibs_lock;
	/* schedulers sleep here */
	wait_queue_head_t	ibs_waitq;
	/* conns to check for rx completions */
	struct list_head	ibs_conns;
	/* number of scheduler threads */
	int			ibs_nthreads;
	/* max allowed scheduler threads */
	int			ibs_nthreads_max;
	int			ibs_cpt;	/* CPT id */
};

/*
 * Module-global state; the single instance is kiblnd_data.
 * It holds the device lists, the peer hash table, the connection daemon's
 * work queues and the per-CPT scheduler data.
 */
struct kib_data {
	int			kib_init;	/* initialisation state (IBLND_INIT_*) */
	int			kib_shutdown;	/* shut down? */
	struct list_head	kib_devs;	/* IB devices extant */
	/* list head of failed devices */
	struct list_head	kib_failed_devs;
	/* NOTE(review): presumably where the failover thread sleeps - confirm */
	wait_queue_head_t	kib_failover_waitq;
	atomic_t		kib_nthreads;	/* # live threads */
	/* stabilize net/dev/peer_ni/conn ops */
	rwlock_t		kib_global_lock;
	/* hash table of all my known peers */
	DECLARE_HASHTABLE(kib_peers, IBLND_PEER_HASH_BITS);
	/* the connd task (serialisation assertions) */
	void			*kib_connd;
	/* connections to setup/teardown */
	struct list_head	kib_connd_conns;
	/* connections with zero refcount (queued by kiblnd_conn_decref()) */
	struct list_head	kib_connd_zombies;
	/* connections to reconnect */
	struct list_head	kib_reconn_list;
	/* peers wait for reconnection */
	struct list_head	kib_reconn_wait;
	/*
	 * The second that peers are pulled out from \a kib_reconn_wait
	 * for reconnection.
	 */
	time64_t		kib_reconn_sec;
	/* connection daemon sleeps here */
	wait_queue_head_t	kib_connd_waitq;
	spinlock_t		kib_connd_lock;	/* serialise */
	struct ib_qp_attr	kib_error_qpa;	/* QP->ERROR */
	/* percpt data for schedulers */
	struct kib_sched_info	**kib_scheds;
};

/* Initialisation states (see the kib_init / ibn_init fields) */
#define IBLND_INIT_NOTHING	0
#define IBLND_INIT_DATA		1
#define IBLND_INIT_ALL		2

/*
 * Receive descriptor: one per receive message buffer of a connection
 * (see kib_conn::ibc_rxs).
 */
struct kib_rx {					/* receive message */
	/* queue for attention */
	struct list_head	 rx_list;
	/* owning conn */
	struct kib_conn		*rx_conn;
	/* # bytes received (-1 while posted) */
	int			 rx_nob;
	/* message buffer (host vaddr) */
	struct kib_msg		*rx_msg;
	/* message buffer (I/O addr) */
	__u64			 rx_msgaddr;
	/* for dma_unmap_single() */
	DEFINE_DMA_UNMAP_ADDR(rx_msgunmap);
};

/* Credit disposition values (presumably the 'credit' argument of
 * kiblnd_post_rx() - confirm at callers)
 */
#define IBLND_POSTRX_DONT_POST	  0 /* don't post */
#define IBLND_POSTRX_NO_CREDIT	  1 /* post: no credits */
#define IBLND_POSTRX_PEER_CREDIT  2 /* post: give peer_ni back 1 credit */
#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */

/*
 * Transmit descriptor. It comes from a struct kib_tx_pool (tx_pool) and
 * carries the message buffer, the work requests and scatter/gather entries
 * used to send it, and the RDMA/mapping state for any bulk data.
 */
struct kib_tx {					/* transmit message */
	/* queue on idle_txs ibc_tx_queue etc. */
	struct list_head	tx_list;
	/* pool I'm from */
	struct kib_tx_pool	*tx_pool;
	/* owning conn */
	struct kib_conn		*tx_conn;
	/* # tx callbacks outstanding */
	short			tx_sending;
	/* queued for sending */
	unsigned long		tx_queued:1,
	/* waiting for peer_ni */
				tx_waiting:1,
	/* force RDMA; also selects the lnet_rdma_* path in
	 * kiblnd_dma_map_sg() / kiblnd_dma_unmap_sg()
	 */
				tx_gpu:1;
	/* LNET completion status */
	int			tx_status;
	/* health status of the transmit */
	enum lnet_msg_hstatus	tx_hstatus;
	/* completion deadline */
	ktime_t			tx_deadline;
	/* completion cookie */
	__u64			tx_cookie;
	/* lnet msgs to finalize on completion */
	struct lnet_msg		*tx_lntmsg[2];
	/* message buffer (host vaddr) */
	struct kib_msg		*tx_msg;
	/* message buffer (I/O addr) */
	__u64			tx_msgaddr;
	/* for dma_unmap_single() */
	DEFINE_DMA_UNMAP_ADDR(tx_msgunmap);
	/* # send work items */
	int			tx_nwrq;
	/* # used scatter/gather elements */
	int			tx_nsge;
	/* send work items... */
	struct ib_rdma_wr	*tx_wrq;
	/* ...and their memory */
	struct ib_sge		*tx_sge;
	/* rdma descriptor */
	struct kib_rdma_desc	*tx_rd;
	/* # entries in... */
	int			tx_nfrags;
	/* dma_map_sg descriptor */
	struct scatterlist	*tx_frags;
	/* rdma phys page addrs */
	__u64			*tx_pages;
	/* gaps in fragments */
	bool			tx_gaps;
	/* FMR */
	struct kib_fmr		tx_fmr;
	/* dma direction (used as enum dma_data_direction when mapping) */
	int			tx_dmadir;
};

/* State needed only while a connection is being established; referenced
 * from kib_conn::ibc_connvars.
 */
struct kib_connvars {
	/* connection-in-progress variables */
	struct kib_msg		cv_msg;
};

/*
 * One connection to a peer.
 *
 * Reference counted through ibc_refcount (kiblnd_conn_addref() /
 * kiblnd_conn_decref()); when the count drops to zero the connection is
 * queued on kiblnd_data.kib_connd_zombies via ibc_list.
 */
struct kib_conn {
	/* scheduler information */
	struct kib_sched_info	*ibc_sched;
	/* owning peer_ni */
	struct kib_peer_ni	*ibc_peer;
	/* HCA bound on */
	struct kib_hca_dev	*ibc_hdev;
	/* stash on peer_ni's conn list */
	struct list_head	ibc_list;
	/* schedule for attention */
	struct list_head	ibc_sched_list;
	/* version of connection */
	__u16			ibc_version;
	/* reconnect later */
	__u16			ibc_reconnect:1;
	/* which instance of the peer */
	__u64			ibc_incarnation;
	/* # users */
	atomic_t		ibc_refcount;
	/* what's happening (one of IBLND_CONN_*) */
	int			ibc_state;
	/* # uncompleted sends */
	int			ibc_nsends_posted;
	/* # uncompleted NOOPs */
	int			ibc_noops_posted;
	/* # credits I have */
	int			ibc_credits;
	/* # credits to return */
	int			ibc_outstanding_credits;
	/* # ACK/DONE msg credits */
	int			ibc_reserved_credits;
	/* set on comms error */
	int			ibc_comms_error;
	/* connections queue depth */
	__u16			ibc_queue_depth;
	/* connections max frags */
	__u16			ibc_max_frags;
	/* receive buffers owned */
	unsigned int		ibc_nrx:16;
	/* scheduled for attention */
	unsigned int		ibc_scheduled:1;
	/* CQ callback fired */
	unsigned int		ibc_ready:1;
	/* time of last send */
	ktime_t			ibc_last_send;
	/** link chain for kiblnd_check_conns only */
	struct list_head	ibc_connd_list;
	/** rxs completed before ESTABLISHED */
	struct list_head	ibc_early_rxs;
	/** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
	struct list_head	ibc_tx_noops;
	/* sends that need a credit */
	struct list_head	ibc_tx_queue;
	/* sends that don't need a credit */
	struct list_head	ibc_tx_queue_nocred;
	/* sends that need to reserve an ACK/DONE msg */
	struct list_head	ibc_tx_queue_rsrvd;
	/* active tx awaiting completion */
	struct list_head	ibc_active_txs;
	/* zombie tx awaiting done */
	struct list_head	ibc_zombie_txs;
	/* serialise */
	spinlock_t		ibc_lock;
	/* the rx descs */
	struct kib_rx		*ibc_rxs;
	/* premapped rx msg pages */
	struct kib_pages	*ibc_rx_pages;

	/* CM id */
	struct rdma_cm_id	*ibc_cmid;
	/* completion queue */
	struct ib_cq		*ibc_cq;

	/* in-progress connection state */
	struct kib_connvars	*ibc_connvars;
};

/* Connection states (kib_conn::ibc_state). The numeric order matters:
 * kiblnd_need_noop() asserts ibc_state >= IBLND_CONN_ESTABLISHED.
 */
#define IBLND_CONN_INIT			0 /* being initialised */
#define IBLND_CONN_ACTIVE_CONNECT	1 /* active sending req */
#define IBLND_CONN_PASSIVE_WAIT		2 /* passive waiting for rtu */
#define IBLND_CONN_ESTABLISHED		3 /* connection established */
#define IBLND_CONN_CLOSING		4 /* being closed */
#define IBLND_CONN_DISCONNECTED		5 /* disconnected */

/*
 * One remote peer NI.
 *
 * Lives in the peer hash table through ibp_list while active (see
 * kiblnd_peer_active()). Reference counted through ibp_kref; the final
 * kiblnd_peer_decref() invokes kiblnd_destroy_peer().
 */
struct kib_peer_ni {
	/* on peer_ni hash chain */
	struct hlist_node	ibp_list;
	/* who's on the other end(s) */
	struct lnet_nid		ibp_nid;
	/* LNet interface */
	struct lnet_ni		*ibp_ni;
	/* all active connections */
	struct list_head	ibp_conns;
	/* connections with an inflight active connect request */
	struct list_head	ibp_connreqs;
	/* next connection to send on for round robin */
	struct kib_conn		*ibp_next_conn;
	/* msgs waiting for a conn */
	struct list_head	ibp_tx_queue;
	/* incarnation of peer_ni */
	__u64			ibp_incarnation;
	/* when (in seconds) I was last alive */
	time64_t		ibp_last_alive;
	/* # users */
	struct kref		ibp_kref;
	/* version of peer_ni */
	__u16			ibp_version;
	/* current passive connection attempts */
	unsigned short		ibp_accepting;
	/* current active connection attempts */
	unsigned short		ibp_connecting;
	/* reconnect this peer_ni later */
	unsigned char		ibp_reconnecting;
	/* counter of how many times we triggered a conn race */
	unsigned char		ibp_races;
	/* # consecutive reconnection attempts to this peer */
	unsigned int		ibp_reconnected;
	/* errno on closing this peer_ni */
	int			ibp_error;
	/* max map_on_demand */
	__u16			ibp_max_frags;
	/* max_peer_credits */
	__u16			ibp_queue_depth;
	/* reduced value which allows conn to be created if max fails */
	__u16			ibp_queue_depth_mod;
	/* Number of connections allocated. */
	atomic_t		ibp_nconns;
};

/* the single module-global state instance */
extern struct kib_data kiblnd_data;

/* called by kiblnd_hdev_decref() when the last reference is dropped */
extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);

int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);

/* lower bound, in milliseconds, used by kiblnd_connreq_timeout_ms() */
#define RDMA_RESOLVE_TIMEOUT	(5 * MSEC_PER_SEC)	/* 5 seconds */

/*
 * Effective comms timeout: the module's kib_timeout tunable when it is
 * non-zero, otherwise whatever lnet_get_lnd_timeout() reports.
 */
static inline int kiblnd_timeout(void)
{
	int module_timeout = *kiblnd_tunables.kib_timeout;

	if (module_timeout)
		return module_timeout;

	return lnet_get_lnd_timeout();
}

/*
 * Connection-request timeout in milliseconds: a quarter of the LND timeout
 * (kiblnd_timeout() is in seconds, hence the MSEC_PER_SEC scaling), but
 * never less than RDMA_RESOLVE_TIMEOUT.
 */
static inline int kiblnd_connreq_timeout_ms(void)
{
	return max(RDMA_RESOLVE_TIMEOUT, kiblnd_timeout() * MSEC_PER_SEC / 4);
}

/*
 * Number of concurrent sends to use on @ni for protocol @version.
 *
 * The value comes from the NI's o2iblnd tunables. For IBLND_MSG_VERSION_1
 * it is clamped to [IBLND_MSG_QUEUE_SIZE_V1 / 2, IBLND_MSG_QUEUE_SIZE_V1 * 2];
 * other versions get the tunable unchanged.
 */
static inline int
kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
{
	const int v1_floor = IBLND_MSG_QUEUE_SIZE_V1 / 2;
	const int v1_ceiling = IBLND_MSG_QUEUE_SIZE_V1 * 2;
	int sends = ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib.lnd_concurrent_sends;

	if (version != IBLND_MSG_VERSION_1)
		return sends;

	if (sends > v1_ceiling)
		return v1_ceiling;

	if (sends < v1_floor)
		return v1_floor;

	return sends;
}

/*
 * Take an additional reference on @hdev.
 * The caller must already hold a reference: the count is asserted to be
 * positive before it is incremented.
 * NOTE(review): the "_locked" suffix suggests a lock must be held by the
 * caller; which one is not visible here - confirm at call sites.
 */
static inline void
kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev)
{
	LASSERT(atomic_read(&hdev->ibh_ref) > 0);
	atomic_inc(&hdev->ibh_ref);
}

/*
 * Drop one reference on @hdev; the holder of the last reference destroys
 * the device through kiblnd_hdev_destroy().
 */
static inline void
kiblnd_hdev_decref(struct kib_hca_dev *hdev)
{
	LASSERT(atomic_read(&hdev->ibh_ref) > 0);

	/* other references remain: nothing more to do */
	if (!atomic_dec_and_test(&hdev->ibh_ref))
		return;

	kiblnd_hdev_destroy(hdev);
}

/*
 * Decide whether @dev may be scheduled for failover.
 *
 * Returns 0 when the device is already on a failed list or the
 * kib_dev_failover tunable is 0, 1 when the tunable is greater than 1
 * (forced), and otherwise the device's own ibd_can_failover flag.
 */
static inline int
kiblnd_dev_can_failover(struct kib_dev *dev)
{
	int mode;

	/* already scheduled */
	if (!list_empty(&dev->ibd_fail_list))
		return 0;

	mode = *kiblnd_tunables.kib_dev_failover;

	/* disabled */
	if (mode == 0)
		return 0;

	/* force failover */
	if (mode > 1)
		return 1;

	return dev->ibd_can_failover;
}

/*
 * Take a reference on @conn.
 * When O2IBLND_CONN_REFCOUNT_DEBUG is defined the pre-increment count is
 * logged at D_NET.
 */
static inline void kiblnd_conn_addref(struct kib_conn *conn)
{
#ifdef O2IBLND_CONN_REFCOUNT_DEBUG
	CDEBUG(D_NET, "conn[%p] (%d)++\n",
	       (conn), atomic_read(&(conn)->ibc_refcount));
#endif
	atomic_inc(&(conn)->ibc_refcount);
}

/*
 * Drop a reference on @conn.
 *
 * The connection is not freed here. When the last reference goes away it is
 * appended to kiblnd_data.kib_connd_zombies under kib_connd_lock (IRQ-safe)
 * and kib_connd_waitq is woken.
 * When O2IBLND_CONN_REFCOUNT_DEBUG is defined the pre-decrement count is
 * logged at D_NET.
 */
static inline void kiblnd_conn_decref(struct kib_conn *conn)
{
	unsigned long flags;

#ifdef O2IBLND_CONN_REFCOUNT_DEBUG
	CDEBUG(D_NET, "conn[%p] (%d)--\n",
	       conn, atomic_read(&conn->ibc_refcount));
#endif
	LASSERT(atomic_read(&conn->ibc_refcount) > 0);

	/* still referenced elsewhere */
	if (!atomic_dec_and_test(&conn->ibc_refcount))
		return;

	spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
	list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_zombies);
	wake_up(&kiblnd_data.kib_connd_waitq);
	spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
}

/* kref release callback for struct kib_peer_ni (see kiblnd_peer_decref()) */
void kiblnd_destroy_peer(struct kref *kref);

/*
 * Take a reference on @peer_ni.
 * The count logged at D_NET is the value read before the increment.
 */
static inline void kiblnd_peer_addref(struct kib_peer_ni *peer_ni)
{
	CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n",
	       peer_ni, libcfs_nidstr(&peer_ni->ibp_nid),
	       kref_read(&peer_ni->ibp_kref));
	kref_get(&(peer_ni)->ibp_kref);
}

/*
 * Drop a reference on @peer_ni; kiblnd_destroy_peer() runs when the last
 * one goes. The count logged at D_NET is the value read before the
 * decrement, so @peer_ni must not be touched after this returns unless the
 * caller holds another reference.
 */
static inline void kiblnd_peer_decref(struct kib_peer_ni *peer_ni)
{
	CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n",
	       peer_ni, libcfs_nidstr(&peer_ni->ibp_nid),
	       kref_read(&peer_ni->ibp_kref));
	kref_put(&peer_ni->ibp_kref, kiblnd_destroy_peer);
}

static inline bool
kiblnd_peer_connecting(struct kib_peer_ni *peer_ni)
{
	return peer_ni->ibp_connecting != 0 ||
	       peer_ni->ibp_reconnecting != 0 ||
	       peer_ni->ibp_accepting != 0;
}

static inline bool
kiblnd_peer_idle(struct kib_peer_ni *peer_ni)
{
	return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns);
}

/*
 * Non-zero when @peer_ni is currently linked into the peer hash table
 * (its ibp_list node is hashed).
 */
static inline int
kiblnd_peer_active(struct kib_peer_ni *peer_ni)
{
	/* Am I in the peer_ni hash table? */
	return !hlist_unhashed(&peer_ni->ibp_list);
}

static inline struct kib_conn *
kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni)
{
	struct list_head *next;

	LASSERT(!list_empty(&peer_ni->ibp_conns));

	/* Advance to next connection, be sure to skip the head node */
	if (!peer_ni->ibp_next_conn ||
	    peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns)
		next = peer_ni->ibp_conns.next;
	else
		next = peer_ni->ibp_next_conn->ibc_list.next;
	peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list);

	return peer_ni->ibp_next_conn;
}

static inline int
kiblnd_send_keepalive(struct kib_conn *conn)
{
	s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC;

	return (*kiblnd_tunables.kib_keepalive > 0) &&
		ktime_after(ktime_get(),
			    ktime_add_ns(conn->ibc_last_send, keepalive_ns));
}

/*
 * Credit high-water mark for @conn: the number of outstanding credits at
 * which they should be returned eagerly.
 *
 * Version 1 connections use the fixed IBLND_CREDIT_HIGHWATER_V1. For other
 * versions the configured mark (@t->lnd_peercredits_hiw) is scaled by the
 * ratio of the connection's queue depth to @nt->lct_peer_tx_credits, so a
 * queue depth negotiated down lowers the mark proportionally.
 */
static inline int
kiblnd_credits_highwater(struct lnet_ioctl_config_o2iblnd_tunables *t,
			 struct lnet_ioctl_config_lnd_cmn_tunables *nt,
			 struct kib_conn *conn)
{
	if (conn->ibc_version == IBLND_MSG_VERSION_1)
		return IBLND_CREDIT_HIGHWATER_V1;

	return (conn->ibc_queue_depth * t->lnd_peercredits_hiw) /
	       nt->lct_peer_tx_credits;
}

/*
 * Decide whether a dedicated NOOP message must be sent on @conn.
 *
 * A NOOP is only considered when there is something to convey: either the
 * outstanding credits have reached the high-water mark or a keepalive is
 * due. Even then it is skipped whenever another queued message can carry
 * the information instead. The connection must be at least ESTABLISHED
 * (asserted).
 *
 * Returns non-zero when a NOOP is needed, 0 otherwise.
 */
static inline int
kiblnd_need_noop(struct kib_conn *conn)
{
	struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
	struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;

	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
	tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
	net_tunables = &ni->ni_net->net_tunables;

	/* below the high-water mark and no keepalive due: nothing to say */
	if (conn->ibc_outstanding_credits <
	    kiblnd_credits_highwater(tunables, net_tunables, conn) &&
	    !kiblnd_send_keepalive(conn))
		return 0; /* No need to send NOOP */

	/* every version except V1: sends on the no-credit queue can carry it */
	if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
		if (!list_empty(&conn->ibc_tx_queue_nocred))
			return 0; /* NOOP can be piggybacked */

		/* No tx to piggyback NOOP onto or no credit to send a tx */
		return (list_empty(&conn->ibc_tx_queue) ||
			conn->ibc_credits == 0);
	}

	/* V1 from here on: a NOOP itself consumes a credit */
	if (!list_empty(&conn->ibc_tx_noops) ||		/* NOOP is queued */
	    !list_empty(&conn->ibc_tx_queue_nocred) ||  /* piggyback NOOP */
	    conn->ibc_credits == 0)			/* no credit */
		return 0;

	if (conn->ibc_credits == 1 &&		/* last credit reserved for */
	    conn->ibc_outstanding_credits == 0) /* giving back credits */
		return 0;

	/* No tx to piggyback NOOP onto or no credit to send a tx */
	return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
}

/*
 * Apply kiblnd_data.kib_error_qpa (annotated "QP->ERROR" in struct kib_data)
 * to @conn's queue pair, changing only the QP state.
 * The return value of ib_modify_qp() is deliberately not propagated; this
 * function cannot report failure.
 */
static inline void
kiblnd_abort_receives(struct kib_conn *conn)
{
	ib_modify_qp(conn->ibc_cmid->qp,
		     &kiblnd_data.kib_error_qpa, IB_QP_STATE);
}

/*
 * Name of the tx queue @q, which must be one of @conn's ibc_tx_queue,
 * ibc_tx_queue_rsrvd, ibc_tx_queue_nocred or ibc_active_txs lists.
 * Any other pointer is a bug and triggers LBUG().
 */
static inline const char *
kiblnd_queue2str(struct kib_conn *conn, struct list_head *q)
{
	const char *name = NULL;

	if (q == &conn->ibc_tx_queue)
		name = "tx_queue";
	else if (q == &conn->ibc_tx_queue_rsrvd)
		name = "tx_queue_rsrvd";
	else if (q == &conn->ibc_tx_queue_nocred)
		name = "tx_queue_nocred";
	else if (q == &conn->ibc_active_txs)
		name = "active_txs";
	else
		LBUG();

	return name;
}

/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
 * lowest bits of the work request id to stash the work item type.
 * IBLND_WID_MASK covers the low three bits, so descriptors encoded with
 * kiblnd_ptr2wreqid() must be at least 8-byte aligned and type values must
 * fit in 0..7.
 */

#define IBLND_WID_INVAL	0
#define IBLND_WID_TX	1
#define IBLND_WID_RX	2
#define IBLND_WID_RDMA	3
#define IBLND_WID_MR	4
#define IBLND_WID_MASK	7UL

/*
 * Encode a descriptor pointer and a work item type (IBLND_WID_*) into a
 * 64-bit work request id: the type is stored in the low bits covered by
 * IBLND_WID_MASK.
 *
 * Asserts that @ptr has those bits clear and that @type fits in them.
 */
static inline __u64
kiblnd_ptr2wreqid (void *ptr, int type)
{
	const unsigned long addr = (unsigned long)ptr;

	/* the pointer must leave the tag bits free... */
	LASSERT(!(addr & IBLND_WID_MASK));
	/* ...and the tag must not spill outside them */
	LASSERT(!(type & ~IBLND_WID_MASK));

	return (__u64)(addr | type);
}

/* Recover the descriptor pointer from a work request id by clearing the
 * type bits (inverse of kiblnd_ptr2wreqid()).
 */
static inline void *
kiblnd_wreqid2ptr (__u64 wreqid)
{
	return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
}

/* Extract the work item type (IBLND_WID_*) stored in the low bits of a
 * work request id.
 */
static inline int
kiblnd_wreqid2type (__u64 wreqid)
{
	return (wreqid & IBLND_WID_MASK);
}

/*
 * Set @conn's state and follow the store with a full memory barrier, so
 * the new state is ordered before any memory accesses that come after it.
 */
static inline void
kiblnd_set_conn_state(struct kib_conn *conn, int state)
{
	conn->ibc_state = state;
	smp_mb();
}

/*
 * Initialise the type and length of @msg.
 * The length is the size of the fixed header (everything before the ibm_u
 * payload union) plus @body_nob bytes of payload. No other field is set.
 */
static inline void
kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob)
{
	msg->ibm_type = type;
	msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob;
}

/* Total number of bytes described by @rd: the sum of rf_nob over all of
 * its rd_nfrags fragments.
 */
static inline int
kiblnd_rd_size(struct kib_rdma_desc *rd)
{
	int total = 0;
	int frag;

	for (frag = 0; frag < rd->rd_nfrags; frag++)
		total += rd->rd_frags[frag].rf_nob;

	return total;
}

/* Address of fragment @index of @rd. @index is not range-checked. */
static inline __u64
kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_frags[index].rf_addr;
}

/* Size in bytes of fragment @index of @rd. @index is not range-checked. */
static inline int
kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_frags[index].rf_nob;
}

/* Key for fragment @index of @rd. The descriptor has a single key shared by
 * all fragments, so @index is accepted for symmetry with the other
 * accessors but ignored.
 */
static inline __u32
kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_key;
}

/*
 * Consume @nob bytes from fragment @index of @rd.
 *
 * If the fragment holds more than @nob bytes it is advanced in place
 * (address moved forward, length reduced) and @index is returned unchanged.
 * Otherwise the fragment is treated as fully used: it is left unmodified
 * and the next index is returned.
 */
static inline int
kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob)
{
	/* fragment exhausted: move on to the next one */
	if (nob >= rd->rd_frags[index].rf_nob)
		return index + 1;

	rd->rd_frags[index].rf_addr += nob;
	rd->rd_frags[index].rf_nob  -= nob;

	return index;
}

static inline int
kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n)
{
	return msgtype == IBLND_MSG_GET_REQ ?
		offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
		offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
}

/*
 * Thin wrapper around ib_dma_mapping_error() for @dma_addr on @dev.
 * NOTE(review): declared to return __u64; callers presumably only test the
 * result for non-zero - confirm against ib_dma_mapping_error()'s return
 * type before relying on the value itself.
 */
static inline __u64
kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
{
	return ib_dma_mapping_error(dev, dma_addr);
}

/*
 * Thin wrapper around ib_dma_map_single(): map @size bytes at @msg for DMA
 * on @dev in @direction and return the resulting address as a __u64.
 * Failure is not checked here; see kiblnd_dma_mapping_error().
 */
static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
					  void *msg, size_t size,
					  enum dma_data_direction direction)
{
	return ib_dma_map_single(dev, msg, size, direction);
}

/* Thin wrapper around ib_dma_unmap_single(): undo a mapping made with
 * kiblnd_dma_map_single(), using the same @size and @direction.
 */
static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
					   __u64 addr, size_t size,
					   enum dma_data_direction direction)
{
	ib_dma_unmap_single(dev, addr, size, direction);
}

/* Unmap-address bookkeeping hooks: SET is a no-op statement and
 * KIBLND_UNMAP_ADDR simply yields the address @a passed in; @p and @m are
 * ignored by both.
 */
#define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
#define KIBLND_UNMAP_ADDR(p, m, a)	(a)

static inline
int kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
	struct scatterlist *sg = tx->tx_frags;
	int nents = tx->tx_nfrags;
	enum dma_data_direction direction = tx->tx_dmadir;

	if (tx->tx_gpu)
		return lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
					      sg, nents, direction);

	return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
}

static inline
void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
	struct scatterlist *sg = tx->tx_frags;
	int nents = tx->tx_nfrags;
	enum dma_data_direction direction = tx->tx_dmadir;

	if (tx->tx_gpu)
		lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
					  sg, nents, direction);
	else
		ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
}

/* Compatibility: when the RDMA stack does not provide ib_sg_dma_address() /
 * ib_sg_dma_len(), map them onto the generic scatterlist accessors. The
 * device argument is ignored in that case.
 */
#ifndef HAVE_OFED_IB_SG_DMA_ADDRESS
#include <linux/scatterlist.h>
#define ib_sg_dma_address(dev, sg)	sg_dma_address(sg)
#define ib_sg_dma_len(dev, sg)		sg_dma_len(sg)
#endif

/* DMA address of scatterlist entry @sg, via ib_sg_dma_address() */
static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
					  struct scatterlist *sg)
{
	return ib_sg_dma_address(dev, sg);
}

/* DMA length of scatterlist entry @sg, via ib_sg_dma_len() */
static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
					     struct scatterlist *sg)
{
	return ib_sg_dma_len(dev, sg);
}

/* Compatibility: fall back to rdma_connect() when the RDMA stack has no
 * rdma_connect_locked().
 */
#ifndef HAVE_OFED_RDMA_CONNECT_LOCKED
#define rdma_connect_locked(cmid, cpp)	rdma_connect(cmid, cpp)
#endif

/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer. That is not strictly
 * right because OFED 1.2 defines it as const; to write through it a
 * (void *) cast must be added to overcome the "const".
 */

/* private data, and its length, carried by the CM event @e */
#define KIBLND_CONN_PARAM(e)		((e)->param.conn.private_data)
#define KIBLND_CONN_PARAM_LEN(e)	((e)->param.conn.private_data_len)

/* Functions implemented outside this header (definitions not visible here) */

/* tx / rx descriptor and pool helpers */
void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs);
void kiblnd_map_rx_descs(struct kib_conn *conn);
void kiblnd_unmap_rx_descs(struct kib_conn *conn);
void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps);

/* memory registration: map fills in @fmr, unmap releases it */
int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
			struct kib_rdma_desc *rd, u32 nob, u64 iov,
			struct kib_fmr *fmr);
void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status);

/* tunables */
int kiblnd_tunables_setup(struct lnet_lnd_tunables *lnd_tunables,
			  struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables);
int kiblnd_tunables_init(void);

/* thread entry points (the int (*)(void *) signature kthread_run() takes) */
int kiblnd_connd(void *arg);
int kiblnd_scheduler(void *arg);
/*
 * Start a kernel thread running @fn(@data), named by the printf-style
 * @namefmt and its arguments.
 *
 * On success kiblnd_data.kib_nthreads is incremented and the expression
 * evaluates to 0; on failure the counter is untouched and the expression
 * evaluates to the negative errno from kthread_run().
 */
#define kiblnd_thread_start(fn, data, namefmt, arg...)			\
	({								\
		struct task_struct *__task = kthread_run(fn, data,	\
							 namefmt, ##arg); \
		if (!IS_ERR(__task))					\
			atomic_inc(&kiblnd_data.kib_nthreads);		\
		PTR_ERR_OR_ZERO(__task);				\
	})

/* Functions implemented outside this header (definitions not visible here) */

/* thread entry point */
int kiblnd_failover_thread(void *arg);

/* allocates a struct kib_pages and returns it through @pp */
int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages);

int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event);
int kiblnd_translate_mtu(int value);

/* device and peer management */
int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns);
int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp,
		       struct lnet_nid *nid);
bool kiblnd_reconnect_peer(struct kib_peer_ni *peer);
void kiblnd_destroy_dev(struct kib_dev *dev);
void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni);
struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni,
					    struct lnet_nid *nid);
int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni,
				    int version, u64 incarnation);
int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why);

/* connection lifecycle */
struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni,
				    struct rdma_cm_id *cmid,
				    int state, int version);
void kiblnd_destroy_conn(struct kib_conn *conn);
void kiblnd_close_conn(struct kib_conn *conn, int error);
void kiblnd_close_conn_locked(struct kib_conn *conn, int error);

/* transmit path */
void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx,
		      struct lnet_nid *nid);
void kiblnd_txlist_done(struct list_head *txlist, int status,
			enum lnet_msg_hstatus hstatus);

/* QP / CQ event and completion callbacks */
void kiblnd_qp_event(struct ib_event *event, void *arg);
void kiblnd_cq_event(struct ib_event *event, void *arg);
void kiblnd_cq_completion(struct ib_cq *cq, void *arg);

/* wire message packing and receive posting */
void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
		     int credits, struct lnet_nid *dstnid, u64 dststamp);
int kiblnd_unpack_msg(struct kib_msg *msg, int nob);
int kiblnd_post_rx(struct kib_rx *rx, int credit);

/* LND send / receive entry points */
int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
		int delayed, struct iov_iter *to, unsigned int rlen);
unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx);

/*
 * Log a one-line D_NET snapshot of a connection: state, posted NOOP/send
 * counts, credit counters and the comms-error flag. If the connection has
 * a CM id, that id, its QP number and the QP's device name are logged too.
 * A NULL connection logs nothing.
 *
 * @conn is evaluated exactly once into a local, so any pointer-valued
 * expression (including one with side effects or low-precedence operators)
 * can be passed safely.
 */
#define kiblnd_dump_conn_dbg(conn)			\
({							\
	struct kib_conn *__kib_dbg_conn = (conn);	\
							\
	if (__kib_dbg_conn && __kib_dbg_conn->ibc_cmid)	\
		CDEBUG(D_NET, "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id %p qp_num 0x%x device_name %s\n",	\
			__kib_dbg_conn,				\
			__kib_dbg_conn->ibc_state,		\
			__kib_dbg_conn->ibc_noops_posted,	\
			__kib_dbg_conn->ibc_nsends_posted,	\
			__kib_dbg_conn->ibc_credits,		\
			__kib_dbg_conn->ibc_outstanding_credits,	\
			__kib_dbg_conn->ibc_reserved_credits,	\
			__kib_dbg_conn->ibc_comms_error,	\
			__kib_dbg_conn->ibc_cmid,		\
			__kib_dbg_conn->ibc_cmid->qp ? __kib_dbg_conn->ibc_cmid->qp->qp_num : 0,	\
			__kib_dbg_conn->ibc_cmid->qp ? (__kib_dbg_conn->ibc_cmid->qp->device ? dev_name(&__kib_dbg_conn->ibc_cmid->qp->device->dev) : "NULL") : "NULL");	\
	else if (__kib_dbg_conn)			\
		CDEBUG(D_NET, "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id NULL\n",	\
			__kib_dbg_conn,				\
			__kib_dbg_conn->ibc_state,		\
			__kib_dbg_conn->ibc_noops_posted,	\
			__kib_dbg_conn->ibc_nsends_posted,	\
			__kib_dbg_conn->ibc_credits,		\
			__kib_dbg_conn->ibc_outstanding_credits,	\
			__kib_dbg_conn->ibc_reserved_credits,	\
			__kib_dbg_conn->ibc_comms_error		\
			);				\
})