Viewing: socklnd_modparams.c

// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */

/* This file is part of Lustre, http://www.lustre.org/
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include "socklnd.h"

#include <lustre_compat/linux/inetdevice.h>
#include <linux/ethtool.h>
#include <net/addrconf.h>

/* Version recorded in the lnd_version field of the socklnd tunables. */
#define CURRENT_LND_VERSION 1

/* Module parameters.  The third module_param() argument is the sysfs
 * permission mask: 0644 entries remain root-writable after load, 0444
 * entries are read-only once the module is loaded.
 */
static int sock_timeout = SOCKNAL_TIMEOUT_DEFAULT;
module_param(sock_timeout, int, 0644);
MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");

/* The next four parameters are the module-wide defaults that
 * ksocknal_tunables_setup() substitutes for any common per-net tunable
 * that is still set to -1.
 */
static int credits = DEFAULT_CREDITS;
module_param(credits, int, 0444);
MODULE_PARM_DESC(credits, "# concurrent sends");

static int peer_credits = DEFAULT_PEER_CREDITS;
module_param(peer_credits, int, 0444);
MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");

/* No initializer: defaults to 0. */
static int peer_buffer_credits;
module_param(peer_buffer_credits, int, 0444);
MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");

static int peer_timeout = DEFAULT_PEER_TIMEOUT;
module_param(peer_timeout, int, 0444);
MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");

/* Number of daemons in each thread pool which is percpt,
 * we will estimate reasonable value based on CPUs if it's not set. */
static unsigned int nscheds;
module_param(nscheds, int, 0444);
MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");

/* Connection daemon pool: initial and maximum number of threads. */
static int nconnds = 4;
module_param(nconnds, int, 0444);
MODULE_PARM_DESC(nconnds, "# connection daemons while starting");

static int nconnds_max = 64;
module_param(nconnds_max, int, 0444);
MODULE_PARM_DESC(nconnds_max, "max # connection daemons");

/* Bounds, in milliseconds, on the connection retry interval. */
static int min_reconnectms = 1000;
module_param(min_reconnectms, int, 0644);
MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");

static int max_reconnectms = 60000;
module_param(max_reconnectms, int, 0644);
MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");

/* No initializer: eager ACKs are off (0) by default. */
static int eager_ack;
module_param(eager_ack, int, 0644);
MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");

static int typed_conns = 1;
module_param(typed_conns, int, 0444);
MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");

/* Default threshold is 1 << 10 = 1024. */
static int min_bulk = (1<<10);
module_param(min_bulk, int, 0644);
MODULE_PARM_DESC(min_bulk, "smallest 'large' message");

/* Socket buffer sizes; 0 leaves the system default in place. */
# define DEFAULT_BUFFER_SIZE 0
static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
module_param(tx_buffer_size, int, 0644);
MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");

static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
module_param(rx_buffer_size, int, 0644);
MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");

static int nagle = 0;
module_param(nagle, int, 0644);
MODULE_PARM_DESC(nagle, "enable NAGLE?");

static int round_robin = 1;
module_param(round_robin, int, 0644);
MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");

/* Keepalive tuning: all values except keepalive_count are in seconds. */
static int keepalive = 30;
module_param(keepalive, int, 0644);
MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");

static int keepalive_idle = 30;
module_param(keepalive_idle, int, 0644);
MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");

#define DEFAULT_KEEPALIVE_COUNT  5
static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
module_param(keepalive_count, int, 0644);
MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");

static int keepalive_intvl = 5;
module_param(keepalive_intvl, int, 0644);
MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");

/* Checksumming is off by default; inject_csum_error is a fault-injection
 * knob for it.
 */
static int enable_csum = 0;
module_param(enable_csum, int, 0644);
MODULE_PARM_DESC(enable_csum, "enable check sum");

static int inject_csum_error = 0;
module_param(inject_csum_error, int, 0644);
MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");

/* ksocknal_tunables_init() only emits a warning that irq_affinity has
 * been removed from socklnd when this is non-zero.
 */
static int enable_irq_affinity = 0;
module_param(enable_irq_affinity, int, 0644);
MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");

static int nonblk_zcack = 1;
module_param(nonblk_zcack, int, 0644);
MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");

static unsigned int zc_min_payload = (16 << 10);
module_param(zc_min_payload, int, 0644);
MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");

static unsigned int zc_recv = 0;
module_param(zc_recv, int, 0644);
MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");

static unsigned int zc_recv_min_nfrags = 16;
module_param(zc_recv_min_nfrags, int, 0644);
MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");

/* ksocknal_tunables_init() warns when this exceeds
 * (1 << SOCKNAL_CONN_COUNT_MAX_BITS) - 1 and copies it into
 * ksock_default_tunables.lnd_conns_per_peer.
 */
static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
module_param(conns_per_peer, uint, 0644);
MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");

/* By default skip_mr_route_setup is 0 (do not skip) */
static unsigned int skip_mr_route_setup;
module_param(skip_mr_route_setup, uint, 0444);
MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR");

/* TCP backoff tunables; only compiled when SOCKNAL_BACKOFF is defined. */
#ifdef SOCKNAL_BACKOFF
static int backoff_init = 3;
module_param(backoff_init, int, 0644);
MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff");

static int backoff_max = 3;
module_param(backoff_max, int, 0644);
MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff");
#endif

/* Protocol version tunable; only compiled when SOCKNAL_VERSION_DEBUG is
 * non-zero.
 */
#if SOCKNAL_VERSION_DEBUG
static int protocol = 3;
module_param(protocol, int, 0644);
MODULE_PARM_DESC(protocol, "protocol version");
#endif

/* Type-of-service parameter.  It uses a custom "tos" parameter type so
 * that values are parsed by param_set_tos(), which range-checks them;
 * reads reuse the stock integer getter.
 */
static int tos = -1;
static int param_set_tos(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops param_ops_tos = {
	.set = param_set_tos,
	.get = param_get_int,
};

/* module_param(tos, tos, ...) resolves param_ops_tos and param_check_tos
 * by name; the check macro verifies that the backing variable is an int.
 */
#define param_check_tos(name, p) \
	__param_check(name, p, int)
module_param(tos, tos, 0444);
MODULE_PARM_DESC(tos, "Set the type of service (=-1 to disable)");

/* Pointers to the module parameters; filled in by ksocknal_tunables_init(). */
struct ksock_tunables ksocknal_tunables;
/* Default socklnd tunables; seeded by ksocknal_tunables_init(). */
struct lnet_ioctl_config_socklnd_tunables ksock_default_tunables;

/*
 * Setter for the "tos" module parameter.
 *
 * @val: textual value supplied by the user (may be NULL)
 * @kp:  kernel parameter descriptor; kp->arg points at the int to update
 *
 * Return: 0 on success, -EINVAL if @val is NULL, the kstrtoint() error
 * if @val is not a valid integer, or -ERANGE if the parsed value lies
 * outside [-1, 0xff].
 */
static int param_set_tos(const char *val, const struct kernel_param *kp)
{
	int *dest;
	int parsed;
	int err;

	/* base 0: let kstrtoint() auto-detect decimal/hex/octal */
	err = val ? kstrtoint(val, 0, &parsed) : -EINVAL;
	if (err)
		return err;

	/* only -1 or a value in 0..0xff is acceptable */
	if (parsed >= -1 && parsed <= 0xff) {
		dest = kp->arg;
		*dest = parsed;
		return 0;
	}

	return -ERANGE;
}

/*
 * Look up the link speed of the network device backing @ni.
 *
 * Walks the devices of ni->ni_net_ns under the RTNL lock, skipping
 * loopback and down interfaces, and stops at the first device that
 * matches ni->ni_interface: by IPv4 address label when the device has an
 * in_device, otherwise (only with CONFIG_IPV6) by device name, provided
 * the device has at least one non-temporary IPv6 address.
 *
 * Return: 0 if @ni has no namespace or interface assigned; -1 if no
 * matching device was found; otherwise cmd.base.speed as reported by
 * __ethtool_get_link_ksettings() (converted to int), or that call's
 * non-zero return value if it failed.
 */
static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni)
{
	struct net_device *dev;
	int intf_idx = -1;
	int ret = -1;

	/* Compat macro; presumably declares the IPv4 address cursor "ifa"
	 * on kernels whose iterator needs one -- verify in lustre_compat.
	 */
	DECLARE_CONST_IN_IFADDR(ifa);

	/* check if ni has interface assigned */
	if (!ni->ni_net_ns || !ni->ni_interface)
		return 0;

	rtnl_lock();
	for_each_netdev(ni->ni_net_ns, dev) {
		int flags = netif_get_flags(dev);
		struct in_device *in_dev;

		if (flags & IFF_LOOPBACK) /* skip the loopback IF */
			continue;

		/* skip interfaces that are not up */
		if (!(flags & IFF_UP))
			continue;

		in_dev = __in_dev_get_rtnl(dev);
		if (in_dev) {
			/* IPv4: match on the per-address label; the scan
			 * does not stop early, so every address is visited.
			 */
			in_dev_for_each_ifa_rtnl(ifa, in_dev) {
				if (strcmp(ifa->ifa_label, ni->ni_interface) == 0)
					intf_idx = dev->ifindex;
			}
			/* compat terminator paired with the iterator above */
			endfor_ifa(in_dev);
		} else {
			/* No IPv4 state on this device: fall back to IPv6
			 * when it is built in, otherwise skip the device.
			 */
#if IS_ENABLED(CONFIG_IPV6)
			struct inet6_dev *in6_dev = __in6_dev_get(dev);

			if (in6_dev) {
				const struct inet6_ifaddr *ifa6;

				/* NOTE(review): RCU list iterator used while
				 * holding RTNL rather than rcu_read_lock();
				 * confirm this is sufficient for addr_list.
				 */
				list_for_each_entry_rcu(ifa6,
							&in6_dev->addr_list,
							if_list) {
					if (ifa6->flags & IFA_F_TEMPORARY)
						continue;

					/* As different IPv6 addresses don't
					 * have unique labels, it is safest
					 * just to use the first and ignore
					 * the rest.
					 */
					if (strcmp(dev->name,
						   ni->ni_interface) == 0) {
						intf_idx = dev->ifindex;
						break;
					}
				}
			} else {
#endif
				continue;
#if IS_ENABLED(CONFIG_IPV6)
			}
#endif
		}

		/* leave the device loop with dev still pointing at the match */
		if (intf_idx >= 0)
			break;
	}
	/* dev is only used when a match was found, i.e. the loop above
	 * was left via break.
	 */
	if (intf_idx >= 0) {
		struct ethtool_link_ksettings cmd;
		int ethtool_ret;

		/* Some devices may not be providing link settings */
		ethtool_ret = __ethtool_get_link_ksettings(dev, &cmd);
		if (!ethtool_ret)
			ret = cmd.base.speed;
		else
			ret = ethtool_ret;
	}
	rtnl_unlock();

	return ret;
}

/*
 * Pick a heuristically optimal conns_per_peer value for an ethernet
 * interface speed given in Mbps.
 *
 * Anything below 1Gbps is treated as 1Gbps so that ilog2() is never
 * called with 0.
 *
 * Return: ilog2(speed in Gbps) / 2 + 1.
 */
static int ksocklnd_speed2cpp(int speed)
{
	int gbps = (speed < 1000) ? 1 : speed / 1000;

	return 1 + ilog2(gbps) / 2;
}

/*
 * Derive a conns_per_peer value for @ni from the speed of its interface.
 *
 * The speed is queried with ksocklnd_ni_get_eth_intf_speed() and logged
 * at D_NET level when @ni has an interface name.
 *
 * Return: ksocklnd_speed2cpp(speed) when a positive speed was obtained,
 * otherwise 1.
 */
int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni)
{
	int link_speed = ksocklnd_ni_get_eth_intf_speed(ni);

	if (ni->ni_interface)
		CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface,
		       link_speed);

	/* zero, unknown or error speeds fall back to a single connection */
	return link_speed > 0 ? ksocklnd_speed2cpp(link_speed) : 1;
}

/*
 * Initialise the module-wide socklnd tunables.
 *
 * Sanitises the module parameters that have hard limits, seeds
 * ksock_default_tunables, and points every ksocknal_tunables member at
 * its backing module parameter.
 *
 * Return: always 0.
 */
int ksocknal_tunables_init(void)
{
	const unsigned int cpp_max = (1U << SOCKNAL_CONN_COUNT_MAX_BITS) - 1;

	/* The warning promises a cap, so actually enforce it, and do so
	 * before the value is copied into the default tunables below.
	 */
	if (conns_per_peer > cpp_max) {
		CWARN("socklnd conns_per_peer is capped at %u.\n", cpp_max);
		conns_per_peer = cpp_max;
	}

	ksock_default_tunables.lnd_version = CURRENT_LND_VERSION;
	ksock_default_tunables.lnd_conns_per_peer = conns_per_peer;
	ksock_default_tunables.lnd_tos = tos;

	/* initialize ksocknal_tunables structure */
	ksocknal_tunables.ksnd_timeout            = &sock_timeout;
	ksocknal_tunables.ksnd_nscheds		  = &nscheds;
	ksocknal_tunables.ksnd_nconnds            = &nconnds;
	ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
	ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
	ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
	ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
	ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
	ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
	ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
	ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
	ksocknal_tunables.ksnd_nagle              = &nagle;
	ksocknal_tunables.ksnd_round_robin        = &round_robin;
	ksocknal_tunables.ksnd_keepalive          = &keepalive;
	ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
	ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
	ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
	ksocknal_tunables.ksnd_credits            = &credits;
	ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
	ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
	ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
	ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
	ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
	ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
	ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
	ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
	ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
	ksocknal_tunables.ksnd_conns_per_peer     = &conns_per_peer;

	/* The parameter is still accepted, but only triggers this notice. */
	if (enable_irq_affinity) {
		CWARN("irq_affinity is removed from socklnd because modern "
		      "computer always has fast CPUs and more cores than "
		      "# NICs, although you still can set irq_affinity by "
		      "another way, please check manual for details.\n");
	}
	ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;

#ifdef SOCKNAL_BACKOFF
	ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
	ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
#endif

#if SOCKNAL_VERSION_DEBUG
	ksocknal_tunables.ksnd_protocol           = &protocol;
#endif

	/* enforce a 2 << 10 floor on the zero-copy payload threshold */
	if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
		*ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);

	return 0;
}

void ksocknal_tunables_setup(struct lnet_lnd_tunables *lnd_tunables,
			     struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables)
{
	struct lnet_ioctl_config_socklnd_tunables *tunables;

	tunables = &lnd_tunables->lnd_tun_u.lnd_sock;
	/* Current API version */
	tunables->lnd_version = CURRENT_LND_VERSION;

	if (net_tunables->lct_peer_timeout == -1)
		net_tunables->lct_peer_timeout =
			*ksocknal_tunables.ksnd_peertimeout;

	if (net_tunables->lct_max_tx_credits == -1)
		net_tunables->lct_max_tx_credits =
			*ksocknal_tunables.ksnd_credits;

	if (net_tunables->lct_peer_tx_credits == -1)
		net_tunables->lct_peer_tx_credits =
			*ksocknal_tunables.ksnd_peertxcredits;

	if (net_tunables->lct_peer_tx_credits >
	    net_tunables->lct_max_tx_credits)
		net_tunables->lct_peer_tx_credits =
			net_tunables->lct_max_tx_credits;

	if (net_tunables->lct_peer_rtr_credits == -1)
		net_tunables->lct_peer_rtr_credits =
			*ksocknal_tunables.ksnd_peerrtrcredits;

	if (tunables->lnd_tos < 0)
		tunables->lnd_tos = tos;

	tunables->lnd_timeout = ksocknal_timeout();
}