Viewing: lib-socket.c

// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2015, 2017, Intel Corporation.
 */

/* This file is part of Lustre, http://www.lustre.org/ */

#define DEBUG_SUBSYSTEM S_LNET

#include <linux/if.h>
#include <linux/in.h>
#include <linux/net.h>
#include <net/addrconf.h>
#include <net/ipv6.h>
#include <linux/file.h>
#include <linux/if_vlan.h>
#include <linux/pagemap.h>
/* For sys_open & sys_close */
#include <linux/syscalls.h>
#include <net/net_namespace.h>
#include <lustre_compat/net/sock.h>
#include <lustre_compat/net/tcp.h>
#include <lustre_compat/linux/inetdevice.h>

#include <linux/libcfs/libcfs.h>
#include <linux/lnet/lib-lnet.h>

int
lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
{
	int rc;
	long jiffies_left = cfs_time_seconds(timeout);
	unsigned long then;
	struct kvec iov = {
		.iov_base = buffer,
		.iov_len = nob
	};
	struct msghdr msg = { NULL, };

	LASSERT(nob > 0);
	/* Caller may pass a zero timeout if she thinks the socket buffer is
	 * empty enough to take the whole message immediately
	 */
	for (;;) {
		msg.msg_flags = !timeout ? MSG_DONTWAIT : 0;
		if (timeout != 0) {
			struct sock *sk = sock->sk;

			/* Set send timeout to remaining time */
			lock_sock(sk);
			sk->sk_sndtimeo = jiffies_left;
			release_sock(sk);
		}

		then = jiffies;
		rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
		jiffies_left -= jiffies - then;

		if (rc < 0)
			return rc;

		if (rc == 0) {
			CERROR("Unexpected zero rc\n");
			return -ECONNABORTED;
		}

		if (!msg_data_left(&msg))
			break;

		if (jiffies_left <= 0)
			return -EAGAIN;
	}
	return 0;
}
EXPORT_SYMBOL(lnet_sock_write);

int
lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
{
	int rc;
	long jiffies_left = cfs_time_seconds(timeout);
	unsigned long then;

	LASSERT(nob > 0);
	LASSERT(jiffies_left > 0);

	for (;;) {
		struct kvec  iov = {
			.iov_base = buffer,
			.iov_len  = nob
		};
		struct msghdr msg = {
			.msg_flags	= 0
		};
		struct sock *sk = sock->sk;

		/* Set receive timeout to remaining time */
		lock_sock(sk);
		sk->sk_rcvtimeo = jiffies_left;
		release_sock(sk);

		then = jiffies;
		rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
		jiffies_left -= jiffies - then;

		if (rc < 0)
			return rc;

		if (rc == 0)
			return -ECONNRESET;

		buffer = ((char *)buffer) + rc;
		nob -= rc;

		if (nob == 0)
			return 0;

		if (jiffies_left <= 0)
			return -ETIMEDOUT;
	}
}
EXPORT_SYMBOL(lnet_sock_read);

/*
 * choose_ipv4_src() - pick a local IPv4 address on an interface to use
 * as the source when talking to @dst_ipaddr.
 *
 * @ret:        receives the chosen address (host byte order)
 * @interface:  index of the interface to look at
 * @dst_ipaddr: destination address (host byte order)
 * @ns:         network namespace the interface index belongs to
 *
 * The first address visited is taken as a fallback; any later address
 * whose network (ifa_local & ifa_mask) matches @dst_ipaddr replaces the
 * current choice, so the last matching address wins.
 *
 * Return: 0 with *@ret set, -EINVAL if the interface does not exist, is
 * not IFF_UP or has no IPv4 configuration, or -ENOENT if no address was
 * visited.
 */
int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
{
	struct net_device *netdev;
	struct in_device *idev;
	int rc = -EINVAL;
	DECLARE_CONST_IN_IFADDR(ifa);

	rcu_read_lock();

	netdev = dev_get_by_index_rcu(ns, interface);
	if (netdev == NULL || (netdev->flags & IFF_UP) == 0)
		goto unlock;

	idev = __in_dev_get_rcu(netdev);
	if (idev == NULL)
		goto unlock;

	rc = -ENOENT;
	in_dev_for_each_ifa_rcu(ifa, idev) {
		/* Once we have a candidate, only an address on the
		 * destination's network may replace it.
		 */
		if (rc == 0 &&
		    ((dst_ipaddr ^ ntohl(ifa->ifa_local)) &
		     ntohl(ifa->ifa_mask)) != 0)
			continue;

		*ret = ntohl(ifa->ifa_local);
		rc = 0;
	}
	endfor_ifa(idev);

unlock:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL(choose_ipv4_src);

/*
 * lnet_sock_create() - create a kernel SOCK_STREAM socket in @ns and,
 * when requested, bind it to a local address and port.
 *
 * @interface:  interface index used to select the local source address;
 *              only consulted when it is >= 0, @remaddr is set and @addr
 *              is NULL
 * @remaddr:    remote address, may be NULL; when set it decides the
 *              address family and is the destination used for source
 *              address selection
 * @local_port: local port to bind to (host byte order)
 * @ns:         network namespace to create the socket in
 * @addr:       optional explicit local address to bind to; when set it
 *              is used instead of interface-based selection
 *
 * The family is taken from @remaddr, else from @addr, else AF_INET6.
 * Without @remaddr, an AF_INET6 creation that fails with -EAFNOSUPPORT
 * is retried as AF_INET.  The bind step is skipped entirely when
 * @interface < 0 and @local_port == 0.
 *
 * Return: the new socket, or an ERR_PTR() carrying the error.  On any
 * failure after creation the socket is released before returning.
 */
static struct socket *
lnet_sock_create(int interface, struct sockaddr_unsized *remaddr,
		 int local_port, struct net *ns, struct sockaddr *addr)
{
	struct socket *sock;
	int rc;
	int family;

	/* Family precedence: remote address, explicit local address,
	 * then IPv6 as the default.
	 */
	family = AF_INET6;
	if (remaddr)
		family = remaddr->sa_family;
	else if (addr)
		family = addr->sa_family;

retry:
	rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
	/* Fall back to IPv4 only when the family was not dictated by a
	 * remote address.
	 */
	if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
		family = AF_INET;
		goto retry;
	}

	if (rc) {
		CERROR("Can't create socket: %d\n", rc);
		return ERR_PTR(rc);
	}

	sock->sk->sk_reuseport = 1;
#ifdef HAVE_SOCK_NOT_OWNED_BY_ME
	/* Set sk_net_refcnt and namespace for orphan cleanup LU-18137 */
	sock->sk->sk_net_refcnt = 1;
	get_net(ns);
	sock_inuse_add(ns, 1);
#endif

	if (interface >= 0 || local_port != 0) {
		/* Zero-initialised; a family not handled by the switch
		 * below leaves it that way for kernel_bind().
		 */
		struct sockaddr_storage locaddr = {};

		switch (family) {
		case AF_INET: {
			struct sockaddr_in *sin = (void *)&locaddr;

			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = INADDR_ANY;

			if (interface >= 0 && remaddr && !addr) {
				struct sockaddr_in *rem = (void *)remaddr;
				__u32 ip;

				/* choose_ipv4_src() works in host byte order */
				rc = choose_ipv4_src(&ip,
						     interface,
						     ntohl(rem->sin_addr.s_addr),
						     ns);
				if (rc)
					goto failed;
				sin->sin_addr.s_addr = htonl(ip);
			} else if (addr) {
				struct sockaddr_in *src;

				src = (struct sockaddr_in *)addr;
				sin->sin_addr.s_addr = src->sin_addr.s_addr;
			}
			sin->sin_port = htons(local_port);
			break;
		}
#if IS_ENABLED(CONFIG_IPV6)
		case AF_INET6: {
			struct sockaddr_in6 *sin6 = (void *)&locaddr;
			int val = 0;

			sin6->sin6_family = AF_INET6;
			sin6->sin6_addr = in6addr_any;

			/* Make sure we get both IPv4 and IPv6 connections.
			 * This is the default, but it can be overridden so we
			 * force it back.
			 */
#ifdef HAVE_KERNEL_SETSOCKOPT
			kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
					  (char *) &val, sizeof(val));
#elif defined(_LINUX_SOCKPTR_H)
			/* sockptr_t was introduced around
			 * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
			 * kernel address to be passed to ->setsockopt
			 */
			if (ipv6_only_sock(sock->sk)) {
				sockptr_t optval = KERNEL_SOCKPTR(&val);

				sock->ops->setsockopt(sock,
						      IPPROTO_IPV6, IPV6_V6ONLY,
						      optval, sizeof(val));
			}
#else
			/* From v5.7-rc6-2614-g5a892ff2facb when
			 * kernel_setsockopt() was removed until
			 * sockptr_t (above) there is no clean way to
			 * pass kernel address to setsockopt.  We could
			 * use get_fs()/set_fs(), but in this particular
			 * situation there is an easier way.  It depends
			 * on the fact that at least for these few
			 * kernels a NULL address to ipv6_setsockopt()
			 * is treated like the address of a zero.
			 */
			if (ipv6_only_sock(sock->sk) && !val) {
				void *optval = NULL;

				sock->ops->setsockopt(sock,
						      IPPROTO_IPV6, IPV6_V6ONLY,
						      optval, sizeof(val));
			}
#endif /* HAVE_KERNEL_SETSOCKOPT */

			if (interface >= 0 && remaddr && !addr) {
				struct sockaddr_in6 *rem = (void *)remaddr;
				struct net_device *dev;

				/* dev is only valid inside the RCU read
				 * section, so the lookup and its use share it.
				 */
				rcu_read_lock();
				dev = dev_get_by_index_rcu(ns, interface);
				if (!dev) {
					CERROR("No net device for interface %d\n",
					       interface);
					rcu_read_unlock();
					rc = -ENODEV;
					goto failed;
				}
				/* NOTE(review): the return value is not
				 * checked; presumably sin6_addr stays
				 * in6addr_any if no source address is
				 * found - confirm.
				 */
				ipv6_dev_get_saddr(ns, dev, &rem->sin6_addr, 0,
						   &sin6->sin6_addr);
				rcu_read_unlock();
			} else if (addr) {
				const struct sockaddr_in6 *src6;

				src6 = (const struct sockaddr_in6 *)addr;
				sin6->sin6_addr = src6->sin6_addr;
			}

			sin6->sin6_port = htons(local_port);
			break;
		}
#endif /* IS_ENABLED(CONFIG_IPV6) */
		}
		rc = kernel_bind(sock, (struct sockaddr_unsized *)&locaddr,
				 sizeof(locaddr));
		/* A busy port is only logged at debug level here; the
		 * error is still returned to the caller.
		 */
		if (rc == -EADDRINUSE) {
			CDEBUG(D_NET, "Port %d already in use\n", local_port);
			goto failed;
		}
		if (rc != 0) {
			CERROR("Error trying to bind to %pISc/%d: rc = %d\n",
			       &locaddr, local_port, rc);
			goto failed;
		}
	}
	return sock;

failed:
	/* Every path reaching here holds a created socket and has rc set */
	sock_release(sock);
	return ERR_PTR(rc);
}

/*
 * lnet_sock_setbuf() - override the send and/or receive buffer size of
 * @sock.
 *
 * @sock:      socket to adjust
 * @txbufsize: new sk_sndbuf value, or 0 to leave the send side alone
 * @rxbufsize: new sk_rcvbuf value, or 0 to leave the receive side alone
 *
 * For each non-zero size the matching SOCK_SNDBUF_LOCK/SOCK_RCVBUF_LOCK
 * bit is set in sk_userlocks.  After changing the send buffer the
 * socket's sk_write_space() callback is invoked.
 */
void
lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
{
	struct sock *s = sock->sk;

	if (txbufsize) {
		s->sk_userlocks |= SOCK_SNDBUF_LOCK;
		s->sk_sndbuf = txbufsize;
		s->sk_write_space(s);
	}

	if (!rxbufsize)
		return;

	s->sk_userlocks |= SOCK_RCVBUF_LOCK;
	s->sk_rcvbuf = rxbufsize;
}
EXPORT_SYMBOL(lnet_sock_setbuf);

/*
 * lnet_sock_getaddr() - fetch the local or remote address of @sock.
 *
 * @sock:   socket to query
 * @remote: true for the peer address, false for the local address
 * @peer:   storage that receives the address
 *
 * An IPv4-mapped IPv6 address is rewritten in place as a plain AF_INET
 * address carrying the same port, so callers see it as an IPv4 socket.
 *
 * Return: 0 on success, or the negative error from
 * kernel_getpeername()/kernel_getsockname().
 */
int
lnet_sock_getaddr(struct socket *sock, bool remote,
		  struct sockaddr_storage *peer)
{
	struct sockaddr_in6 *v6 = (void *)peer;
	struct sockaddr_in *v4 = (void *)peer;
	short port;
	int rc;

	rc = remote ? kernel_getpeername(sock, (struct sockaddr *)peer)
		    : kernel_getsockname(sock, (struct sockaddr *)peer);
	if (rc < 0) {
		CERROR("Error %d getting sock %s IP/port\n",
			rc, remote ? "peer" : "local");
		return rc;
	}

	if (peer->ss_family != AF_INET6)
		return 0;

	if (!ipv6_addr_v4mapped(&v6->sin6_addr))
		return 0;

	/* Both views alias @peer: save the port before the v4 view is
	 * cleared.  The embedded IPv4 address is the last 32-bit word of
	 * the IPv6 address, which lies beyond the cleared sockaddr_in.
	 */
	port = v6->sin6_port;
	memset(v4, 0, sizeof(*v4));
	v4->sin_family = AF_INET;
	v4->sin_port = port;
	memcpy(&v4->sin_addr, &v6->sin6_addr.s6_addr32[3], 4);

	return 0;
}
EXPORT_SYMBOL(lnet_sock_getaddr);

/*
 * lnet_sock_getbuf() - report the current buffer sizes of @sock.
 *
 * @sock:      socket to query
 * @txbufsize: if not NULL, receives sk_sndbuf
 * @rxbufsize: if not NULL, receives sk_rcvbuf
 */
void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
{
	if (txbufsize)
		*txbufsize = sock->sk->sk_sndbuf;

	if (rxbufsize)
		*rxbufsize = sock->sk->sk_rcvbuf;
}
EXPORT_SYMBOL(lnet_sock_getbuf);

/*
 * lnet_sock_listen() - create a socket bound to @local_port and put it
 * into the listening state.
 *
 * @local_port: port to bind to
 * @backlog:    backlog passed to kernel_listen()
 * @ns:         network namespace for the socket
 * @addr:       optional local address to bind to
 * @ifindex:    interface index handed to lnet_sock_create()
 *
 * Return: the listening socket, or an ERR_PTR() with the error from
 * socket creation/bind or from kernel_listen().
 */
struct socket *
lnet_sock_listen(int local_port, int backlog, struct net *ns,
		 struct sockaddr *addr, int ifindex)
{
	struct socket *lsock;
	int rc;

	lsock = lnet_sock_create(ifindex, NULL, local_port, ns, addr);
	if (IS_ERR(lsock)) {
		/* Creation logs a busy port at debug level only, so it is
		 * reported as an error here.
		 */
		if (PTR_ERR(lsock) == -EADDRINUSE)
			CERROR("Can't create socket: port %d already in use\n",
			       local_port);
		return lsock;
	}

	rc = kernel_listen(lsock, backlog);
	if (rc != 0) {
		CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
		sock_release(lsock);
		return ERR_PTR(rc);
	}

	return lsock;
}

struct socket *
lnet_sock_connect(int interface, int local_port,
		  struct sockaddr_unsized *peeraddr,
		  struct net *ns)
{
	struct socket *sock;
	int rc;

	sock = lnet_sock_create(interface, peeraddr, local_port, ns, 0);
	if (IS_ERR(sock))
		return sock;

	/* Avoid temporary address, they are bad for long-lived
	 * connections such as lustre mounts.
	 * RFC4941, section 3.6 suggests that:
	 *    Individual applications, which have specific
	 *    knowledge about the normal duration of connections,
	 *    MAY override this as appropriate.
	 */
	if (peeraddr->sa_family == PF_INET6)
		ip6_sock_set_addr_preferences(sock->sk,
					      IPV6_PREFER_SRC_PUBLIC);

	rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
	if (rc == 0)
		return sock;

	/* EADDRNOTAVAIL probably means we're already connected to the same
	 * peer/port on the same local port on a differently typed
	 * connection.	Let our caller retry with a different local
	 * port... */

	CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
		     "Error %d connecting %d -> %pIScp\n", rc,
		     local_port, peeraddr);

	sock_release(sock);
	return ERR_PTR(rc);
}

/*
 * lnet_inet4_enumerate() - append one entry per IPv4 address of @dev to
 * the interface array.
 *
 * @dev:      network device to inspect
 * @flags:    device flags; IFF_MASTER is recorded in li_iff_master
 * @nalloc:   in/out, number of entries currently allocated in the array
 * @nip:      number of entries already filled
 * @cpt:      value stored in each new entry's li_cpt
 * @dev_list: in/out, the array; grown with krealloc() in steps of
 *            LNET_INTERFACES_NUM
 *
 * Return: the new number of filled entries (unchanged if @dev has no
 * IPv4 configuration), or -ENOMEM.  On -ENOMEM the whole array has been
 * freed, *@dev_list is NULL and *@nalloc is 0.
 */
static int lnet_inet4_enumerate(struct net_device *dev, int flags,
				int *nalloc, int nip, int cpt,
				struct lnet_inetdev **dev_list)
{
	struct lnet_inetdev *list = *dev_list;
	struct in_device *idev = __in_dev_get_rtnl(dev);
	DECLARE_CONST_IN_IFADDR(ifa);

	if (!idev) {
		CWARN("lnet: Interface %s has no IPv4 status.\n",
		      dev->name);
		return nip;
	}

	in_dev_for_each_ifa_rtnl(ifa, idev) {
		struct lnet_inetdev *entry;

		if (nip >= *nalloc) {
			struct lnet_inetdev *grown;

			*nalloc += LNET_INTERFACES_NUM;
			grown = krealloc(list, *nalloc * sizeof(*grown),
					 GFP_KERNEL);
			if (!grown) {
				/* Drop everything collected so far */
				kfree(list);
				*nalloc = 0;
				*dev_list = NULL;
				return -ENOMEM;
			}
			list = grown;
		}

		entry = &list[nip];
		entry->li_cpt = cpt;
		entry->li_iff_master = !!(flags & IFF_MASTER);
		entry->li_size = sizeof(ifa->ifa_local);
		entry->li_index = dev->ifindex;
		entry->li_ipaddr = ifa->ifa_local;
		entry->li_netmask = ntohl(ifa->ifa_mask);
		strscpy(entry->li_name, ifa->ifa_label,
			sizeof(entry->li_name));
		nip++;
	}
	endfor_ifa(idev);

	*dev_list = list;

	return nip;
}

/*
 * lnet_inet6_enumerate() - append at most one IPv6 address of @dev to
 * the interface array.
 *
 * @dev:      network device to inspect
 * @flags:    device flags; IFF_MASTER is recorded in li_iff_master
 * @nalloc:   in/out, number of entries currently allocated in the array
 * @nip:      number of entries already filled
 * @cpt:      value stored in the new entry's li_cpt
 * @dev_list: in/out, the array; grown with krealloc() in steps of
 *            LNET_INTERFACES_NUM
 *
 * Temporary (IFA_F_TEMPORARY) and link-local addresses are skipped, and
 * only the first remaining address is recorded.  Without CONFIG_IPV6
 * this does nothing.
 *
 * Return: the new number of filled entries, or -ENOMEM.  On -ENOMEM the
 * whole array has been freed, *@dev_list is NULL and *@nalloc is 0.
 */
static int lnet_inet6_enumerate(struct net_device *dev, int flags,
				int *nalloc, int nip, int cpt,
				struct lnet_inetdev **dev_list)
{
#if IS_ENABLED(CONFIG_IPV6)
	struct lnet_inetdev *list = *dev_list;
	const struct inet6_ifaddr *ifa6;
	struct inet6_dev *idev6 = __in6_dev_get(dev);

	if (!idev6) {
		CWARN("lnet: Interface %s has no IPv6 status.\n",
		      dev->name);
		return nip;
	}

	list_for_each_entry_rcu(ifa6, &idev6->addr_list, if_list) {
		struct lnet_inetdev *entry;

		if ((ifa6->flags & IFA_F_TEMPORARY) ||
		    (ipv6_addr_type(&ifa6->addr) & IPV6_ADDR_LINKLOCAL))
			continue;

		if (nip >= *nalloc) {
			struct lnet_inetdev *grown;

			*nalloc += LNET_INTERFACES_NUM;
			grown = krealloc(list, *nalloc * sizeof(*grown),
					 GFP_KERNEL);
			if (!grown) {
				/* Drop everything collected so far */
				kfree(list);
				*nalloc = 0;
				*dev_list = NULL;
				return -ENOMEM;
			}
			list = grown;
		}

		entry = &list[nip];
		entry->li_cpt = cpt;
		entry->li_iff_master = !!(flags & IFF_MASTER);
		entry->li_size = sizeof(struct in6_addr);
		entry->li_index = dev->ifindex;
		memcpy(entry->li_ipv6addr,
		       &ifa6->addr, sizeof(struct in6_addr));
		strscpy(entry->li_name, dev->name,
			sizeof(entry->li_name));
		nip++;
		/* As different IPv6 addresses don't have unique
		 * labels, it is safest just to use the first
		 * and ignore the rest.
		 */
		break;
	}

	*dev_list = list;
#endif /* IS_ENABLED(CONFIG_IPV6) */
	return nip;
}

/*
 * lnet_inet_enumerate() - build an array describing the IPv4 and IPv6
 * addresses of every usable interface in @ns.
 *
 * @dev_list: receives the krealloc()ed array of entries, or NULL when
 *            nothing was found or an error occurred.
 *            NOTE(review): the caller presumably owns the array and
 *            releases it with kfree() - confirm against callers.
 * @ns:       network namespace whose devices are walked
 * @v6_first: for each device, record its IPv6 address before its IPv4
 *            addresses instead of after
 *
 * Loopback devices and devices that are not IFF_UP are skipped.  The
 * walk runs under rtnl_lock().
 *
 * The per-family helpers return a negative value only for -ENOMEM, and
 * by then they have freed the whole array and reset the allocation
 * count.  Carrying on with the old entry count would index past the
 * next, smaller allocation, so the walk is aborted and the error is
 * returned instead.
 *
 * Return: the number of entries in *@dev_list (> 0), -ENOENT if no
 * usable address exists, or -ENOMEM if the array could not be grown.
 */
int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns,
			bool v6_first)
{
	struct lnet_inetdev *ifaces = NULL;
	struct net_device *dev;
	struct net_device *cpt_dev;
	int nalloc = 0;
	int nip = 0;

	rtnl_lock();
	for_each_netdev(ns, dev) {
		int flags = netif_get_flags(dev);
		int node_id, cpt;
		int count;

		if (flags & IFF_LOOPBACK) /* skip the loopback IF */
			continue;

		if (!(flags & IFF_UP)) {
			CDEBUG(D_NET, "Ignoring interface %s: it's down\n",
			       dev->name);
			continue;
		}

		/* For a VLAN device the CPT is derived from the NUMA node
		 * of the underlying real device.
		 */
		cpt_dev = dev;
#if IS_ENABLED(CONFIG_VLAN_8021Q)
		if (is_vlan_dev(dev) && vlan_dev_real_dev(dev))
			cpt_dev = vlan_dev_real_dev(dev);
#endif
		node_id = dev_to_node(&cpt_dev->dev);
		cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);

		/* Each helper takes the running count and returns the new
		 * one; the second family is only tried if the first did
		 * not fail.
		 */
		if (v6_first) {
			count = lnet_inet6_enumerate(dev, flags, &nalloc, nip,
						     cpt, &ifaces);
			if (count >= 0)
				count = lnet_inet4_enumerate(dev, flags,
							     &nalloc, count,
							     cpt, &ifaces);
		} else {
			count = lnet_inet4_enumerate(dev, flags, &nalloc, nip,
						     cpt, &ifaces);
			if (count >= 0)
				count = lnet_inet6_enumerate(dev, flags,
							     &nalloc, count,
							     cpt, &ifaces);
		}

		if (count < 0) {
			/* The helper already freed the array and cleared
			 * ifaces; nothing collected so far is valid.
			 */
			CERROR("lnet: Can't enumerate addresses of interface %s: rc = %d\n",
			       dev->name, count);
			nip = count;
			break;
		}
		nip = count;
	}
	rtnl_unlock();

	if (nip == 0) {
		CERROR("lnet: Can't find any usable interfaces, rc = -ENOENT\n");
		nip = -ENOENT;
	}

	*dev_list = ifaces;
	return nip;
}
EXPORT_SYMBOL(lnet_inet_enumerate);