Viewing: lib-cpt.c

// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017, Intel Corporation.
 */

/* This file is part of Lustre, http://www.lustre.org/
 *
 * Please see comments in include/lnet/lib-cpt.h for introduction
 *
 * Author: liang@whamcloud.com
 */

#define DEBUG_SUBSYSTEM S_LNET

#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/libcfs/libcfs.h>
#include <linux/lnet/lib-types.h>
#include <linux/lnet/lib-cpt.h>

/* virtual processing unit: one CPU partition (CPT) of a partition table,
 * i.e. a set of CPUs plus the NUMA nodes recorded for them
 */
struct cfs_cpu_partition {
	/* CPUs mask for this partition */
	cpumask_var_t			cpt_cpumask;
	/* nodes mask for this partition */
	nodemask_t			*cpt_nodemask;
	/* NUMA distance between CPTs: array indexed by partition ID holding
	 * the maximum node distance from this CPT to each CPT of the table
	 */
	unsigned int			*cpt_distance;
	/* spread rotor for NUMA allocator, advanced by cfs_cpt_spread_node() */
	unsigned int			cpt_spread_rotor;
	/* NUMA node if cpt_nodemask is empty */
	int				cpt_node;
};

/** descriptor for CPU partitions: the partitions themselves plus the
 * table-wide CPU/node masks and the reverse CPU->CPT and node->CPT maps
 */
struct cfs_cpt_table {
	/* spread rotor for NUMA allocator (used when no valid CPT is given) */
	unsigned int			ctb_spread_rotor;
	/* maximum NUMA distance between all nodes in table */
	unsigned int			ctb_distance;
	/* # of CPU partitions */
	int				ctb_nparts;
	/* partitions tables: array of ctb_nparts entries */
	struct cfs_cpu_partition	*ctb_parts;
	/* shadow HW CPU to CPU partition ID: nr_cpu_ids entries, -1 when the
	 * CPU is not in any partition
	 */
	int				*ctb_cpu2cpt;
	/* all cpus in this partition table */
	cpumask_var_t			ctb_cpumask;
	/* shadow HW node to CPU partition ID: nr_node_ids entries, -1 when
	 * the node is not in the table
	 */
	int				*ctb_node2cpt;
	/* all nodes in this partition table */
	nodemask_t			*ctb_nodemask;
};

/** Global CPU partition table
 *
 * Assigned by cfs_cpu_init() and released by cfs_cpu_fini().  If table
 * creation fails in cfs_cpu_init() this holds the ERR_PTR() value returned
 * by the creation routine, so test it with IS_ERR_OR_NULL().
 */
struct cfs_cpt_table *cfs_cpt_tab __read_mostly;
EXPORT_SYMBOL(cfs_cpt_tab);

/*
 * modparam for setting number of partitions (read-only once loaded)
 *
 *  0 : estimate best value based on cores or NUMA nodes
 *  1 : disable multiple partitions
 * >1 : specify number of partitions
 */
module_param(cpu_npartitions, int, 0444);
MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");

/*
 * modparam for setting CPU partitions patterns:
 *
 * e.g.	"0[0-3] 1[4,5,7]", number before bracket is CPU partition ID,
 *	number in bracket is processor ID (core or HT)
 *
 * e.g.	"N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
 *	are NUMA node ID, number before bracket is CPU partition ID.
 *
 * e.g.	"N C[0-1]" or "C[0-1]", the character 'C' means numbers in bracket are
 *	relative core numbers to exclude, all other cores
 *	are included. If 'N' is specified then the core numbers are relative to
 *	the NUMA nodes, otherwise the cores are relative to each partition.
 *	In the first example, the first two cores of each NUMA node
 *	will be excluded, all other cores on all nodes are included with
 *	one partition per node. In the second example, the first two cores of
 *	each partition will be excluded, all other cores on all partitions are
 *	included. The partition count is specified with cpu_npartitions.
 *
 * e.g.	"N X[0-1]" or "X[0-1]", the character 'X' means that the numbers in
 *	brackets are processor IDs to be excluded from the CPT that they belong
 *	to. If 'N' was specified it will use the default NUMA node layout,
 *	otherwise it uses the default configuration for the cpu_npartitions
 *	specified.
 *
 * e.g.	"N", shortcut expression to create CPT from NUMA & CPU topology.
 *	This is the default behavior if the cpu_pattern and cpu_npartitions
 *	are not specified.
 *
 * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
 */
module_param(cpu_pattern, charp, 0444);
MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");

/**
 * cfs_cpt_table_alloc() - allocate an empty CPU partition table
 * @ncpt: number of partitions the table will contain
 *
 * Allocates the table, its CPU and node masks, the CPU->CPT and node->CPT
 * maps (every entry set to -1) and @ncpt partitions, each with its own CPU
 * mask, node mask and distance array.  No CPU or node is assigned to any
 * partition yet.
 *
 * Return: the new table, or NULL if any allocation failed (everything
 * allocated up to that point is released).
 */
struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt)
{
	struct cfs_cpt_table *cptab;
	int i;

	LIBCFS_ALLOC(cptab, sizeof(*cptab));
	if (!cptab)
		return NULL;

	cptab->ctb_nparts = ncpt;

	if (!zalloc_cpumask_var(&cptab->ctb_cpumask, GFP_NOFS))
		goto failed_alloc_cpumask;

	LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
	if (!cptab->ctb_nodemask)
		goto failed_alloc_nodemask;

	CFS_ALLOC_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids);
	if (!cptab->ctb_cpu2cpt)
		goto failed_alloc_cpu2cpt;

	memset(cptab->ctb_cpu2cpt, -1,
	       nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));

	CFS_ALLOC_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids);
	if (!cptab->ctb_node2cpt)
		goto failed_alloc_node2cpt;

	memset(cptab->ctb_node2cpt, -1,
	       nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));

	CFS_ALLOC_PTR_ARRAY(cptab->ctb_parts, ncpt);
	if (!cptab->ctb_parts)
		goto failed_alloc_ctb_parts;

	/* NB: this leaves every pointer member of every partition holding a
	 * non-NULL garbage value until it is allocated below, so the unwind
	 * code must never look at a partition that was not fully set up.
	 */
	memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0]));

	for (i = 0; i < ncpt; i++) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		if (!zalloc_cpumask_var(&part->cpt_cpumask, GFP_NOFS))
			goto failed_setting_ctb_parts;

		LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
		if (!part->cpt_nodemask)
			goto failed_part_nodemask;

		CFS_ALLOC_PTR_ARRAY(part->cpt_distance, cptab->ctb_nparts);
		if (!part->cpt_distance)
			goto failed_part_distance;

		memset(part->cpt_distance, -1,
		       cptab->ctb_nparts * sizeof(part->cpt_distance[0]));
	}

	return cptab;

	/* undo the partially constructed partition @i first ... */
failed_part_distance:
	LIBCFS_FREE(cptab->ctb_parts[i].cpt_nodemask,
		    sizeof(*cptab->ctb_parts[i].cpt_nodemask));
failed_part_nodemask:
	free_cpumask_var(cptab->ctb_parts[i].cpt_cpumask);
failed_setting_ctb_parts:
	/* ... then the fully constructed partitions [0, i), newest first */
	while (--i >= 0) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		LIBCFS_FREE(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
		free_cpumask_var(part->cpt_cpumask);
		CFS_FREE_PTR_ARRAY(part->cpt_distance, cptab->ctb_nparts);
	}

	CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts);

failed_alloc_ctb_parts:
	CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids);

failed_alloc_node2cpt:
	CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids);

failed_alloc_cpu2cpt:
	LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
failed_alloc_nodemask:
	free_cpumask_var(cptab->ctb_cpumask);
failed_alloc_cpumask:
	LIBCFS_FREE(cptab, sizeof(*cptab));
	return NULL;
}
EXPORT_SYMBOL(cfs_cpt_table_alloc);

/**
 * cfs_cpt_table_free() - release a CPU partition table
 * @cptab: table to free
 *
 * Frees the CPU->CPT and node->CPT maps, every partition's masks and
 * distance array, the partition array, the table-wide masks and finally
 * the table itself.
 */
void cfs_cpt_table_free(struct cfs_cpt_table *cptab)
{
	int i;

	CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids);
	CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids);

	if (cptab->ctb_parts) {
		for (i = 0; i < cptab->ctb_nparts; i++) {
			struct cfs_cpu_partition *part;

			part = &cptab->ctb_parts[i];

			if (part->cpt_nodemask) {
				LIBCFS_FREE(part->cpt_nodemask,
					    sizeof(*part->cpt_nodemask));
			}

			free_cpumask_var(part->cpt_cpumask);

			CFS_FREE_PTR_ARRAY(part->cpt_distance,
					   cptab->ctb_nparts);
		}
	}

	CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts);

	/* table-wide masks go last, just before the descriptor itself */
	LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
	free_cpumask_var(cptab->ctb_cpumask);

	LIBCFS_FREE(cptab, sizeof(*cptab));
}
EXPORT_SYMBOL(cfs_cpt_table_free);

/**
 * cfs_cpt_table_print() - format the CPU membership of every partition
 * @cptab: CPU partition table to dump
 * @buf: output buffer
 * @len: size of @buf in bytes
 *
 * Writes one line per partition: the partition ID, a tab and a colon,
 * followed by the ID of each CPU in the partition preceded by a space.
 *
 * Return: number of bytes written to @buf, or -E2BIG if @buf is too small.
 */
int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
	char *pos = buf;
	int cpt;

	for (cpt = 0; cpt < cptab->ctb_nparts; cpt++) {
		int cpu;
		int nw;

		if (len <= 0)
			return -E2BIG;

		nw = snprintf(pos, len, "%d\t:", cpt);
		len -= nw;
		if (len <= 0)
			return -E2BIG;
		pos += nw;

		for_each_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask) {
			nw = snprintf(pos, len, " %d", cpu);
			len -= nw;
			if (len <= 0)
				return -E2BIG;
			pos += nw;
		}

		/* end the line where the last snprintf() stopped */
		*pos++ = '\n';
		len--;
	}

	return pos - buf;
}
EXPORT_SYMBOL(cfs_cpt_table_print);

/**
 * cfs_cpt_distance_print() - format the CPT-to-CPT distance matrix
 * @cptab: CPU partition table to dump
 * @buf: output buffer
 * @len: size of @buf in bytes
 *
 * Writes one line per partition: the partition ID, a tab and a colon,
 * followed by a " <cpt>:<distance>" pair for every partition of the table.
 *
 * Return: number of bytes written to @buf, or -E2BIG if @buf is too small.
 */
int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
	char *pos = buf;
	int from;

	for (from = 0; from < cptab->ctb_nparts; from++) {
		int to;
		int nw;

		if (len <= 0)
			return -E2BIG;

		nw = snprintf(pos, len, "%d\t:", from);
		len -= nw;
		if (len <= 0)
			return -E2BIG;
		pos += nw;

		for (to = 0; to < cptab->ctb_nparts; to++) {
			nw = snprintf(pos, len, " %d:%d", to,
				      cptab->ctb_parts[from].cpt_distance[to]);
			len -= nw;
			if (len <= 0)
				return -E2BIG;
			pos += nw;
		}

		/* end the line where the last snprintf() stopped */
		*pos++ = '\n';
		len--;
	}

	return pos - buf;
}
EXPORT_SYMBOL(cfs_cpt_distance_print);

/**
 * cfs_cpt_number() - number of partitions in a table
 * @cptab: CPU partition table
 *
 * Return: the partition count recorded in @cptab.
 */
int cfs_cpt_number(struct cfs_cpt_table *cptab)
{
	int nparts = cptab->ctb_nparts;

	return nparts;
}
EXPORT_SYMBOL(cfs_cpt_number);

/**
 * cfs_cpt_weight() - count the CPUs of a partition or of the whole table
 * @cptab: CPU partition table
 * @cpt: partition ID, or CFS_CPT_ANY for the whole table
 *
 * Return: number of CPUs set in the selected CPU mask.
 */
int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpt == CFS_CPT_ANY)
		return cpumask_weight(cptab->ctb_cpumask);

	return cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
}
EXPORT_SYMBOL(cfs_cpt_weight);

int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERTF(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts),
		 "cpt=%d, nparts=%d\n", cpt, cptab->ctb_nparts);

	return cpt == CFS_CPT_ANY ?
	       cpumask_any_and(cptab->ctb_cpumask,
			       cpu_online_mask) < nr_cpu_ids :
	       cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
			       cpu_online_mask) < nr_cpu_ids;
}
EXPORT_SYMBOL(cfs_cpt_online);

/**
 * cfs_cpt_cpumask() - get the CPU mask of a partition or of the whole table
 * @cptab: CPU partition table
 * @cpt: partition ID, or CFS_CPT_ANY for the whole table
 *
 * Return: pointer to the selected cpumask_var_t stored inside @cptab.
 */
cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpt == CFS_CPT_ANY)
		return &cptab->ctb_cpumask;

	return &cptab->ctb_parts[cpt].cpt_cpumask;
}
EXPORT_SYMBOL(cfs_cpt_cpumask);

/**
 * cfs_cpt_nodemask() - get the node mask of a partition or of the whole table
 * @cptab: CPU partition table
 * @cpt: partition ID, or CFS_CPT_ANY for the whole table
 *
 * Return: the selected node mask owned by @cptab.
 */
nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpt == CFS_CPT_ANY)
		return cptab->ctb_nodemask;

	return cptab->ctb_parts[cpt].cpt_nodemask;
}
EXPORT_SYMBOL(cfs_cpt_nodemask);

unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
{
	LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts));
	LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts));

	if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY)
		return cptab->ctb_distance;

	return cptab->ctb_parts[cpt1].cpt_distance[cpt2];
}
EXPORT_SYMBOL(cfs_cpt_distance);

/* Return the largest node_distance() over every (from, to) pair with "from"
 * taken from @from_mask and "to" taken from @to_mask; 0 if either mask is
 * empty.
 */
static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask,
					       nodemask_t *to_mask)
{
	unsigned int worst = 0;
	int src;

	for_each_node_mask(src, *from_mask) {
		int dst;

		for_each_node_mask(dst, *to_mask) {
			unsigned int dist = node_distance(src, dst);

			if (dist > worst)
				worst = dist;
		}
	}

	return worst;
}

static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	cptab->ctb_cpu2cpt[cpu] = cpt;

	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
}

static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
	cpumask_clear_cpu(cpu, cptab->ctb_cpumask);

	cptab->ctb_cpu2cpt[cpu] = -1;
}

/* Record NUMA @node as belonging to partition @cpt.
 *
 * The first time a node enters the table it is mapped to @cpt and the
 * table-wide maximum distance is recomputed.  The first time it enters
 * the partition, the distances between @cpt and every partition are
 * recomputed in both directions.
 */
static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt];
	int other;

	if (!node_isset(node, *cptab->ctb_nodemask)) {
		/* first time node is added to the CPT table */
		node_set(node, *cptab->ctb_nodemask);
		cptab->ctb_node2cpt[node] = cpt;

		cptab->ctb_distance =
			cfs_cpt_distance_calculate(cptab->ctb_nodemask,
						   cptab->ctb_nodemask);
	}

	if (node_isset(node, *part->cpt_nodemask))
		return;

	/* first time node is added to this CPT */
	node_set(node, *part->cpt_nodemask);
	for (other = 0; other < cptab->ctb_nparts; other++) {
		struct cfs_cpu_partition *peer = &cptab->ctb_parts[other];

		part->cpt_distance[other] =
			cfs_cpt_distance_calculate(part->cpt_nodemask,
						   peer->cpt_nodemask);
		peer->cpt_distance[cpt] =
			cfs_cpt_distance_calculate(peer->cpt_nodemask,
						   part->cpt_nodemask);
	}
}

/* Drop NUMA @node from partition @cpt, and from the table as a whole, once
 * no CPU of that node remains in the respective CPU mask.
 *
 * Meant to be called after the CPU(s) have already been removed from the
 * masks: the node is only cleared when the scans below find no remaining
 * CPU with cpu_to_node() == @node.
 */
static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt];
	int cpu;

	for_each_cpu(cpu, part->cpt_cpumask) {
		/* this CPT has other CPU belonging to this node? */
		if (cpu_to_node(cpu) == node)
			break;
	}

	/* cpu >= nr_cpu_ids means the scan above ran to completion */
	if (cpu >= nr_cpu_ids && node_isset(node,  *part->cpt_nodemask)) {
		int cpt2;

		/* No more CPUs in the node for this CPT. */
		node_clear(node, *part->cpt_nodemask);
		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
			struct cfs_cpu_partition *part2;
			unsigned int dist;

			part2 = &cptab->ctb_parts[cpt2];
			/* remap the node to a partition that still has it;
			 * the last such partition scanned wins
			 */
			if (node_isset(node, *part2->cpt_nodemask))
				cptab->ctb_node2cpt[node] = cpt2;

			dist = cfs_cpt_distance_calculate(part->cpt_nodemask,
							  part2->cpt_nodemask);
			part->cpt_distance[cpt2] = dist;
			dist = cfs_cpt_distance_calculate(part2->cpt_nodemask,
							  part->cpt_nodemask);
			part2->cpt_distance[cpt] = dist;
		}
	}

	for_each_cpu(cpu, cptab->ctb_cpumask) {
		/* this CPT-table has other CPUs belonging to this node? */
		if (cpu_to_node(cpu) == node)
			break;
	}

	if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) {
		/* No more CPUs in the table for this node. */
		node_clear(node, *cptab->ctb_nodemask);
		cptab->ctb_node2cpt[node] = -1;
		cptab->ctb_distance =
			cfs_cpt_distance_calculate(cptab->ctb_nodemask,
						   cptab->ctb_nodemask);
	}
}

/**
 * cfs_cpt_set_cpu() - add one CPU to a partition
 * @cptab: CPU partition table
 * @cpt: partition ID, must be a valid index (asserted)
 * @cpu: HW CPU ID to add
 *
 * The CPU is only added if it is a valid, online CPU that does not belong
 * to any partition of @cptab yet.  The NUMA node of the CPU is recorded
 * for the partition as well.
 *
 * Return: 1 if the CPU was added, 0 if it was rejected.
 */
int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	LASSERTF(cpt >= 0 && cpt < cptab->ctb_nparts, "cpt=%d, nparts=%d\n",
		 cpt, cptab->ctb_nparts);

	if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
		CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
		return 0;
	}

	if (cptab->ctb_cpu2cpt[cpu] != -1) {
		CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
		       cpu, cptab->ctb_cpu2cpt[cpu]);
		return 0;
	}

	if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) {
		CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu);
		return 0;
	}

	if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) {
		/* ctb_cpu2cpt[cpu] is known to be -1 here, so report the
		 * partition whose mask was actually tested
		 */
		CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n",
		       cpu, cpt);
		return 0;
	}

	cfs_cpt_add_cpu(cptab, cpt, cpu);
	cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpu);

/**
 * cfs_cpt_unset_cpu() - remove one CPU from a partition
 * @cptab: CPU partition table
 * @cpt: partition ID the CPU is expected to be in, or CFS_CPT_ANY to look
 *       the partition up from the CPU->CPT map
 * @cpu: HW CPU ID to remove
 *
 * Does nothing (apart from a debug message) if @cpu is out of range, is
 * not in the table, or is not in partition @cpt.  Otherwise the CPU is
 * removed and its NUMA node is dropped from the partition/table when no
 * other CPU of that node remains.
 */
void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpu < 0 || cpu >= nr_cpu_ids) {
		CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
		return;
	}

	if (cpt == CFS_CPT_ANY) {
		/* caller doesn't know the partition ID */
		cpt = cptab->ctb_cpu2cpt[cpu];
		if (cpt < 0) { /* not set in this CPT-table */
			CDEBUG(D_INFO,
			       "Try to unset cpu %d which is not in CPT-table %p\n",
			       cpu, cptab);
			return;
		}

	} else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
		CDEBUG(D_INFO,
		       "CPU %d is not in CPU partition %d\n", cpu, cpt);
		return;
	}

	/* the map and both masks must agree about membership */
	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));

	cfs_cpt_del_cpu(cptab, cpt, cpu);
	cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
}
EXPORT_SYMBOL(cfs_cpt_unset_cpu);

int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt,
			const cpumask_t *mask)
{
	int cpu;

	if (!cpumask_weight(mask) ||
	    cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
		CDEBUG(D_INFO,
		       "No online CPU is found in the CPU mask for CPU partition %d\n",
		       cpt);
		return 0;
	}

	for_each_cpu(cpu, mask) {
		cfs_cpt_add_cpu(cptab, cpt, cpu);
		cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
	}

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpumask);

/**
 * cfs_cpt_unset_cpumask() - remove every CPU of a mask from a partition
 * @cptab: CPU partition table
 * @cpt: partition ID
 * @mask: CPUs to remove
 *
 * Each CPU of @mask is removed from @cpt and its NUMA node is dropped when
 * no other CPU of that node remains.  No membership checks are made.
 */
void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt,
			   const cpumask_t *mask)
{
	int cpu;

	for_each_cpu(cpu, mask) {
		int node;

		cfs_cpt_del_cpu(cptab, cpt, cpu);

		node = cpu_to_node(cpu);
		cfs_cpt_del_node(cptab, cpt, node);
	}
}
EXPORT_SYMBOL(cfs_cpt_unset_cpumask);

int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	const cpumask_t *mask;
	int cpu;

	if (node < 0 || node >= nr_node_ids) {
		CDEBUG(D_INFO,
		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
		return 0;
	}

	mask = cpumask_of_node(node);

	if (!cpumask_empty(mask))
		for_each_cpu(cpu, mask) {
			CDEBUG(D_INFO,
		       "set_node() cpu=%d cpt=%d\n", cpu, cpt);
			cfs_cpt_add_cpu(cptab, cpt, cpu);
		}

	cfs_cpt_add_node(cptab, cpt, node);

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_node);

void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	const cpumask_t *mask;
	int cpu;

	if (node < 0 || node >= nr_node_ids) {
		CDEBUG(D_INFO,
		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
		return;
	}

	mask = cpumask_of_node(node);

	if (!cpumask_empty(mask))
		for_each_cpu(cpu, mask)
			cfs_cpt_del_cpu(cptab, cpt, cpu);

	cfs_cpt_del_node(cptab, cpt, node);
}
EXPORT_SYMBOL(cfs_cpt_unset_node);

/**
 * cfs_set_node_core() - (re)add a node-relative range of cores
 * @cptab: CPU partition table
 * @include_lo: first core to include, relative to the first CPU of a node
 * @include_hi: last core to include, relative to the first CPU of a node
 *
 * For every online NUMA node, the CPUs whose ID lies in
 * [first + @include_lo, first + @include_hi], "first" being the lowest CPU
 * ID of the node, are added to the partition they are currently mapped to.
 * CPUs that are not mapped to any partition are skipped, since there is no
 * partition to add them to.
 */
void cfs_set_node_core(struct cfs_cpt_table *cptab,
		      int include_lo, int include_hi)
{
	const cpumask_t *mask;
	int node, cpu;
	int offset;

	for_each_online_node(node) {
		offset = -1;
		mask = cpumask_of_node(node);
		if (cpumask_empty(mask))
			continue;

		for_each_cpu(cpu, mask) {
			int cpt;

			if (offset < 0)
				offset = cpu;
			if (include_lo + offset > cpu ||
			    include_hi + offset < cpu)
				continue;

			/* -1 means "not in the table"; it must not be used
			 * as a partition index
			 */
			cpt = cfs_cpt_of_cpu(cptab, cpu);
			if (cpt < 0)
				continue;

			cfs_cpt_add_cpu(cptab, cpt, cpu);
		}
	}
}
EXPORT_SYMBOL(cfs_set_node_core);

/**
 * cfs_unset_node_core() - exclude a node-relative range of cores
 * @cptab: CPU partition table
 * @exclude_lo: first core to exclude, relative to the first CPU of a node
 * @exclude_hi: last core to exclude, relative to the first CPU of a node
 *
 * For every online NUMA node, the CPUs whose ID lies in
 * [first + @exclude_lo, first + @exclude_hi], "first" being the lowest CPU
 * ID of the node, are removed from the partition they are mapped to.
 * CPUs that are not mapped to any partition (e.g. already excluded by an
 * overlapping range) are skipped.
 */
void cfs_unset_node_core(struct cfs_cpt_table *cptab,
			 int exclude_lo, int exclude_hi)
{
	const cpumask_t *mask;
	int node, cpu;
	int offset;

	for_each_online_node(node) {
		offset = -1;
		mask = cpumask_of_node(node);
		if (cpumask_empty(mask))
			continue;

		for_each_cpu(cpu, mask) {
			int cpt;

			if (offset < 0)
				offset = cpu;
			if (exclude_lo + offset > cpu ||
			    exclude_hi + offset < cpu)
				continue;

			/* -1 means "not in the table"; it must not be used
			 * as a partition index
			 */
			cpt = cfs_cpt_of_cpu(cptab, cpu);
			if (cpt < 0)
				continue;

			cfs_cpt_del_cpu(cptab, cpt, cpu);
		}
	}
}
EXPORT_SYMBOL(cfs_unset_node_core);

/**
 * cfs_set_cpt_core() - (re)add a partition-relative range of cores
 * @cptab: CPU partition table
 * @include_lo: first core to include, relative to the partition's first CPU
 * @include_hi: last core to include, relative to the partition's first CPU
 *
 * For every non-empty partition, the CPUs of its mask whose ID lies in
 * [first + @include_lo, first + @include_hi], "first" being the lowest CPU
 * ID of the partition, are passed to cfs_cpt_add_cpu() for that partition.
 */
void cfs_set_cpt_core(struct cfs_cpt_table *cptab,
			int include_lo, int include_hi)
{
	int cpt;

	for (cpt = 0; cpt < cptab->ctb_nparts; cpt++) {
		const cpumask_t *mask = cptab->ctb_parts[cpt].cpt_cpumask;
		int first = -1;
		int cpu;

		if (cpumask_empty(mask))
			continue;

		for_each_cpu(cpu, mask) {
			if (first < 0)
				first = cpu;
			if (cpu >= include_lo + first &&
			    cpu <= include_hi + first)
				cfs_cpt_add_cpu(cptab, cpt, cpu);
		}
	}
}
EXPORT_SYMBOL(cfs_set_cpt_core);

/**
 * cfs_unset_cpt_core() - exclude a partition-relative range of cores
 * @cptab: CPU partition table
 * @exclude_lo: first core to exclude, relative to the partition's first CPU
 * @exclude_hi: last core to exclude, relative to the partition's first CPU
 *
 * For every non-empty partition, the CPUs of its mask whose ID lies in
 * [first + @exclude_lo, first + @exclude_hi], "first" being the lowest CPU
 * ID found in the partition, are removed from that partition.
 */
void cfs_unset_cpt_core(struct cfs_cpt_table *cptab,
			int exclude_lo, int exclude_hi)
{
	int cpt;

	for (cpt = 0; cpt < cptab->ctb_nparts; cpt++) {
		const cpumask_t *mask = cptab->ctb_parts[cpt].cpt_cpumask;
		int first = -1;
		int cpu;

		if (cpumask_empty(mask))
			continue;

		for_each_cpu(cpu, mask) {
			if (first < 0)
				first = cpu;
			if (cpu >= exclude_lo + first &&
			    cpu <= exclude_hi + first)
				cfs_cpt_del_cpu(cptab, cpt, cpu);
		}
	}
}
EXPORT_SYMBOL(cfs_unset_cpt_core);

int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt,
			 const nodemask_t *mask)
{
	int node;

	for_each_node_mask(node, *mask)
		cfs_cpt_set_node(cptab, cpt, node);

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_nodemask);

void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt,
			    const nodemask_t *mask)
{
	int node;

	for_each_node_mask(node, *mask)
		cfs_cpt_unset_node(cptab, cpt, node);
}
EXPORT_SYMBOL(cfs_cpt_unset_nodemask);

int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
{
	nodemask_t *mask;
	int weight;
	unsigned int rotor;
	int node = 0;

	/* convert CPU partition ID to HW node id */

	if (cpt < 0 || cpt >= cptab->ctb_nparts) {
		mask = cptab->ctb_nodemask;
		rotor = cptab->ctb_spread_rotor++;
	} else {
		mask = cptab->ctb_parts[cpt].cpt_nodemask;
		rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
		node  = cptab->ctb_parts[cpt].cpt_node;
	}

	weight = nodes_weight(*mask);
	if (weight > 0) {
		rotor %= weight;

		for_each_node_mask(node, *mask) {
			if (!rotor--)
				return node;
		}
	}

	return node;
}
EXPORT_SYMBOL(cfs_cpt_spread_node);

/**
 * cfs_cpt_current() - partition ID of the CPU the caller is running on
 * @cptab: CPU partition table
 * @remap: if non-zero, never return a negative value
 *
 * Return: the partition the current CPU is mapped to.  If the CPU is not
 * in the table, the result is negative unless @remap is set, in which case
 * the CPU is shadowed onto partition (cpu % number of partitions) so that
 * upper layers always get a valid ID.
 */
int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
{
	int this_cpu;
	int cpt;

	preempt_disable();

	this_cpu = smp_processor_id();
	cpt = cptab->ctb_cpu2cpt[this_cpu];
	if (remap && cpt < 0)
		cpt = this_cpu % cptab->ctb_nparts;

	preempt_enable();

	return cpt;
}
EXPORT_SYMBOL(cfs_cpt_current);

/**
 * cfs_cpt_of_cpu() - partition ID a CPU is mapped to
 * @cptab: CPU partition table
 * @cpu: HW CPU ID, must be in [0, nr_cpu_ids) (asserted)
 *
 * Return: the CPU->CPT map entry for @cpu (-1 if the CPU is not in the
 * table).
 */
int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
{
	int cpt;

	LASSERT(cpu >= 0 && cpu < nr_cpu_ids);

	cpt = cptab->ctb_cpu2cpt[cpu];

	return cpt;
}
EXPORT_SYMBOL(cfs_cpt_of_cpu);

/**
 * cfs_cpt_of_node() - partition ID a NUMA node is mapped to
 * @cptab: CPU partition table
 * @node: NUMA node ID
 *
 * Return: the node->CPT map entry for @node (-1 if the node is not in the
 * table), or CFS_CPT_ANY if @node is outside [0, nr_node_ids).
 */
int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
{
	/* ctb_node2cpt has nr_node_ids entries, so nr_node_ids itself is
	 * already out of bounds
	 */
	if (node < 0 || node >= nr_node_ids)
		return CFS_CPT_ANY;

	return cptab->ctb_node2cpt[node];
}
EXPORT_SYMBOL(cfs_cpt_of_node);

/**
 * cfs_cpt_bind() - bind the current task to the CPUs of a partition
 * @cptab: CPU partition table
 * @cpt: partition ID, or CFS_CPT_ANY for all CPUs of the table
 *
 * If at least one online CPU lies outside the selected CPU mask, the
 * current task's allowed CPUs are set to that mask and its allowed memory
 * nodes to the matching node mask.  If the mask already covers every
 * online CPU nothing is changed.
 *
 * Return:
 * * %0 on success, or when no affinity change was needed
 * * %-ENODEV if the selected mask contains no online CPU
 * * otherwise the error returned by set_cpus_allowed_ptr()
 */
int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
	nodemask_t *nodemask;
	cpumask_t *cpumask;
	int cpu;
	int rc;

	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpt == CFS_CPT_ANY) {
		cpumask = cptab->ctb_cpumask;
		nodemask = cptab->ctb_nodemask;
	} else {
		cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
		nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
	}

	if (!cpumask_intersects(cpumask, cpu_online_mask)) {
		CDEBUG(D_INFO,
		       "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
			cpt);
		return -ENODEV;
	}

	/* look for the first online CPU that is NOT in the mask; the loop
	 * body past the "continue" runs at most once
	 */
	for_each_online_cpu(cpu) {
		if (cpumask_test_cpu(cpu, cpumask))
			continue;

		rc = set_cpus_allowed_ptr(current, cpumask);
		/* NB: the memory nodes are updated even if rc != 0 */
		set_mems_allowed(*nodemask);
		if (!rc)
			schedule(); /* switch to allowed CPU */

		return rc;
	}

	/* don't need to set affinity because all online CPUs are covered */
	return 0;
}
EXPORT_SYMBOL(cfs_cpt_bind);

/**
 * cfs_cpt_choose_ncpus() - Choose at most @number CPUs from @node_mask and
 *                          set them in @cpt.
 * @cptab: CPU Partitioning Table
 * @cpt: partitioning index
 * @node_mask: candidate CPUs; every CPU that is examined is cleared from
 *             this mask, so the caller sees what is left over
 * @number: Count of CPU to select, must be positive (asserted)
 *
 * We always prefer to choose CPU in the same core/socket.
 *
 * If @number covers the whole of @node_mask, every online CPU of the mask
 * is added.  Otherwise CPUs are taken socket by socket and, within a
 * socket, core by core (all HTs of a core together) until @number online
 * CPUs have been added.
 *
 * Return:
 * * %0 on success
 * * %-EINVAL if cfs_cpt_set_cpu() rejected a CPU
 * * %-ENOMEM if a scratch cpumask could not be allocated
 */
static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
				cpumask_t *node_mask, int number)
{
	cpumask_var_t socket_mask;
	cpumask_var_t core_mask;
	int rc = 0;
	int cpu;
	int i;

	LASSERT(number > 0);

	/* fast path: take everything that is online */
	if (number >= cpumask_weight(node_mask)) {
		while (!cpumask_empty(node_mask)) {
			cpu = cpumask_first(node_mask);
			cpumask_clear_cpu(cpu, node_mask);

			if (!cpu_online(cpu))
				continue;

			rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
			if (!rc)
				return -EINVAL;
		}
		return 0;
	}

	/* Allocate scratch buffers
	 * As we cannot initialize a cpumask_var_t, we need
	 * to alloc both before we can risk trying to free either
	 */
	if (!zalloc_cpumask_var(&socket_mask, GFP_NOFS))
		rc = -ENOMEM;
	if (!zalloc_cpumask_var(&core_mask, GFP_NOFS))
		rc = -ENOMEM;
	if (rc)
		goto out;

	while (!cpumask_empty(node_mask)) {
		cpu = cpumask_first(node_mask);

		/* get cpumask for cores in the same socket */
		cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask);
		while (!cpumask_empty(socket_mask)) {
			/* get cpumask for hts in the same core */
			cpumask_and(core_mask, topology_sibling_cpumask(cpu),
				    node_mask);

			for_each_cpu(i, core_mask) {
				/* consumed whether or not it ends up added */
				cpumask_clear_cpu(i, socket_mask);
				cpumask_clear_cpu(i, node_mask);

				if (!cpu_online(i))
					continue;

				/* cfs_cpt_set_cpu() returns 1 on success */
				rc = cfs_cpt_set_cpu(cptab, cpt, i);
				if (!rc) {
					rc = -EINVAL;
					goto out;
				}

				/* NB: rc is still 1 (not 0) when leaving
				 * through here; callers only test rc < 0
				 */
				if (!--number)
					goto out;
			}
			cpu = cpumask_first(socket_mask);
		}
	}

out:
	free_cpumask_var(socket_mask);
	free_cpumask_var(core_mask);
	return rc;
}

/* systems with no more than this many online CPUs get a single partition */
#define CPT_WEIGHT_MIN	4u

/* Estimate a suitable number of CPU partitions from the number of online
 * CPUs and the number of HW threads per core of the current CPU.  The
 * result always divides the number of online CPUs evenly and is at least 1.
 */
static unsigned int cfs_cpt_num_estimate(void)
{
	unsigned int ncpu = num_online_cpus();
	unsigned int ncpt = 1;
	unsigned int nthr;

	preempt_disable();
	nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id()));
	preempt_enable();

	if (ncpu > CPT_WEIGHT_MIN) {
		/* smallest ncpt >= 2 with ncpu <= 2 * nthr * ncpt */
		ncpt = 2;
		while (ncpu > 2 * nthr * ncpt)
			ncpt++;
	}

#if (BITS_PER_LONG == 32)
	/* config many CPU partitions on 32-bit system could consume
	 * too much memory
	 */
	ncpt = min(2U, ncpt);
#endif
	/* shrink until the CPUs divide evenly; worst case is 1 */
	while (ncpu % ncpt != 0)
		ncpt--;

	return ncpt;
}

/**
 * cfs_cpt_table_create() - build a partition table with evenly sized CPTs
 * @ncpt: requested number of partitions; <= 0 selects the estimate from
 *        cfs_cpt_num_estimate()
 *
 * The online CPUs are spread over @ncpt partitions, walking the online
 * NUMA nodes in order and filling one partition after the other.  Each
 * partition receives num_online_cpus() / @ncpt CPUs, and the first
 * (num_online_cpus() % @ncpt) partitions receive one extra CPU.
 *
 * Return: the new table, or an ERR_PTR() (never NULL) on failure:
 * -EINVAL if @ncpt exceeds the number of online CPUs or CPU selection
 * failed, -ENOMEM if an allocation failed.
 */
static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt)
{
	struct cfs_cpt_table *cptab = NULL;
	cpumask_var_t node_mask;
	int cpt = 0;
	int node;
	int num;
	int rem;
	int rc = 0;

	num = cfs_cpt_num_estimate();
	if (ncpt <= 0)
		ncpt = num;

	if (ncpt > num_online_cpus()) {
		rc = -EINVAL;
		CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n",
		       ncpt, num_online_cpus(), rc);
		goto failed;
	}

	if (ncpt > 4 * num) {
		CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
		      ncpt, num);
	}

	cptab = cfs_cpt_table_alloc(ncpt);
	if (!cptab) {
		CERROR("Failed to allocate CPU map(%d)\n", ncpt);
		rc = -ENOMEM;
		goto failed;
	}

	if (!zalloc_cpumask_var(&node_mask, GFP_NOFS)) {
		CERROR("Failed to allocate scratch cpumask\n");
		rc = -ENOMEM;
		goto failed;
	}

	/* from here on "num" is the base number of CPUs per partition and
	 * "rem" the number of partitions that still get one extra CPU
	 */
	num = num_online_cpus() / ncpt;
	rem = num_online_cpus() % ncpt;
	for_each_online_node(node) {
		cpumask_copy(node_mask, cpumask_of_node(node));

		/* cfs_cpt_choose_ncpus() consumes node_mask, so this loop
		 * ends once the node is used up or all partitions are full
		 */
		while (cpt < ncpt && !cpumask_empty(node_mask)) {
			struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt];
			int ncpu = cpumask_weight(part->cpt_cpumask);

			rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask,
						  (rem > 0) + num - ncpu);
			if (rc < 0) {
				rc = -EINVAL;
				goto failed_mask;
			}

			/* move on once this partition reached its quota */
			ncpu = cpumask_weight(part->cpt_cpumask);
			if (ncpu == num + !!(rem > 0)) {
				cpt++;
				rem--;
			}
		}
	}

	free_cpumask_var(node_mask);

	return cptab;

failed_mask:
	free_cpumask_var(node_mask);
failed:
	CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n",
	       rc, ncpt, num_online_nodes(), num_online_cpus());

	if (cptab)
		cfs_cpt_table_free(cptab);

	return ERR_PTR(rc);
}

/**
 * cfs_cpt_table_create_pattern() - build a partition table from a pattern
 * @pattern: pattern string, see the description of the cpu_pattern modparam
 *
 * Recognised prefixes, in this order: 'N' (numbers are NUMA node IDs),
 * 'X' (numbers are CPU IDs to exclude) and 'C' (numbers are relative core
 * numbers to exclude).  A bare "N" creates one partition per online NUMA
 * node that has CPUs, unless cpu_npartitions is set, in which case
 * cfs_cpt_table_create() is used instead.
 *
 * For exclude patterns a default layout is created first (one partition
 * per populated NUMA node with 'N', the cfs_cpt_table_create() layout
 * otherwise) and the listed CPUs/cores are then removed from it.  For
 * include patterns each "<cpt>[<list>]" group assigns the listed CPUs or
 * nodes to partition <cpt>.
 *
 * Return: the new table, or an ERR_PTR() on failure.
 */
static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern)
{
	struct cfs_cpt_table *cptab;
	char *pattern_dup;
	char *bracket;
	char *str;
	bool exclude = false;
	bool relative = false;
	int node = 0;
	int ncpt = cpu_npartitions;
	int cpt = 0;
	int high = 0;
	int rc;
	int c;
	int i;

	/* work on a private copy: strim() below modifies the string */
	pattern_dup = kstrdup(pattern, GFP_KERNEL);
	if (!pattern_dup) {
		CERROR("Failed to duplicate pattern '%s'\n", pattern);
		return ERR_PTR(-ENOMEM);
	}

	str = skip_spaces(pattern_dup);
	if (*str == 'n' || *str == 'N') {
		str++; /* skip 'N' char */
		node = 1; /* NUMA pattern */
		if (*str == '\0') {
			if (cpu_npartitions) {
				kfree(pattern_dup);
				return cfs_cpt_table_create(cpu_npartitions);
			}
			node = -1;
			for_each_online_node(i)
				if (!cpumask_empty(cpumask_of_node(i)))
					ncpt++;
		}
		str = skip_spaces(str);
	}
	if (*str == 'x' || *str == 'X') {
		str++; /* skip 'X' char */
		exclude = true;
		str = skip_spaces(str);
	}

	if (*str == 'c' || *str == 'C') {
		str++; /* skip 'C' char */
		exclude = true;
		relative = true;
	}
	if (node && !ncpt) {
		for_each_online_node(i) {
			if (!cpumask_empty(cpumask_of_node(i)))
				ncpt++;
		}
	} else if (!ncpt) { /* scan for bracket at start of partition */
		bracket = str;
		while ((bracket = strchr(bracket, '['))) {
			bracket++;
			ncpt++;
		}
	}

	if ((!ncpt && !exclude) ||
	    (node && ncpt > num_online_nodes()) ||
	    (!node && ncpt > num_online_cpus())) {
		CERROR("Invalid pattern '%s', or too many partitions %d\n",
		       pattern_dup, ncpt);
		rc = -EINVAL;
		goto err_free_str;
	}

	cptab = cfs_cpt_table_alloc(ncpt);
	if (!cptab) {
		CERROR("Failed to allocate CPU partition table\n");
		rc = -ENOMEM;
		goto err_free_str;
	}

	if (exclude || node < 0) { /* create a default cpu layout */
		if (node) {
			for_each_online_node(i) {
				if (cpumask_empty(cpumask_of_node(i)))
					continue;

				/* ncpt may come from cpu_npartitions and be
				 * smaller than the number of populated
				 * nodes; never index past the table
				 */
				if (cpt >= ncpt) {
					CERROR("Too many NUMA nodes for %d partitions\n",
					       ncpt);
					rc = -EINVAL;
					goto err_free_table;
				}

				rc = cfs_cpt_set_node(cptab, cpt++, i);
				if (!rc) {
					rc = -EINVAL;
					goto err_free_table;
				}

				if (relative) {
					/* track the CPU count of the smallest
					 * node (rc is only a loop cursor here)
					 */
					c = 0;
					for_each_cpu(rc, cpumask_of_node(i))
						c++;
					if (high == 0 || c < high)
						high = c;
				}
			}
			if (node < 0) { /* return layout for only "N" */
				kfree(pattern_dup);
				return cptab;
			}
		} else {
			cfs_cpt_table_free(cptab); /* free old table */
			cptab = cfs_cpt_table_create(ncpt);
			/* cfs_cpt_table_create() reports failure with an
			 * ERR_PTR(), never with NULL
			 */
			if (IS_ERR(cptab)) {
				rc = PTR_ERR(cptab);
				CERROR("Failed to allocate CPU partition table based on cpu_npartitions: rc=%d\n",
				       -rc);
				goto err_free_str;
			}
		}
		if (!relative)
			high = num_online_cpus() - 1;
	}

	if (!exclude)
		high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;

	/* one iteration per "[...]" group of the pattern */
	for (c = 0; c < num_possible_cpus() /* should end sooner */; c++) {
		struct cfs_range_expr *range;
		struct cfs_expr_list *el;
		int n;

		bracket = strchr(str, '[');
		if (!bracket) {
			if (*str) {
				CERROR("Invalid pattern '%s'\n",
				       str);
				rc = -EINVAL;
				goto err_free_table;
			} else if (!exclude && c != ncpt) {
				CERROR("Expect %d partitions but found %d\n",
				       ncpt, c);
				rc = -EINVAL;
				goto err_free_table;
			}
			break;
		}

		/* include patterns carry the partition ID before the '[' */
		if (!exclude && sscanf(str, "%d%n", &cpt, &n) < 1) {
			CERROR("Invalid CPU pattern '%s'\n", str);
			rc = -EINVAL;
			goto err_free_table;
		}

		if (!exclude && (cpt < 0 || cpt >= ncpt)) {
			CERROR("Invalid partition id %d, total partitions %d\n",
			       cpt, ncpt);
			rc = -EINVAL;
			goto err_free_table;
		}

		if (!exclude && cfs_cpt_weight(cptab, cpt)) {
			CERROR("Partition %d has already been set.\n", cpt);
			rc = -EPERM;
			goto err_free_table;
		}

		/* n is only set (and only used) for include patterns */
		str = exclude ? bracket : strim(str + n); /* jump to next '[' */
		if (!exclude && str != bracket) {
			CERROR("Invalid pattern '%s' does not start with '['\n",
			       str);
			rc = -EINVAL;
			goto err_free_table;
		}

		bracket = strchr(str, ']');
		if (!bracket) {
			CERROR("Missing right bracket for partition %d in '%s'\n",
			       cpt, str);
			rc = -EINVAL;
			goto err_free_table;
		}

		rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high,
					 &el);
		if (rc) {
			CERROR("Can't parse number range in '%s'\n", str);
			rc = -ERANGE;
			goto err_free_table;
		}

		list_for_each_entry(range, &el->el_exprs, re_link) {
			if (exclude && relative) {
				if (node)
					cfs_unset_node_core(cptab,
							    range->re_lo,
							    range->re_hi);
				else
					cfs_unset_cpt_core(cptab,
							   range->re_lo,
							   range->re_hi);
				continue;
			}

			for (i = range->re_lo; i <= range->re_hi; i++) {
				if ((i - range->re_lo) % range->re_stride)
					continue;

				if (exclude) {
					cfs_cpt_unset_cpu(cptab,
							  cfs_cpt_of_cpu(cptab,
									 i), i);
					continue;
				}

				rc = node ?
				     cfs_cpt_set_node(cptab, cpt, i)
				     : cfs_cpt_set_cpu(cptab, cpt, i);

				if (!rc) {
					cfs_expr_list_free(el);
					rc = -EINVAL;
					goto err_free_table;
				}
			}
		}

		cfs_expr_list_free(el);

		if (exclude || relative) {
			/* an exclusion must not empty any partition */
			for (cpt = 0; cpt < ncpt; cpt++) {
				if (!cfs_cpt_online(cptab, cpt)) {
					rc = -ENODEV;
					CERROR("All cores are excluded on partition %d: rc=%d\n",
					       cpt, -rc);
					goto err_free_table;
				}
			}
		} else if (!exclude && !cfs_cpt_online(cptab, cpt)) {
			CERROR("No online CPU is found on partition %d\n", cpt);
			rc = -ENODEV;
			goto err_free_table;
		}

		str = strim(bracket + 1);
	}

	kfree(pattern_dup);
	return cptab;

err_free_table:
	cfs_cpt_table_free(cptab);
err_free_str:
	kfree(pattern_dup);
	return ERR_PTR(rc);
}

/* Header placed in front of the pointer array handed out by
 * cfs_percpt_alloc(); callers only see &va_ptrs[0] and the header is
 * recovered from it with container_of().
 */
struct cfs_var_array {
	unsigned int		va_count;	/* # of buffers */
	unsigned int		va_size;	/* size of each var */
	struct cfs_cpt_table	*va_cptab;	/* cpu partition table */
	void			*va_ptrs[];	/* buffer addresses */
};

/* free per-partition data allocated by cfs_percpt_alloc()
 *
 * @vars is the pointer array returned by cfs_percpt_alloc().  Every buffer
 * and then the array itself are released.
 */
void
cfs_percpt_free(void *vars)
{
	struct cfs_var_array *arr;
	unsigned int i;

	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);

	for (i = 0; i < arr->va_count; i++) {
		/* slots can still be empty when cfs_percpt_alloc() bails out
		 * part-way through; skip them, as the other frees in this
		 * file do for pointers that may be NULL
		 */
		if (arr->va_ptrs[i])
			LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
	}

	LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
				  va_ptrs[arr->va_count]));
}
EXPORT_SYMBOL(cfs_percpt_free);

/* allocate per cpu-partition variables
 *
 * The returned value is an array of pointers with one entry per partition
 * of @cptab, so a variable can be indexed by CPU partition ID, i.e:
 *
 *	arr = cfs_percpt_alloc(cfs_cpu_pt, size);
 *	then caller can access the memory block of partition 0 by arr[0],
 *	the memory block of partition 1 by arr[1]...
 *	the memory block of partition N by arr[N]...
 *
 * @size is rounded up with L1_CACHE_ALIGN() before the blocks are
 * allocated.  Returns NULL if any allocation fails; release the result
 * with cfs_percpt_free().
 */
void *
cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
{
	struct cfs_var_array *arr;
	int count = cfs_cpt_number(cptab);
	int i;

	LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
	if (!arr)
		return NULL;

	size = L1_CACHE_ALIGN(size);

	arr->va_cptab = cptab;
	arr->va_count = count;
	arr->va_size = size;

	for (i = 0; i < count; i++) {
		LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size);
		if (arr->va_ptrs[i])
			continue;

		/* give back whatever was allocated so far */
		cfs_percpt_free((void *)&arr->va_ptrs[0]);
		return NULL;
	}

	return (void *)&arr->va_ptrs[0];
}
EXPORT_SYMBOL(cfs_percpt_alloc);

/* return the number of elements in per-partition data @vars, i.e. the
 * partition count of the table it was allocated for by cfs_percpt_alloc()
 */
int
cfs_percpt_number(void *vars)
{
	struct cfs_var_array *hdr =
		container_of(vars, struct cfs_var_array, va_ptrs[0]);

	return hdr->va_count;
}
EXPORT_SYMBOL(cfs_percpt_number);

#ifdef CONFIG_HOTPLUG_CPU
static enum cpuhp_state lustre_cpu_online;

static int cfs_cpu_online(unsigned int cpu)
{
	return 0;
}

static int cfs_cpu_dead(unsigned int cpu)
{
	bool warn;

	/* if all HTs in a core are offline, it may break affinity */
	warn = cpumask_any_and(topology_sibling_cpumask(cpu),
			       cpu_online_mask) >= nr_cpu_ids;
	CDEBUG(warn ? D_WARNING : D_INFO,
	       "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n",
	       cpu);
	return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */

/* Undo cfs_cpu_init(): free the global partition table if it holds a valid
 * table and remove the CPU hotplug callbacks.
 *
 * NB: cfs_cpt_tab itself is not reset here.
 */
void cfs_cpu_fini(void)
{
	if (!IS_ERR_OR_NULL(cfs_cpt_tab))
		cfs_cpt_table_free(cfs_cpt_tab);

#ifdef CONFIG_HOTPLUG_CPU
	if (lustre_cpu_online > 0)
		cpuhp_remove_state_nocalls(lustre_cpu_online);
	/* NOTE(review): the "dead" callback is removed by the generic
	 * CPUHP_BP_PREPARE_DYN id rather than a saved state number - confirm
	 * this matches what cpuhp_setup_state_nocalls() registered
	 */
	cpuhp_remove_state_nocalls(CPUHP_BP_PREPARE_DYN);
#endif
}

/* Set up the global CPU partition table.
 *
 * Registers the CPU hotplug callbacks (when CONFIG_HOTPLUG_CPU is set) and
 * then creates cfs_cpt_tab, from cpu_pattern if that string is non-empty
 * and from cpu_npartitions otherwise.  The table is built under
 * cpus_read_lock().
 *
 * Returns 0 on success.  On failure returns the error of the hotplug
 * registration, or -EINVAL if the table could not be created (the specific
 * creation error is only logged); cfs_cpt_tab is then left holding the
 * ERR_PTR() value.
 */
int cfs_cpu_init(void)
{
	int ret;

	LASSERT(!cfs_cpt_tab);

#ifdef CONFIG_HOTPLUG_CPU
	ret = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
					"fs/lustre/cfe:dead", NULL,
					cfs_cpu_dead);
	if (ret < 0)
		goto failed_cpu_dead;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"fs/lustre/cfe:online",
					cfs_cpu_online, NULL);
	if (ret < 0)
		goto failed_cpu_online;

	/* remember the state so that it can be removed later */
	lustre_cpu_online = ret;
#endif
	cpus_read_lock();
	if (*cpu_pattern) {
		cfs_cpt_tab = cfs_cpt_table_create_pattern(cpu_pattern);
		if (IS_ERR(cfs_cpt_tab)) {
			ret = PTR_ERR(cfs_cpt_tab);
			pr_err("libcfs: failed to create cptab from pattern '%s': rc = %d\n",
			       cpu_pattern, ret);
			goto failed_alloc_table;
		}
	} else {
		cfs_cpt_tab = cfs_cpt_table_create(cpu_npartitions);
		if (IS_ERR(cfs_cpt_tab)) {
			ret = PTR_ERR(cfs_cpt_tab);
			pr_err("libcfs: failed to create cptab with npartitions=%d: rc = %d\n",
			       cpu_npartitions, ret);
			goto failed_alloc_table;
		}
	}

	cpus_read_unlock();

	pr_notice("libcfs: HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n",
		  num_online_nodes(), num_online_cpus(),
		  cfs_cpt_number(cfs_cpt_tab));
	return 0;

failed_alloc_table:
	cpus_read_unlock();

	/* both jumps to this label happen with cfs_cpt_tab holding an error
	 * pointer, so nothing is freed here in practice
	 */
	if (!IS_ERR_OR_NULL(cfs_cpt_tab))
		cfs_cpt_table_free(cfs_cpt_tab);

	/* NB: overrides the PTR_ERR() value stored in ret above */
	ret = -EINVAL;
#ifdef CONFIG_HOTPLUG_CPU
	if (lustre_cpu_online > 0)
		cpuhp_remove_state_nocalls(lustre_cpu_online);
failed_cpu_online:
	cpuhp_remove_state_nocalls(CPUHP_BP_PREPARE_DYN);
failed_cpu_dead:
#endif
	return ret;
}