Viewing: ext4-simple-blockalloc.patch

commit 95f8ae5677491508ae7182b4f61ead3d413434ae
Author:     Artem Blagodarenko <artem.blagodarenko@hpe.com>
AuthorDate: Thu Jun 6 16:50:11 2019 +0300

LU-12103 ldiskfs: don't search large block range if disk full

Block allocator tries to find:
1) group with the same range as required
2) group with the same average range as required
3) group with required amount of space
4) any group

On a nearly full disk, step 1 fails with high
probability, but takes a lot of time.

Skip the 1st step if available disk space < 25%
Skip the 2nd step if available disk space < 15%
Skip the 3rd step if available disk space < 5%
Also check if group has any free space on step 4.

These three thresholds can be adjusted through the added interface.

Variables are added which count unsuccessful group processing loops.
This can show allocator effectiveness in different circumstances.

These statistics are output through the mb_alloc_stats file. This file is
useful for tracking allocator activity.

Signed-off-by: Artem Blagodarenko <c17828@cray.com>
Cray-bug-id: LUS-6746
Reviewed-by: Wang Shilong <wshilong@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Change-Id: I18c7147e32951c49e12a2444803aa2995bb4ae2d
Reviewed-on: https://review.whamcloud.com/35180

Index: linux-stage/fs/ext4/ext4.h
===================================================================
--- linux-stage.orig/fs/ext4/ext4.h
+++ linux-stage/fs/ext4/ext4.h
@@ -1494,6 +1494,9 @@ struct ext4_sb_info {
 	unsigned int s_mb_min_to_scan;
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
+	ext4_fsblk_t s_mb_c1_blocks;
+	ext4_fsblk_t s_mb_c2_blocks;
+	ext4_fsblk_t s_mb_c3_blocks;
 	unsigned long *s_mb_prealloc_table;
 	unsigned int s_mb_group_prealloc;
 	unsigned int s_max_dir_size_kb;
@@ -1510,6 +1513,9 @@ struct ext4_sb_info {
 	atomic_t s_bal_goals;	/* goal hits */
 	atomic_t s_bal_breaks;	/* too long searches */
 	atomic_t s_bal_2orders;	/* 2^order hits */
+	/* cX loop didn't find blocks */
+	atomic64_t s_bal_cX_failed[4];
+	atomic64_t s_bal_cX_skipped[3];
 	spinlock_t s_bal_lock;
 	unsigned long s_mb_buddies_generated;
 	unsigned long long s_mb_generation_time;
@@ -2723,6 +2729,9 @@ ext4_read_inode_bitmap(struct super_bloc
 /* mballoc.c */
 extern const struct file_operations ext4_seq_prealloc_table_fops;
 extern const struct seq_operations ext4_mb_seq_groups_ops;
+extern const struct file_operations ext4_mb_seq_alloc_fops;
+extern int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+				  ext4_fsblk_t *blocks);
 extern const struct file_operations ext4_seq_mb_last_group_fops;
 extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
 extern long ext4_mb_stats;
Index: linux-stage/fs/ext4/mballoc.c
===================================================================
--- linux-stage.orig/fs/ext4/mballoc.c
+++ linux-stage/fs/ext4/mballoc.c
@@ -2114,6 +2114,20 @@ static int ext4_mb_good_group(struct ext
 	return 0;
 }
 
+/* Blocks usable by the allocator: free - dirty - reserved, never negative. */
+static u64 available_blocks_count(struct ext4_sb_info *sbi)
+{
+	u64 reserved = ext4_r_blocks_count(sbi->s_es) +
+		       EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
+	s64 cfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+		    percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+	u64 bfree = EXT4_C2B(sbi, max_t(s64, cfree, 0));
+
+	if (bfree <= reserved)	/* unsigned wrap would look like an empty disk */
+		return 0;
+	return bfree - reserved;
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2123,6 +2137,7 @@ ext4_mb_regular_allocator(struct ext4_al
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	struct ext4_buddy e4b;
+	ext4_fsblk_t avail_blocks;
 
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
@@ -2175,6 +2190,21 @@ ext4_mb_regular_allocator(struct ext4_al
 
 	/* Let's just scan groups to find more-less suitable blocks */
 	cr = ac->ac_2order ? 0 : 1;
+
+	/* Choose what loop to pass based on disk fullness */
+	avail_blocks = available_blocks_count(sbi) ;
+
+	if (avail_blocks < sbi->s_mb_c3_blocks) {
+		cr = 3;
+		atomic64_inc(&sbi->s_bal_cX_skipped[2]);
+	} else if(avail_blocks < sbi->s_mb_c2_blocks) {
+		cr = 2;
+		atomic64_inc(&sbi->s_bal_cX_skipped[1]);
+	} else if(avail_blocks < sbi->s_mb_c1_blocks) {
+		cr = 1;
+		atomic64_inc(&sbi->s_bal_cX_skipped[0]);
+	}
+
 	/*
 	 * cr == 0 try to get exact allocation,
 	 * cr == 3  try to get anything
@@ -2240,6 +2270,9 @@ repeat:
 			if (ac->ac_status != AC_STATUS_CONTINUE)
 				break;
 		}
+		/* Processed all groups and haven't found blocks */
+		if (i == ngroups)
+			atomic64_inc(&sbi->s_bal_cX_failed[cr]);
 	}
 
 	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2520,6 +2553,96 @@ const struct file_operations ext4_seq_mb
 	.write         = ext4_mb_last_group_write,
 };
 
+/* seq_file show routine: dump the mballoc counters held in the sb info. */
+static int mb_seq_alloc_show(struct seq_file *seq, void *v)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(seq->private);
+	atomic64_t *failed = sbi->s_bal_cX_failed;
+	atomic64_t *skipped = sbi->s_bal_cX_skipped;
+
+	seq_printf(seq, "mballoc:\n");
+	seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
+	seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+	seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+	seq_printf(seq, "\textents_scanned: %u\n",
+		   atomic_read(&sbi->s_bal_ex_scanned));
+	seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+	seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+	seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+	seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+	/* cX loops that scanned all groups in vain, then loops skipped. */
+	seq_printf(seq, "\tuseless_c0_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&failed[0]));
+	seq_printf(seq, "\tuseless_c1_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&failed[1]));
+	seq_printf(seq, "\tuseless_c2_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&failed[2]));
+	seq_printf(seq, "\tuseless_c3_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&failed[3]));
+	seq_printf(seq, "\tskipped_c0_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&skipped[0]));
+	seq_printf(seq, "\tskipped_c1_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&skipped[1]));
+	seq_printf(seq, "\tskipped_c2_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&skipped[2]));
+	seq_printf(seq, "\tbuddies_generated: %lu\n", sbi->s_mb_buddies_generated);
+	seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
+	seq_printf(seq, "\tpreallocated: %u\n",
+		   atomic_read(&sbi->s_mb_preallocated));
+	seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
+	return 0;
+}
+
+/*
+ * Write handler for the stats file: the written data is never inspected,
+ * any write zeroes all mballoc counters.  Always reports @cnt bytes consumed.
+ */
+static ssize_t mb_seq_alloc_write(struct file *file, const char __user *buf,
+				  size_t cnt, loff_t *pos)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
+	unsigned int i;
+
+	atomic_set(&sbi->s_bal_allocated, 0);
+	atomic_set(&sbi->s_bal_reqs, 0);
+	atomic_set(&sbi->s_bal_success, 0);
+
+	atomic_set(&sbi->s_bal_ex_scanned, 0);
+	atomic_set(&sbi->s_bal_goals, 0);
+	atomic_set(&sbi->s_bal_2orders, 0);
+	atomic_set(&sbi->s_bal_breaks, 0);
+	atomic_set(&sbi->s_mb_lost_chunks, 0);
+
+	for (i = 0; i < ARRAY_SIZE(sbi->s_bal_cX_failed); i++)
+		atomic64_set(&sbi->s_bal_cX_failed[i], 0);
+	for (i = 0; i < ARRAY_SIZE(sbi->s_bal_cX_skipped); i++)
+		atomic64_set(&sbi->s_bal_cX_skipped[i], 0);
+
+	/* NOTE(review): plain stores; confirm whether s_bal_lock is needed. */
+	sbi->s_mb_buddies_generated = 0;
+	sbi->s_mb_generation_time = 0;
+
+	atomic_set(&sbi->s_mb_preallocated, 0);
+	atomic_set(&sbi->s_mb_discarded, 0);
+
+	return cnt;
+}
+
+static int mb_seq_alloc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
+}
+
+const struct file_operations ext4_mb_seq_alloc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= mb_seq_alloc_open,	/* single_open + show */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= mb_seq_alloc_write,	/* any write resets stats */
+};
+
 int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(m->private);
@@ -2759,6 +2879,8 @@ static int ext4_groupinfo_create_slab(si
 	return 0;
 }
 
+#define THRESHOLD_BLOCKS(sbi, percent)	/* multiply first: keep precision */ \
+	((percent) * ext4_blocks_count((sbi)->s_es) / 100)
 int ext4_mb_init(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2812,6 +2934,15 @@ int ext4_mb_init(struct super_block *sb)
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
 	sbi->s_mb_stats = MB_DEFAULT_STATS;
 	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+	if (!sbi->s_mb_c1_blocks)
+		sbi->s_mb_c1_blocks =
+			THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
+	if (!sbi->s_mb_c2_blocks)
+		sbi->s_mb_c2_blocks =
+			THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
+	if (!sbi->s_mb_c3_blocks)
+		sbi->s_mb_c3_blocks =
+			THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
 	/*
 	 * The default group preallocation is 512, which for 4k block
 	 * sizes translates to 2 megabytes.  However for bigalloc file
@@ -2951,6 +3082,17 @@ int ext4_mb_release(struct super_block *
 				atomic_read(&sbi->s_bal_reqs),
 				atomic_read(&sbi->s_bal_success));
 		ext4_msg(sb, KERN_INFO,
+			"mballoc: (%llu, %llu, %llu, %llu) useless c(0,1,2,3) loops",
+				(unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]),
+				(unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]),
+				(unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]),
+				(unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[3]));
+		ext4_msg(sb, KERN_INFO,
+			"mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
+				(unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]),
+				(unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]),
+				(unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+		ext4_msg(sb, KERN_INFO,
 		      "mballoc: %u extents scanned, %u goal hits, "
 				"%u 2^N hits, %u breaks, %u lost",
 				atomic_read(&sbi->s_bal_ex_scanned),
Index: linux-stage/fs/ext4/mballoc.h
===================================================================
--- linux-stage.orig/fs/ext4/mballoc.h
+++ linux-stage/fs/ext4/mballoc.h
@@ -72,6 +72,9 @@ do {									\
  * for which requests use 2^N search using buddies
  */
 #define MB_DEFAULT_ORDER2_REQS		8
+#define MB_DEFAULT_C1_THRESHOLD		25	/* % avail: start at cr 1 */
+#define MB_DEFAULT_C2_THRESHOLD		15	/* % avail: start at cr 2 */
+#define MB_DEFAULT_C3_THRESHOLD		5	/* % avail: start at cr 3 */
 
 /*
  * default group prealloc size 512 blocks
Index: linux-stage/fs/ext4/super.c
===================================================================
--- linux-stage.orig/fs/ext4/super.c
+++ linux-stage/fs/ext4/super.c
@@ -1468,6 +1468,7 @@ enum {
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
+	Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
 };
@@ -1554,6 +1555,9 @@ static const match_table_t tokens = {
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+	{Opt_mb_c1_threshold, "mb_c1_threshold=%s"},
+	{Opt_mb_c2_threshold, "mb_c2_threshold=%s"},
+	{Opt_mb_c3_threshold, "mb_c3_threshold=%s"},
 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
 	{Opt_nombcache, "nombcache"},
 	{Opt_nombcache, "no_mbcache"},	/* for backward compatibility */
@@ -1766,6 +1770,9 @@ static const struct mount_opts {
 	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+	{Opt_mb_c1_threshold, 0, MOPT_STRING},
+	{Opt_mb_c2_threshold, 0, MOPT_STRING},
+	{Opt_mb_c3_threshold, 0, MOPT_STRING},
 	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
 	{Opt_err, 0, 0}
@@ -1929,6 +1936,12 @@ static int handle_mount_opt(struct super
 		sbi->s_max_dir_size_kb = arg;
 		/* reset s_warning_dir_size and make it re-calculated */
 		sbi->s_warning_dir_size = 0;
+	} else if (token == Opt_mb_c1_threshold) {
+		save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c1_blocks);
+	} else if (token == Opt_mb_c2_threshold) {
+		save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c2_blocks);
+	} else if (token == Opt_mb_c3_threshold) {
+		save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c3_blocks);
 	} else if (token == Opt_stripe) {
 		sbi->s_stripe = arg;
 	} else if (token == Opt_resuid) {
Index: linux-stage/fs/ext4/sysfs.c
===================================================================
--- linux-stage.orig/fs/ext4/sysfs.c
+++ linux-stage/fs/ext4/sysfs.c
@@ -20,6 +20,9 @@
 typedef enum {
 	attr_noop,
 	attr_delayed_allocation_blocks,
+	attr_mb_c1_threshold,
+	attr_mb_c2_threshold,
+	attr_mb_c3_threshold,
 	attr_session_write_kbytes,
 	attr_lifetime_write_kbytes,
 	attr_reserved_clusters,
@@ -135,6 +138,32 @@ static ssize_t journal_task_show(struct
 			task_pid_vnr(sbi->s_journal->j_task));
 }
 
+/* Parse a percentage (0..100) from @buf; store that share of fs blocks. */
+int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+			   ext4_fsblk_t *blocks)
+{
+	unsigned long long percent;
+
+	if (kstrtoull(skip_spaces(buf), 0, &percent) != 0)
+		return -EINVAL;
+	if (percent > 100)
+		return -EINVAL;
+
+	*blocks = percent * ext4_blocks_count(sbi->s_es) / 100;
+	return 0;
+}
+
+#define THRESHOLD_PERCENT(sbi, blocks)	/* 0 blocks is 0%; else round up */ \
+	((blocks) ? ((blocks) - 1) * 100 / ext4_blocks_count((sbi)->s_es) + 1 : 0)
+/* Store helper: parse @buf as a percentage; @count on success, else -errno. */
+static ssize_t mb_threshold_store(struct ext4_sb_info *sbi, const char *buf,
+				  size_t count, ext4_fsblk_t *blocks)
+{
+	int err = save_threshold_percent(sbi, buf, blocks);
+
+	return err ? err : count;
+}
+
 #define EXT4_ATTR(_name,_mode,_id)					\
 static struct ext4_attr ext4_attr_##_name = {				\
 	.attr = {.name = __stringify(_name), .mode = _mode },		\
@@ -178,6 +207,9 @@ EXT4_ATTR_FUNC(session_write_kbytes, 044
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
 EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
+EXT4_ATTR_FUNC(mb_c1_threshold, 0644);
+EXT4_ATTR_FUNC(mb_c2_threshold, 0644);
+EXT4_ATTR_FUNC(mb_c3_threshold, 0644);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
@@ -214,6 +246,9 @@ static struct attribute *ext4_attrs[] =
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(reserved_clusters),
 	ATTR_LIST(sra_exceeded_retry_limit),
+	ATTR_LIST(mb_c1_threshold),
+	ATTR_LIST(mb_c2_threshold),
+	ATTR_LIST(mb_c3_threshold),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(max_dir_size),
@@ -311,6 +346,15 @@ static ssize_t ext4_attr_show(struct kob
 		return snprintf(buf, PAGE_SIZE, "%llu\n",
 				(s64) EXT4_C2B(sbi,
 		       percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+	case attr_mb_c1_threshold:
+		return scnprintf(buf, PAGE_SIZE, "%llu\n",
+				 THRESHOLD_PERCENT(sbi, sbi->s_mb_c1_blocks));
+	case attr_mb_c2_threshold:
+		return scnprintf(buf, PAGE_SIZE, "%llu\n",
+				 THRESHOLD_PERCENT(sbi, sbi->s_mb_c2_blocks));
+	case attr_mb_c3_threshold:
+		return scnprintf(buf, PAGE_SIZE, "%llu\n",
+				 THRESHOLD_PERCENT(sbi, sbi->s_mb_c3_blocks));
 	case attr_session_write_kbytes:
 		return session_write_kbytes_show(sbi, buf);
 	case attr_lifetime_write_kbytes:
@@ -384,6 +428,12 @@ static ssize_t ext4_attr_store(struct ko
 		return inode_readahead_blks_store(sbi, buf, len);
 	case attr_trigger_test_error:
 		return trigger_test_error(sbi, buf, len);
+	case attr_mb_c1_threshold:
+		return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks);
+	case attr_mb_c2_threshold:
+		return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks);
+	case attr_mb_c3_threshold:
+		return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks);
 	}
 	return 0;
 }
@@ -446,6 +496,8 @@ int ext4_register_sysfs(struct super_blo
 				&ext4_seq_mb_last_group_fops, sb);
 		proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
 				ext4_mb_seq_last_start_seq_show, sb);
+		proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
+				 sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
 	}
 	return 0;
 }