Viewing: ext4-simple-blockalloc.patch
commit 95f8ae5677491508ae7182b4f61ead3d413434ae
Author: Artem Blagodarenko <artem.blagodarenko@hpe.com>
AuthorDate: Thu Jun 6 16:50:11 2019 +0300
LU-12103 ldiskfs: don't search large block range if disk full
Block allocator tries to find:
1) group with the same range as required
2) group with the same average range as required
3) group with required amount of space
4) any group
For quite full disk step 1 is failed with higth
probability, but takes a lot of time.
Skip 1st step if disk space < 25%
Skip 2d step if disk space < 15%
Skip 3d step if disk space < 5%
Also check if group has any free space on step 4.
This three thresholds can be adjusted through added interface.
Variables added which counts unsuccessfull group processing loops.
This can show allocator effectiveness in different circumstances.
This statistics output through mb_alloc file. This file is
useful to track allocator activity.
Signed-off-by: Artem Blagodarenko <c17828@cray.com>
Cray-bug-id: LUS-6746
Reviewed-by: Wang Shilong <wshilong@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Change-Id: I18c7147e32951c49e12a2444803aa2995bb4ae2d
Reviewed-on: https://review.whamcloud.com/35180
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b9ec24..64dc5fd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1450,6 +1450,9 @@ struct ext4_sb_info {
unsigned int s_mb_min_to_scan;
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
+ ext4_fsblk_t s_mb_c1_blocks;
+ ext4_fsblk_t s_mb_c2_blocks;
+ ext4_fsblk_t s_mb_c3_blocks;
unsigned long *s_mb_prealloc_table;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
@@ -1466,6 +1469,9 @@ struct ext4_sb_info {
atomic_t s_bal_goals; /* goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
+ /* cX loop didn't find blocks */
+ atomic64_t s_bal_cX_failed[4];
+ atomic64_t s_bal_cX_skipped[3];
spinlock_t s_bal_lock;
unsigned long s_mb_buddies_generated;
unsigned long long s_mb_generation_time;
@@ -2563,6 +2569,9 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
extern const struct file_operations ext4_seq_prealloc_table_fops;
extern const struct seq_operations ext4_mb_seq_groups_ops;
+extern const struct file_operations ext4_mb_seq_alloc_fops;
+extern int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+ ext4_fsblk_t *blocks);
extern const struct file_operations ext4_seq_mb_last_group_fops;
extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
extern long ext4_mb_stats;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 15c962f..7870406 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2104,6 +2104,20 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
+static u64 available_blocks_count(struct ext4_sb_info *sbi)
+{
+ ext4_fsblk_t resv_blocks;
+ u64 bfree;
+ struct ext4_super_block *es = sbi->s_es;
+
+ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
+ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+
+ bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
+ return bfree - (ext4_r_blocks_count(es) + resv_blocks);
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -2113,6 +2127,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
+ ext4_fsblk_t avail_blocks;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
@@ -2165,6 +2180,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
+
+ /* Choose what loop to pass based on disk fullness */
+ avail_blocks = available_blocks_count(sbi) ;
+
+ if (avail_blocks < sbi->s_mb_c3_blocks) {
+ cr = 3;
+ atomic64_inc(&sbi->s_bal_cX_skipped[2]);
+ } else if(avail_blocks < sbi->s_mb_c2_blocks) {
+ cr = 2;
+ atomic64_inc(&sbi->s_bal_cX_skipped[1]);
+ } else if(avail_blocks < sbi->s_mb_c1_blocks) {
+ cr = 1;
+ atomic64_inc(&sbi->s_bal_cX_skipped[0]);
+ }
+
/*
* cr == 0 try to get exact allocation,
* cr == 3 try to get anything
@@ -2230,6 +2260,9 @@ repeat:
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
}
+ /* Processed all groups and haven't found blocks */
+ if (i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2510,6 +2543,96 @@ const struct file_operations ext4_seq_mb_last_group_fops = {
.write = ext4_mb_last_group_write,
};
+static int mb_seq_alloc_show(struct seq_file *seq, void *v)
+{
+ struct super_block *sb = seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ seq_printf(seq, "mballoc:\n");
+ seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
+ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+ seq_printf(seq, "\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_ex_scanned));
+ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+ seq_printf(seq, "\tuseless_c0_loops: %llu\n",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]));
+ seq_printf(seq, "\tuseless_c1_loops: %llu\n",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]));
+ seq_printf(seq, "\tuseless_c2_loops: %llu\n",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
+ seq_printf(seq, "\tuseless_c3_loops: %llu\n",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[3]));
+ seq_printf(seq, "\tskipped_c0_loops: %llu\n",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]));
+ seq_printf(seq, "\tskipped_c1_loops: %llu\n",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]));
+ seq_printf(seq, "\tskipped_c2_loops: %llu\n",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+ seq_printf(seq, "\tbuddies_generated: %lu\n",
+ sbi->s_mb_buddies_generated);
+ seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
+ seq_printf(seq, "\tpreallocated: %u\n",
+ atomic_read(&sbi->s_mb_preallocated));
+ seq_printf(seq, "\tdiscarded: %u\n",
+ atomic_read(&sbi->s_mb_discarded));
+ return 0;
+}
+
+static ssize_t mb_seq_alloc_write(struct file *file,
+ const char __user *buf,
+ size_t cnt, loff_t *pos)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
+
+ atomic_set(&sbi->s_bal_allocated, 0),
+ atomic_set(&sbi->s_bal_reqs, 0),
+ atomic_set(&sbi->s_bal_success, 0);
+
+ atomic_set(&sbi->s_bal_ex_scanned, 0),
+ atomic_set(&sbi->s_bal_goals, 0),
+ atomic_set(&sbi->s_bal_2orders, 0),
+ atomic_set(&sbi->s_bal_breaks, 0),
+ atomic_set(&sbi->s_mb_lost_chunks, 0);
+
+ atomic64_set(&sbi->s_bal_cX_failed[0], 0),
+ atomic64_set(&sbi->s_bal_cX_failed[1], 0),
+ atomic64_set(&sbi->s_bal_cX_failed[2], 0);
+ atomic64_set(&sbi->s_bal_cX_failed[3], 0);
+
+ atomic64_set(&sbi->s_bal_cX_skipped[0], 0),
+ atomic64_set(&sbi->s_bal_cX_skipped[1], 0),
+ atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
+
+
+ sbi->s_mb_buddies_generated = 0;
+ sbi->s_mb_generation_time = 0;
+
+ atomic_set(&sbi->s_mb_preallocated, 0),
+ atomic_set(&sbi->s_mb_discarded, 0);
+
+ return cnt;
+}
+
+static int mb_seq_alloc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
+}
+
+const struct file_operations ext4_mb_seq_alloc_fops = {
+ .owner = THIS_MODULE,
+ .open = mb_seq_alloc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = mb_seq_alloc_write,
+};
+
int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
{
struct ext4_sb_info *sbi = EXT4_SB(m->private);
@@ -2734,6 +2854,8 @@ static int ext4_groupinfo_create_slab(size_t size)
return 0;
}
+#define THRESHOLD_BLOCKS(sbi, percent) \
+ (ext4_blocks_count((sbi)->s_es) / 100 * (percent))
int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2787,6 +2908,15 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ if (!sbi->s_mb_c1_blocks)
+ sbi->s_mb_c1_blocks =
+ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
+ if (!sbi->s_mb_c2_blocks)
+ sbi->s_mb_c2_blocks =
+ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
+ if (!sbi->s_mb_c3_blocks)
+ sbi->s_mb_c3_blocks =
+ THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -2922,6 +3046,17 @@ int ext4_mb_release(struct super_block *sb)
atomic_read(&sbi->s_bal_allocated),
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: (%llu, %llu, %llu, %llu) useless c(0,1,2,3) loops",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]),
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]),
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]),
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[3]));
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]),
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]),
+ (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
ext4_msg(sb, KERN_INFO,
"mballoc: %u extents scanned, %u goal hits, "
"%u 2^N hits, %u breaks, %u lost",
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index e00c3b7..d02daaf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -72,6 +72,9 @@ do { \
* for which requests use 2^N search using buddies
*/
#define MB_DEFAULT_ORDER2_REQS 8
+#define MB_DEFAULT_C1_THRESHOLD 25
+#define MB_DEFAULT_C2_THRESHOLD 15
+#define MB_DEFAULT_C3_THRESHOLD 5
/*
* default group prealloc size 512 blocks
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
===================================================================
--- linux-stage.orig/fs/ext4/super.c
+++ linux-stage/fs/ext4/super.c
@@ -1450,6 +1450,7 @@ enum {
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
+ Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
};
@@ -1604,6 +1605,9 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_mb_c1_threshold, "mb_c1_threshold=%s"},
+ {Opt_mb_c2_threshold, "mb_c2_threshold=%s"},
+ {Opt_mb_c3_threshold, "mb_c3_threshold=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
@@ -1748,6 +1752,9 @@ static const struct mount_opts {
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+ {Opt_mb_c1_threshold, 0, MOPT_STRING},
+ {Opt_mb_c2_threshold, 0, MOPT_STRING},
+ {Opt_mb_c3_threshold, 0, MOPT_STRING},
{Opt_test_dummy_encryption, 0, MOPT_GTE0},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_err, 0, 0}
@@ -1874,6 +1881,12 @@ static const struct mount_opts {
sbi->s_max_dir_size_kb = arg;
/* reset s_warning_dir_size and make it re-calculated */
sbi->s_warning_dir_size = 0;
+ } else if (token == Opt_mb_c1_threshold) {
+ save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c1_blocks);
+ } else if (token == Opt_mb_c2_threshold) {
+ save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c2_blocks);
+ } else if (token == Opt_mb_c3_threshold) {
+ save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c3_blocks);
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (token == Opt_resuid) {
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 417b33a..f49821e 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -20,6 +20,9 @@
typedef enum {
attr_noop,
attr_delayed_allocation_blocks,
+ attr_mb_c1_threshold,
+ attr_mb_c2_threshold,
+ attr_mb_c3_threshold,
attr_session_write_kbytes,
attr_lifetime_write_kbytes,
attr_reserved_clusters,
@@ -134,6 +137,32 @@ static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
task_pid_vnr(sbi->s_journal->j_task));
}
+int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+ ext4_fsblk_t *blocks)
+{
+ unsigned long long val;
+
+ int ret;
+
+ ret = kstrtoull(skip_spaces(buf), 0, &val);
+ if (ret || val > 100)
+ return -EINVAL;
+
+ *blocks = val * ext4_blocks_count(sbi->s_es) / 100;
+ return 0;
+}
+
+#define THRESHOLD_PERCENT(sbi, blocks) \
+ (((blocks) - 1) * 100 / ext4_blocks_count((sbi)->s_es) + 1)
+static ssize_t mb_threshold_store(struct ext4_sb_info *sbi,
+ const char *buf, size_t count,
+ ext4_fsblk_t *blocks)
+{
+ int ret = save_threshold_percent(sbi, buf, blocks);
+
+ return ret ?: count;
+}
+
#define EXT4_ATTR(_name,_mode,_id) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -176,6 +205,9 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
EXT4_ATTR_FUNC(session_write_kbytes, 0444);
EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(mb_c1_threshold, 0644);
+EXT4_ATTR_FUNC(mb_c2_threshold, 0644);
+EXT4_ATTR_FUNC(mb_c3_threshold, 0644);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
ext4_sb_info, s_inode_readahead_blks);
@@ -211,6 +243,9 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
+ ATTR_LIST(mb_c1_threshold),
+ ATTR_LIST(mb_c2_threshold),
+ ATTR_LIST(mb_c3_threshold),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(max_dir_size),
@@ -294,6 +329,15 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%llu\n",
(s64) EXT4_C2B(sbi,
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ case attr_mb_c1_threshold:
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ THRESHOLD_PERCENT(sbi, sbi->s_mb_c1_blocks));
+ case attr_mb_c2_threshold:
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ THRESHOLD_PERCENT(sbi, sbi->s_mb_c2_blocks));
+ case attr_mb_c3_threshold:
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ THRESHOLD_PERCENT(sbi, sbi->s_mb_c3_blocks));
case attr_session_write_kbytes:
return session_write_kbytes_show(sbi, buf);
case attr_lifetime_write_kbytes:
@@ -363,6 +407,12 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
return trigger_test_error(sbi, buf, len);
+ case attr_mb_c1_threshold:
+ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks);
+ case attr_mb_c2_threshold:
+ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks);
+ case attr_mb_c3_threshold:
+ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks);
}
return 0;
}
@@ -425,6 +475,8 @@ int ext4_register_sysfs(struct super_block *sb)
&ext4_seq_mb_last_group_fops, sb);
proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
ext4_mb_seq_last_start_seq_show, sb);
+ proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
+ sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
}
return 0;
}