Viewing: ext4-add-prefetching-for-block-allocation-bitmaps.patch

commit cfd73237722135807967f389bcbda558a60a30d6
Author:     Alex Zhuravlev <bzzz@whamcloud.com>
AuthorDate: Tue Apr 21 10:54:07 2020 +0300

ext4: add prefetching for block allocation bitmaps

This should significantly improve bitmap loading, especially for flex
groups as it tries to load all bitmaps within a flex.group instead of
one by one synchronously.

Prefetching is done in 8 * flex_bg groups, so it should be 8
read-ahead reads for a single allocating thread. At the end of
allocation the thread waits for read-ahead completion and initializes
buddy information so that read-aheads are not lost in case of memory
pressure.

At cr=0 the number of prefetching IOs is limited per allocation
context to prevent a situation when mballoc loads thousands of bitmaps
looking for a perfect group and ignoring groups with good chunks.

Together with the patch "ext4: limit scanning of uninitialized groups"
the mount time (which includes few tiny allocations) of a 1PB
filesystem is reduced significantly:

               0% full    50%-full unpatched    patched
  mount time       33s                9279s       563s

[ Restructured by tytso; removed the state flags in the allocation
context, so it can be used to lazily prefetch the allocation bitmaps
immediately after the file system is mounted.  Skip prefetching
block groups which are uninitialized.  Finally pass in the
REQ_RAHEAD flag to the block layer while prefetching. ]

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

---
Index: linux-stage/fs/ext4/balloc.c
===================================================================
--- linux-stage.orig/fs/ext4/balloc.c
+++ linux-stage/fs/ext4/balloc.c
@@ -498,7 +498,8 @@ ext4_read_block_bitmap_nowait(struct sup
 	trace_ext4_read_block_bitmap_load(sb, block_group);
 	bh->b_end_io = ext4_end_bitmap_read;
 	get_bh(bh);
-	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
+		  (ignore_locked ? REQ_RAHEAD : 0), bh);
 	return bh;
 verify:
 	err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
Index: linux-stage/fs/ext4/mballoc.c
===================================================================
--- linux-stage.orig/fs/ext4/mballoc.c
+++ linux-stage/fs/ext4/mballoc.c
@@ -2246,97 +2246,93 @@ static u64 available_blocks_count(struct
 }
 
 /*
- * each allocation context (i.e. a thread doing allocation) has own
- * sliding prefetch window of @s_mb_prefetch size which starts at the
- * very first goal and moves ahead of scaning.
- * a side effect is that subsequent allocations will likely find
- * the bitmaps in cache or at least in-flight.
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
  */
-static void
-ext4_mb_prefetch(struct ext4_allocation_context *ac,
-		    ext4_group_t start)
+static  ext4_group_t
+ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+		unsigned int nr, int *cnt)
 {
-	struct super_block *sb = ac->ac_sb;
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_group_info *grp;
-	ext4_group_t group = start;
 	struct buffer_head *bh;
-	int nr;
-
-	/* limit prefetching at cr=0, otherwise mballoc can
-	 * spend a lot of time loading imperfect groups */
-	if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
-		return;
-
-	/* batch prefetching to get few READs in flight */
-	nr = ac->ac_prefetch - group;
-	if (ac->ac_prefetch < group)
-		/* wrapped to the first groups */
-		nr += ngroups;
-	if (nr > 0)
-		return;
-	BUG_ON(nr < 0);
+	struct blk_plug plug;
 
-	nr = sbi->s_mb_prefetch;
-	if (ext4_has_feature_flex_bg(sb)) {
-		/* align to flex_bg to get more bitmas with a single IO */
-		nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
-		nr = nr + sbi->s_mb_prefetch - group;
-	}
+	blk_start_plug(&plug);
 	while (nr-- > 0) {
-		grp = ext4_get_group_info(sb, group);
-		/* prevent expensive getblk() on groups w/ IO in progress */
-		if (EXT4_MB_GRP_TEST(grp) || EXT4_MB_GRP_TEST_AND_SET_READ(grp))
-			goto next;
-
-		/* ignore empty groups - those will be skipped
-		 * during the scanning as well */
-		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+								  NULL);
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		/*
+		 * Prefetch block groups with free blocks; but don't
+		 * bother if it is marked uninitialized on disk, since
+		 * it won't require I/O to read.  Also only try to
+		 * prefetch once, so we avoid getblk() call, which can
+		 * be expensive.
+		 */
+		if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+		    EXT4_MB_GRP_NEED_INIT(grp) &&
+		    ext4_free_group_clusters(sb, gdp) > 0 &&
+		    !(ext4_has_group_desc_csum(sb) &&
+		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
 			bh = ext4_read_block_bitmap_nowait(sb, group, 1);
 			if (bh && !IS_ERR(bh)) {
-				if (!buffer_uptodate(bh))
-					ac->ac_prefetch_ios++;
+				if (!buffer_uptodate(bh) && cnt)
+					(*cnt)++;
 				brelse(bh);
 			}
 		}
-next:
 		if (++group >= ngroups)
 			group = 0;
 	}
-	ac->ac_prefetch = group;
+	blk_finish_plug(&plug);
+	return group;
 }
 
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized.  Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if it was not initiated by
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+
 static void
-ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
+ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+		      unsigned int nr)
 {
-	struct ext4_group_info *grp;
-	ext4_group_t group;
-	int nr, rc;
-
-	/* initialize last window of prefetched groups */
-	nr = ac->ac_prefetch_ios;
-	if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
-		nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
-	group = ac->ac_prefetch;
 	while (nr-- > 0) {
-		grp = ext4_get_group_info(ac->ac_sb, group);
-		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
-			rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
-			if (rc)
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+								  NULL);
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		if (!group)
+			group = ext4_get_groups_count(sb);
+		group--;
+		grp = ext4_get_group_info(sb, group);
+
+		if (EXT4_MB_GRP_NEED_INIT(grp) &&
+		    ext4_free_group_clusters(sb, gdp) > 0 &&
+		    !(ext4_has_group_desc_csum(sb) &&
+		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+			if (ext4_mb_init_group(sb, group, GFP_NOFS))
 				break;
 		}
-		if (group-- == 0)
-			group = ext4_get_groups_count(ac->ac_sb) - 1;
 	}
 }
 
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-	ext4_group_t ngroups, group, i;
+	ext4_group_t prefetch_grp = 0, ngroups, group, i;
 	int cr = -1;
 	int err = 0, first_err = 0;
+	unsigned int nr = 0, prefetch_ios = 0;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	struct ext4_buddy e4b;
@@ -2420,7 +2416,7 @@ repeat:
 		 * from the goal value specified
 		 */
 		group = ac->ac_g_ex.fe_group;
-		ac->ac_prefetch = group;
+		prefetch_grp = group;
 
 		for (i = 0; i < ngroups; group++, i++) {
 			int ret = 0;
@@ -2432,7 +2428,28 @@ repeat:
 			if (group >= ngroups)
 				group = 0;
 
-			ext4_mb_prefetch(ac, group);
+			/*
+			 * Batch reads of the block allocation bitmaps
+			 * to get multiple READs in flight; limit
+			 * prefetching at cr=0/1, otherwise mballoc can
+			 * spend a lot of time loading imperfect groups
+			 */
+			if ((prefetch_grp == group) &&
+			    (cr > 1 ||
+			     prefetch_ios < sbi->s_mb_prefetch_limit)) {
+				unsigned int curr_ios = prefetch_ios;
+
+				nr = sbi->s_mb_prefetch;
+				if (ext4_has_feature_flex_bg(sb)) {
+					nr = (group / sbi->s_mb_prefetch) *
+						sbi->s_mb_prefetch;
+					nr = nr + sbi->s_mb_prefetch - group;
+				}
+				prefetch_grp = ext4_mb_prefetch(sb, group,
+							nr, &prefetch_ios);
+				if (prefetch_ios == curr_ios)
+					nr = 0;
+			}
 
 			/* This now checks without needing the buddy page */
 			ret = ext4_mb_good_group_nolock(ac, group, cr);
@@ -2512,8 +2529,8 @@ out:
 		 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
 		 ac->ac_flags, cr, err);
 
-	/* use prefetched bitmaps to init buddy so that read info is not lost */
-	ext4_mb_prefetch_fini(ac);
+	if (nr)
+		ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
 	return err;
 }
 
@@ -3012,6 +3029,26 @@ static int ext4_mb_init_backend(struct s
 			goto err_freebuddy;
 	}
 
+	if (ext4_has_feature_flex_bg(sb)) {
+		/* a single flex group is supposed to be read by a single IO */
+		sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+	} else {
+		sbi->s_mb_prefetch = 32;
+	}
+	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+	/* now many real IOs to prefetch within a single allocation at cr=0
+	 * given cr=0 is an CPU-related optimization we shouldn't try to
+	 * load too many groups, at some point we should start to use what
+	 * we've got in memory.
+	 * with an average random access time 5ms, it'd take a second to get
+	 * 200 groups (* N with flex_bg), so let's make this limit 4
+	 */
+	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+	if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+		sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
 	return 0;
 
 err_freebuddy: