From zhangkelvin at google.com Fri Jul 1 03:14:42 2022 From: zhangkelvin at google.com (Kelvin Zhang) Date: Thu, 30 Jun 2022 10:14:42 -0700 Subject: [PATCH v1] Make --mount-point option generally available Message-ID: <20220630171442.3945056-1-zhangkelvin@google.com> This option does not have any android specific dependencies. It is also useful for all selinux enabled fs images, so move it out of android specific feature sets. Signed-off-by: Kelvin Zhang --- include/erofs/config.h | 2 +- lib/xattr.c | 2 -- mkfs/main.c | 6 +++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/erofs/config.h b/include/erofs/config.h index 0a1b18b..030054b 100644 --- a/include/erofs/config.h +++ b/include/erofs/config.h @@ -65,8 +65,8 @@ struct erofs_configure { u32 c_dict_size; u64 c_unix_timestamp; u32 c_uid, c_gid; + const char *mount_point; #ifdef WITH_ANDROID - char *mount_point; char *target_out_path; char *fs_config_file; char *block_list_file; diff --git a/lib/xattr.c b/lib/xattr.c index 00fb963..cf5c447 100644 --- a/lib/xattr.c +++ b/lib/xattr.c @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, unsigned int len[2]; char *kvbuf, *fspath; -#ifdef WITH_ANDROID if (cfg.mount_point) ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, erofs_fspath(srcpath)); else -#endif ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); if (ret <= 0) return ERR_PTR(-ENOMEM); diff --git a/mkfs/main.c b/mkfs/main.c index b62a8aa..879c2f2 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -50,8 +50,8 @@ static struct option long_options[] = { {"quiet", no_argument, 0, 12}, {"blobdev", required_argument, NULL, 13}, {"ignore-mtime", no_argument, NULL, 14}, -#ifdef WITH_ANDROID {"mount-point", required_argument, NULL, 512}, +#ifdef WITH_ANDROID {"product-out", required_argument, NULL, 513}, {"fs-config-file", required_argument, NULL, 514}, {"block-list-file", required_argument, NULL, 515}, @@ -103,9 +103,9 @@ static void usage(void) #ifndef 
NDEBUG " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" #endif + " --mount-point=X X=prefix of target fs path (default: /)\n" #ifdef WITH_ANDROID "\nwith following android-specific options:\n" - " --mount-point=X X=prefix of target fs path (default: /)\n" " --product-out=X X=product_out directory\n" " --fs-config-file=X X=fs_config file\n" " --block-list-file=X X=block_list file\n" @@ -314,7 +314,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) case 10: cfg.c_compress_hints_file = optarg; break; -#ifdef WITH_ANDROID case 512: cfg.mount_point = optarg; /* all trailing '/' should be deleted */ @@ -322,6 +321,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) if (opt && optarg[opt - 1] == '/') optarg[opt - 1] = '\0'; break; +#ifdef WITH_ANDROID case 513: cfg.target_out_path = optarg; break; -- 2.37.0.rc0.161.g10f37bed90-goog From hsiangkao at linux.alibaba.com Fri Jul 1 14:20:40 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 1 Jul 2022 12:20:40 +0800 Subject: [PATCH v1] Make --mount-point option generally available In-Reply-To: <20220630171442.3945056-1-zhangkelvin@google.com> References: <20220630171442.3945056-1-zhangkelvin@google.com> Message-ID: Hi Kelvin, On Thu, Jun 30, 2022 at 10:14:42AM -0700, Kelvin Zhang wrote: erofs-utils: make --mount-point option generally available > This option does not have any android specific dependencies. It is also > useful for all selinux enabled fs images, so move it out of android > specific feature sets. Thanks for this patch! The patch itself looks good to me, yet could you give some example how to use it for all selinux enabled fs images w/o Android in the commit message. That would be much better to us! 
Thanks, Gao Xiang > > Signed-off-by: Kelvin Zhang > --- > include/erofs/config.h | 2 +- > lib/xattr.c | 2 -- > mkfs/main.c | 6 +++--- > 3 files changed, 4 insertions(+), 6 deletions(-) > > diff --git a/include/erofs/config.h b/include/erofs/config.h > index 0a1b18b..030054b 100644 > --- a/include/erofs/config.h > +++ b/include/erofs/config.h > @@ -65,8 +65,8 @@ struct erofs_configure { > u32 c_dict_size; > u64 c_unix_timestamp; > u32 c_uid, c_gid; > + const char *mount_point; > #ifdef WITH_ANDROID > - char *mount_point; > char *target_out_path; > char *fs_config_file; > char *block_list_file; > diff --git a/lib/xattr.c b/lib/xattr.c > index 00fb963..cf5c447 100644 > --- a/lib/xattr.c > +++ b/lib/xattr.c > @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, > unsigned int len[2]; > char *kvbuf, *fspath; > > -#ifdef WITH_ANDROID > if (cfg.mount_point) > ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, > erofs_fspath(srcpath)); > else > -#endif > ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); > if (ret <= 0) > return ERR_PTR(-ENOMEM); > diff --git a/mkfs/main.c b/mkfs/main.c > index b62a8aa..879c2f2 100644 > --- a/mkfs/main.c > +++ b/mkfs/main.c > @@ -50,8 +50,8 @@ static struct option long_options[] = { > {"quiet", no_argument, 0, 12}, > {"blobdev", required_argument, NULL, 13}, > {"ignore-mtime", no_argument, NULL, 14}, > -#ifdef WITH_ANDROID > {"mount-point", required_argument, NULL, 512}, > +#ifdef WITH_ANDROID > {"product-out", required_argument, NULL, 513}, > {"fs-config-file", required_argument, NULL, 514}, > {"block-list-file", required_argument, NULL, 515}, > @@ -103,9 +103,9 @@ static void usage(void) > #ifndef NDEBUG > " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" > #endif > + " --mount-point=X X=prefix of target fs path (default: /)\n" > #ifdef WITH_ANDROID > "\nwith following android-specific options:\n" > - " --mount-point=X X=prefix of target fs path (default: /)\n" 
> " --product-out=X X=product_out directory\n" > " --fs-config-file=X X=fs_config file\n" > " --block-list-file=X X=block_list file\n" > @@ -314,7 +314,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > case 10: > cfg.c_compress_hints_file = optarg; > break; > -#ifdef WITH_ANDROID > case 512: > cfg.mount_point = optarg; > /* all trailing '/' should be deleted */ > @@ -322,6 +321,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > if (opt && optarg[opt - 1] == '/') > optarg[opt - 1] = '\0'; > break; > +#ifdef WITH_ANDROID > case 513: > cfg.target_out_path = optarg; > break; > -- > 2.37.0.rc0.161.g10f37bed90-goog From zhangkelvin at google.com Sat Jul 2 09:00:30 2022 From: zhangkelvin at google.com (Kelvin Zhang) Date: Fri, 1 Jul 2022 16:00:30 -0700 Subject: [PATCH v2] Make --mount-point option generally available In-Reply-To: References: Message-ID: <20220701230030.2633151-1-zhangkelvin@google.com> This option does not have any android specific dependencies. It is also useful for all selinux enabled fs images, so move it out of android specific feature sets. e.g. 
mkfs.erofs --file-contexts=selinux_context_file --mount_point=/product product.img your_product_out_dir Signed-off-by: Kelvin Zhang --- include/erofs/config.h | 2 +- lib/xattr.c | 2 -- mkfs/main.c | 6 +++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/erofs/config.h b/include/erofs/config.h index 0a1b18b..030054b 100644 --- a/include/erofs/config.h +++ b/include/erofs/config.h @@ -65,8 +65,8 @@ struct erofs_configure { u32 c_dict_size; u64 c_unix_timestamp; u32 c_uid, c_gid; + const char *mount_point; #ifdef WITH_ANDROID - char *mount_point; char *target_out_path; char *fs_config_file; char *block_list_file; diff --git a/lib/xattr.c b/lib/xattr.c index 00fb963..cf5c447 100644 --- a/lib/xattr.c +++ b/lib/xattr.c @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, unsigned int len[2]; char *kvbuf, *fspath; -#ifdef WITH_ANDROID if (cfg.mount_point) ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, erofs_fspath(srcpath)); else -#endif ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); if (ret <= 0) return ERR_PTR(-ENOMEM); diff --git a/mkfs/main.c b/mkfs/main.c index b62a8aa..879c2f2 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -50,8 +50,8 @@ static struct option long_options[] = { {"quiet", no_argument, 0, 12}, {"blobdev", required_argument, NULL, 13}, {"ignore-mtime", no_argument, NULL, 14}, -#ifdef WITH_ANDROID {"mount-point", required_argument, NULL, 512}, +#ifdef WITH_ANDROID {"product-out", required_argument, NULL, 513}, {"fs-config-file", required_argument, NULL, 514}, {"block-list-file", required_argument, NULL, 515}, @@ -103,9 +103,9 @@ static void usage(void) #ifndef NDEBUG " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" #endif + " --mount-point=X X=prefix of target fs path (default: /)\n" #ifdef WITH_ANDROID "\nwith following android-specific options:\n" - " --mount-point=X X=prefix of target fs path (default: /)\n" " --product-out=X X=product_out 
directory\n" " --fs-config-file=X X=fs_config file\n" " --block-list-file=X X=block_list file\n" @@ -314,7 +314,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) case 10: cfg.c_compress_hints_file = optarg; break; -#ifdef WITH_ANDROID case 512: cfg.mount_point = optarg; /* all trailing '/' should be deleted */ @@ -322,6 +321,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) if (opt && optarg[opt - 1] == '/') optarg[opt - 1] = '\0'; break; +#ifdef WITH_ANDROID case 513: cfg.target_out_path = optarg; break; -- 2.37.0.rc0.161.g10f37bed90-goog From hsiangkao at linux.alibaba.com Sat Jul 2 14:53:17 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Sat, 2 Jul 2022 12:53:17 +0800 Subject: [PATCH v2] Make --mount-point option generally available In-Reply-To: <20220701230030.2633151-1-zhangkelvin@google.com> References: <20220701230030.2633151-1-zhangkelvin@google.com> Message-ID: On Fri, Jul 01, 2022 at 04:00:30PM -0700, Kelvin Zhang wrote: > This option does not have any android specific dependencies. It is also > useful for all selinux enabled fs images, so move it out of android > specific feature sets. > > e.g. mkfs.erofs --file-contexts=selinux_context_file > --mount_point=/product product.img your_product_out_dir > > Signed-off-by: Kelvin Zhang Apart from the subject without the prefix, Looks good to me, Reviewed-by: Gao Xiang Will play with it and apply later... 
Thanks, Gao Xiang > --- > include/erofs/config.h | 2 +- > lib/xattr.c | 2 -- > mkfs/main.c | 6 +++--- > 3 files changed, 4 insertions(+), 6 deletions(-) > > diff --git a/include/erofs/config.h b/include/erofs/config.h > index 0a1b18b..030054b 100644 > --- a/include/erofs/config.h > +++ b/include/erofs/config.h > @@ -65,8 +65,8 @@ struct erofs_configure { > u32 c_dict_size; > u64 c_unix_timestamp; > u32 c_uid, c_gid; > + const char *mount_point; > #ifdef WITH_ANDROID > - char *mount_point; > char *target_out_path; > char *fs_config_file; > char *block_list_file; > diff --git a/lib/xattr.c b/lib/xattr.c > index 00fb963..cf5c447 100644 > --- a/lib/xattr.c > +++ b/lib/xattr.c > @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, > unsigned int len[2]; > char *kvbuf, *fspath; > > -#ifdef WITH_ANDROID > if (cfg.mount_point) > ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, > erofs_fspath(srcpath)); > else > -#endif > ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); > if (ret <= 0) > return ERR_PTR(-ENOMEM); > diff --git a/mkfs/main.c b/mkfs/main.c > index b62a8aa..879c2f2 100644 > --- a/mkfs/main.c > +++ b/mkfs/main.c > @@ -50,8 +50,8 @@ static struct option long_options[] = { > {"quiet", no_argument, 0, 12}, > {"blobdev", required_argument, NULL, 13}, > {"ignore-mtime", no_argument, NULL, 14}, > -#ifdef WITH_ANDROID > {"mount-point", required_argument, NULL, 512}, > +#ifdef WITH_ANDROID > {"product-out", required_argument, NULL, 513}, > {"fs-config-file", required_argument, NULL, 514}, > {"block-list-file", required_argument, NULL, 515}, > @@ -103,9 +103,9 @@ static void usage(void) > #ifndef NDEBUG > " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" > #endif > + " --mount-point=X X=prefix of target fs path (default: /)\n" > #ifdef WITH_ANDROID > "\nwith following android-specific options:\n" > - " --mount-point=X X=prefix of target fs path (default: /)\n" > " --product-out=X 
X=product_out directory\n" > " --fs-config-file=X X=fs_config file\n" > " --block-list-file=X X=block_list file\n" > @@ -314,7 +314,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > case 10: > cfg.c_compress_hints_file = optarg; > break; > -#ifdef WITH_ANDROID > case 512: > cfg.mount_point = optarg; > /* all trailing '/' should be deleted */ > @@ -322,6 +321,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > if (opt && optarg[opt - 1] == '/') > optarg[opt - 1] = '\0'; > break; > +#ifdef WITH_ANDROID > case 513: > cfg.target_out_path = optarg; > break; > -- > 2.37.0.rc0.161.g10f37bed90-goog From xiang at kernel.org Mon Jul 4 06:54:13 2022 From: xiang at kernel.org (Gao Xiang) Date: Mon, 4 Jul 2022 04:54:13 +0800 Subject: [PATCH v2] Make --mount-point option generally available In-Reply-To: References: <20220701230030.2633151-1-zhangkelvin@google.com> Message-ID: On Sat, Jul 02, 2022 at 12:53:17PM +0800, Gao Xiang wrote: > On Fri, Jul 01, 2022 at 04:00:30PM -0700, Kelvin Zhang wrote: > > This option does not have any android specific dependencies. It is also > > useful for all selinux enabled fs images, so move it out of android > > specific feature sets. > > > > e.g. mkfs.erofs --file-contexts=selinux_context_file > > --mount_point=/product product.img your_product_out_dir > > > > Signed-off-by: Kelvin Zhang > > Apart from the subject without the prefix, Looks good to me, > > Reviewed-by: Gao Xiang > > Will play with it and apply later... I tried to apply it to each branch but without any luck. Would you mind rebasing it? 
Thanks, Gao Xiang > > Thanks, > Gao Xiang > > > > --- > > include/erofs/config.h | 2 +- > > lib/xattr.c | 2 -- > > mkfs/main.c | 6 +++--- > > 3 files changed, 4 insertions(+), 6 deletions(-) > > > > diff --git a/include/erofs/config.h b/include/erofs/config.h > > index 0a1b18b..030054b 100644 > > --- a/include/erofs/config.h > > +++ b/include/erofs/config.h > > @@ -65,8 +65,8 @@ struct erofs_configure { > > u32 c_dict_size; > > u64 c_unix_timestamp; > > u32 c_uid, c_gid; > > + const char *mount_point; > > #ifdef WITH_ANDROID > > - char *mount_point; > > char *target_out_path; > > char *fs_config_file; > > char *block_list_file; > > diff --git a/lib/xattr.c b/lib/xattr.c > > index 00fb963..cf5c447 100644 > > --- a/lib/xattr.c > > +++ b/lib/xattr.c > > @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, > > unsigned int len[2]; > > char *kvbuf, *fspath; > > > > -#ifdef WITH_ANDROID > > if (cfg.mount_point) > > ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, > > erofs_fspath(srcpath)); > > else > > -#endif > > ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); > > if (ret <= 0) > > return ERR_PTR(-ENOMEM); > > diff --git a/mkfs/main.c b/mkfs/main.c > > index b62a8aa..879c2f2 100644 > > --- a/mkfs/main.c > > +++ b/mkfs/main.c > > @@ -50,8 +50,8 @@ static struct option long_options[] = { > > {"quiet", no_argument, 0, 12}, > > {"blobdev", required_argument, NULL, 13}, > > {"ignore-mtime", no_argument, NULL, 14}, > > -#ifdef WITH_ANDROID > > {"mount-point", required_argument, NULL, 512}, > > +#ifdef WITH_ANDROID > > {"product-out", required_argument, NULL, 513}, > > {"fs-config-file", required_argument, NULL, 514}, > > {"block-list-file", required_argument, NULL, 515}, > > @@ -103,9 +103,9 @@ static void usage(void) > > #ifndef NDEBUG > > " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" > > #endif > > + " --mount-point=X X=prefix of target fs path (default: /)\n" > > #ifdef WITH_ANDROID > > 
"\nwith following android-specific options:\n" > > - " --mount-point=X X=prefix of target fs path (default: /)\n" > > " --product-out=X X=product_out directory\n" > > " --fs-config-file=X X=fs_config file\n" > > " --block-list-file=X X=block_list file\n" > > @@ -314,7 +314,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > > case 10: > > cfg.c_compress_hints_file = optarg; > > break; > > -#ifdef WITH_ANDROID > > case 512: > > cfg.mount_point = optarg; > > /* all trailing '/' should be deleted */ > > @@ -322,6 +321,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > > if (opt && optarg[opt - 1] == '/') > > optarg[opt - 1] = '\0'; > > break; > > +#ifdef WITH_ANDROID > > case 513: > > cfg.target_out_path = optarg; > > break; > > -- > > 2.37.0.rc0.161.g10f37bed90-goog From zbestahu at gmail.com Mon Jul 4 12:19:26 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 4 Jul 2022 10:19:26 +0800 Subject: [PATCH v2] Make --mount-point option generally available In-Reply-To: <20220701230030.2633151-1-zhangkelvin@google.com> References: <20220701230030.2633151-1-zhangkelvin@google.com> Message-ID: <20220704101926.0000504d.zbestahu@gmail.com> Hi Kelvin, On Fri, 1 Jul 2022 16:00:30 -0700 Kelvin Zhang via Linux-erofs wrote: > This option does not have any android specific dependencies. It is also > useful for all selinux enabled fs images, so move it out of android > specific feature sets. > > e.g. 
mkfs.erofs --file-contexts=selinux_context_file > --mount_point=/product product.img your_product_out_dir > > Signed-off-by: Kelvin Zhang > --- > include/erofs/config.h | 2 +- > lib/xattr.c | 2 -- > mkfs/main.c | 6 +++--- > 3 files changed, 4 insertions(+), 6 deletions(-) > > diff --git a/include/erofs/config.h b/include/erofs/config.h > index 0a1b18b..030054b 100644 > --- a/include/erofs/config.h > +++ b/include/erofs/config.h > @@ -65,8 +65,8 @@ struct erofs_configure { > u32 c_dict_size; > u64 c_unix_timestamp; > u32 c_uid, c_gid; > + const char *mount_point; > #ifdef WITH_ANDROID > - char *mount_point; > char *target_out_path; > char *fs_config_file; > char *block_list_file; > diff --git a/lib/xattr.c b/lib/xattr.c > index 00fb963..cf5c447 100644 > --- a/lib/xattr.c > +++ b/lib/xattr.c > @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, > unsigned int len[2]; > char *kvbuf, *fspath; > > -#ifdef WITH_ANDROID > if (cfg.mount_point) > ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, > erofs_fspath(srcpath)); > else > -#endif > ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); > if (ret <= 0) > return ERR_PTR(-ENOMEM); > diff --git a/mkfs/main.c b/mkfs/main.c > index b62a8aa..879c2f2 100644 > --- a/mkfs/main.c > +++ b/mkfs/main.c > @@ -50,8 +50,8 @@ static struct option long_options[] = { > {"quiet", no_argument, 0, 12}, > {"blobdev", required_argument, NULL, 13}, > {"ignore-mtime", no_argument, NULL, 14}, > -#ifdef WITH_ANDROID > {"mount-point", required_argument, NULL, 512}, > +#ifdef WITH_ANDROID > {"product-out", required_argument, NULL, 513}, > {"fs-config-file", required_argument, NULL, 514}, > {"block-list-file", required_argument, NULL, 515}, > @@ -103,9 +103,9 @@ static void usage(void) > #ifndef NDEBUG > " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" > #endif > + " --mount-point=X X=prefix of target fs path (default: /)\n" > #ifdef WITH_ANDROID > "\nwith following 
android-specific options:\n" > - " --mount-point=X X=prefix of target fs path (default: /)\n" > " --product-out=X X=product_out directory\n" > " --fs-config-file=X X=fs_config file\n" > " --block-list-file=X X=block_list file\n" > @@ -314,7 +314,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > case 10: > cfg.c_compress_hints_file = optarg; > break; > -#ifdef WITH_ANDROID > case 512: > cfg.mount_point = optarg; > /* all trailing '/' should be deleted */ > @@ -322,6 +321,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > if (opt && optarg[opt - 1] == '/') > optarg[opt - 1] = '\0'; > break; > +#ifdef WITH_ANDROID > case 513: > cfg.target_out_path = optarg; > break; As Xiang pointed out, it is common convention to prefix the subject line to let us distinguish from others more easily. From zhangkelvin at google.com Fri Jul 8 03:40:58 2022 From: zhangkelvin at google.com (Kelvin Zhang) Date: Thu, 7 Jul 2022 10:40:58 -0700 Subject: [PATCH v3] erofs-utils: Make --mount-point option generally available In-Reply-To: <20220704101926.0000504d.zbestahu@gmail.com> References: <20220704101926.0000504d.zbestahu@gmail.com> Message-ID: <20220707174058.1577159-1-zhangkelvin@google.com> This option does not have any android specific dependencies. It is also useful for all selinux enabled fs images, so move it out of android specific feature sets. e.g. 
mkfs.erofs --file-contexts=selinux_context_file --mount_point=/product product.img your_product_out_dir Signed-off-by: Kelvin Zhang --- include/erofs/config.h | 2 +- lib/xattr.c | 2 -- mkfs/main.c | 6 +++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/erofs/config.h b/include/erofs/config.h index 0d0916c..2daf46c 100644 --- a/include/erofs/config.h +++ b/include/erofs/config.h @@ -67,8 +67,8 @@ struct erofs_configure { u32 c_dict_size; u64 c_unix_timestamp; u32 c_uid, c_gid; + const char *mount_point; #ifdef WITH_ANDROID - char *mount_point; char *target_out_path; char *fs_config_file; char *block_list_file; diff --git a/lib/xattr.c b/lib/xattr.c index 71ffe3e..c8ce278 100644 --- a/lib/xattr.c +++ b/lib/xattr.c @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, unsigned int len[2]; char *kvbuf, *fspath; -#ifdef WITH_ANDROID if (cfg.mount_point) ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, erofs_fspath(srcpath)); else -#endif ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); if (ret <= 0) return ERR_PTR(-ENOMEM); diff --git a/mkfs/main.c b/mkfs/main.c index d2c9830..deb8e1f 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -51,8 +51,8 @@ static struct option long_options[] = { {"blobdev", required_argument, NULL, 13}, {"ignore-mtime", no_argument, NULL, 14}, {"preserve-mtime", no_argument, NULL, 15}, -#ifdef WITH_ANDROID {"mount-point", required_argument, NULL, 512}, +#ifdef WITH_ANDROID {"product-out", required_argument, NULL, 513}, {"fs-config-file", required_argument, NULL, 514}, {"block-list-file", required_argument, NULL, 515}, @@ -105,9 +105,9 @@ static void usage(void) #ifndef NDEBUG " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" #endif + " --mount-point=X X=prefix of target fs path (default: /)\n" #ifdef WITH_ANDROID "\nwith following android-specific options:\n" - " --mount-point=X X=prefix of target fs path (default: /)\n" " --product-out=X 
X=product_out directory\n" " --fs-config-file=X X=fs_config file\n" " --block-list-file=X X=block_list file\n" @@ -323,7 +323,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) case 10: cfg.c_compress_hints_file = optarg; break; -#ifdef WITH_ANDROID case 512: cfg.mount_point = optarg; /* all trailing '/' should be deleted */ @@ -331,6 +330,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) if (opt && optarg[opt - 1] == '/') optarg[opt - 1] = '\0'; break; +#ifdef WITH_ANDROID case 513: cfg.target_out_path = optarg; break; -- 2.37.0.rc0.161.g10f37bed90-goog From hsiangkao at linux.alibaba.com Fri Jul 8 03:57:04 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 8 Jul 2022 01:57:04 +0800 Subject: [PATCH v3] erofs-utils: Make --mount-point option generally available In-Reply-To: <20220707174058.1577159-1-zhangkelvin@google.com> References: <20220704101926.0000504d.zbestahu@gmail.com> <20220707174058.1577159-1-zhangkelvin@google.com> Message-ID: On Thu, Jul 07, 2022 at 10:40:58AM -0700, Kelvin Zhang wrote: > This option does not have any android specific dependencies. It is also > useful for all selinux enabled fs images, so move it out of android > specific feature sets. > > e.g. 
mkfs.erofs --file-contexts=selinux_context_file > --mount_point=/product product.img your_product_out_dir > > Signed-off-by: Kelvin Zhang Looks good to me, Reviewed-by: Gao Xiang Thanks, Gao Xiang > --- > include/erofs/config.h | 2 +- > lib/xattr.c | 2 -- > mkfs/main.c | 6 +++--- > 3 files changed, 4 insertions(+), 6 deletions(-) > > diff --git a/include/erofs/config.h b/include/erofs/config.h > index 0d0916c..2daf46c 100644 > --- a/include/erofs/config.h > +++ b/include/erofs/config.h > @@ -67,8 +67,8 @@ struct erofs_configure { > u32 c_dict_size; > u64 c_unix_timestamp; > u32 c_uid, c_gid; > + const char *mount_point; > #ifdef WITH_ANDROID > - char *mount_point; > char *target_out_path; > char *fs_config_file; > char *block_list_file; > diff --git a/lib/xattr.c b/lib/xattr.c > index 71ffe3e..c8ce278 100644 > --- a/lib/xattr.c > +++ b/lib/xattr.c > @@ -210,12 +210,10 @@ static struct xattr_item *erofs_get_selabel_xattr(const char *srcpath, > unsigned int len[2]; > char *kvbuf, *fspath; > > -#ifdef WITH_ANDROID > if (cfg.mount_point) > ret = asprintf(&fspath, "/%s/%s", cfg.mount_point, > erofs_fspath(srcpath)); > else > -#endif > ret = asprintf(&fspath, "/%s", erofs_fspath(srcpath)); > if (ret <= 0) > return ERR_PTR(-ENOMEM); > diff --git a/mkfs/main.c b/mkfs/main.c > index d2c9830..deb8e1f 100644 > --- a/mkfs/main.c > +++ b/mkfs/main.c > @@ -51,8 +51,8 @@ static struct option long_options[] = { > {"blobdev", required_argument, NULL, 13}, > {"ignore-mtime", no_argument, NULL, 14}, > {"preserve-mtime", no_argument, NULL, 15}, > -#ifdef WITH_ANDROID > {"mount-point", required_argument, NULL, 512}, > +#ifdef WITH_ANDROID > {"product-out", required_argument, NULL, 513}, > {"fs-config-file", required_argument, NULL, 514}, > {"block-list-file", required_argument, NULL, 515}, > @@ -105,9 +105,9 @@ static void usage(void) > #ifndef NDEBUG > " --random-pclusterblks randomize pclusterblks for big pcluster (debugging only)\n" > #endif > + " --mount-point=X X=prefix of target 
fs path (default: /)\n" > #ifdef WITH_ANDROID > "\nwith following android-specific options:\n" > - " --mount-point=X X=prefix of target fs path (default: /)\n" > " --product-out=X X=product_out directory\n" > " --fs-config-file=X X=fs_config file\n" > " --block-list-file=X X=block_list file\n" > @@ -323,7 +323,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > case 10: > cfg.c_compress_hints_file = optarg; > break; > -#ifdef WITH_ANDROID > case 512: > cfg.mount_point = optarg; > /* all trailing '/' should be deleted */ > @@ -331,6 +330,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > if (opt && optarg[opt - 1] == '/') > optarg[opt - 1] = '\0'; > break; > +#ifdef WITH_ANDROID > case 513: > cfg.target_out_path = optarg; > break; > -- > 2.37.0.rc0.161.g10f37bed90-goog From duguoweisz at gmail.com Fri Jul 8 13:11:55 2022 From: duguoweisz at gmail.com (Guowei Du) Date: Fri, 8 Jul 2022 11:11:55 +0800 Subject: [PATCH 2/2] erofs: sequence each shrink task Message-ID: <20220708031155.21878-1-duguoweisz@gmail.com> From: duguowei Because of 'list_move_tail', if two or more tasks are shrinking, there will be different results for them. For example: After the first round, if shrink_run_no of entry equals run_no of task, task will break directly at the beginning of next round; if they are not equal, task will continue to shrink until encounter one entry which has the same number. It is difficult to confirm the real results of all tasks, so add a lock to only allow one task to shrink at the same time. How to test: task1: root#echo 3 > /proc/sys/vm/drop_caches [743071.839051] Call Trace: [743071.839052] [743071.839054] do_shrink_slab+0x112/0x300 [743071.839058] shrink_slab+0x211/0x2a0 [743071.839060] drop_slab+0x72/0xe0 [743071.839061] drop_caches_sysctl_handler+0x50/0xb0 [743071.839063] proc_sys_call_handler+0x173/0x250 [743071.839066] proc_sys_write+0x13/0x20 [743071.839067] new_sync_write+0x104/0x180 [743071.839070] ? 
send_command+0xe0/0x270 [743071.839073] vfs_write+0x247/0x2a0 [743071.839074] ksys_write+0xa7/0xe0 [743071.839075] ? fpregs_assert_state_consistent+0x23/0x50 [743071.839078] __x64_sys_write+0x1a/0x20 [743071.839079] do_syscall_64+0x3a/0x80 [743071.839081] entry_SYSCALL_64_after_hwframe+0x46/0xb0 task2: root#echo 3 > /proc/sys/vm/drop_caches [743079.843214] Call Trace: [743079.843214] [743079.843215] do_shrink_slab+0x112/0x300 [743079.843219] shrink_slab+0x211/0x2a0 [743079.843221] drop_slab+0x72/0xe0 [743079.843222] drop_caches_sysctl_handler+0x50/0xb0 [743079.843224] proc_sys_call_handler+0x173/0x250 [743079.843227] proc_sys_write+0x13/0x20 [743079.843228] new_sync_write+0x104/0x180 [743079.843231] ? send_command+0xe0/0x270 [743079.843233] vfs_write+0x247/0x2a0 [743079.843234] ksys_write+0xa7/0xe0 [743079.843235] ? fpregs_assert_state_consistent+0x23/0x50 [743079.843238] __x64_sys_write+0x1a/0x20 [743079.843239] do_syscall_64+0x3a/0x80 [743079.843241] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Signed-off-by: duguowei --- fs/erofs/utils.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index ec9a1d780dc1..9eca13a7e594 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -186,6 +186,8 @@ static unsigned int shrinker_run_no; /* protects the mounted 'erofs_sb_list' */ static DEFINE_SPINLOCK(erofs_sb_list_lock); +/* sequence each shrink task */ +static DEFINE_SPINLOCK(erofs_sb_shrink_lock); static LIST_HEAD(erofs_sb_list); void erofs_shrinker_register(struct super_block *sb) @@ -226,13 +228,14 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, struct list_head *p; unsigned long nr = sc->nr_to_scan; - unsigned int run_no; unsigned long freed = 0; + spin_lock(&erofs_sb_shrink_lock); spin_lock(&erofs_sb_list_lock); - do { - run_no = ++shrinker_run_no; - } while (run_no == 0); + shrinker_run_no++; + /* if overflow, restarting from 1 */ + if (shrinker_run_no == 0) + shrinker_run_no = 1; 
/* Iterate over all mounted superblocks and try to shrink them */ p = erofs_sb_list.next; @@ -243,7 +246,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, * We move the ones we do to the end of the list, so we stop * when we see one we have already done. */ - if (sbi->shrinker_run_no == run_no) + if (sbi->shrinker_run_no == shrinker_run_no) break; if (!mutex_trylock(&sbi->umount_mutex)) { @@ -252,7 +255,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, } spin_unlock(&erofs_sb_list_lock); - sbi->shrinker_run_no = run_no; + sbi->shrinker_run_no = shrinker_run_no; freed += erofs_shrink_workstation(sbi, nr - freed); @@ -271,6 +274,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, break; } spin_unlock(&erofs_sb_list_lock); + spin_unlock(&erofs_sb_shrink_lock); return freed; } -- 2.36.1 From hsiangkao at linux.alibaba.com Fri Jul 8 13:25:17 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 8 Jul 2022 11:25:17 +0800 Subject: [PATCH 2/2] erofs: sequence each shrink task In-Reply-To: <20220708031155.21878-1-duguoweisz@gmail.com> References: <20220708031155.21878-1-duguoweisz@gmail.com> Message-ID: Hi Guowei, On Fri, Jul 08, 2022 at 11:11:55AM +0800, Guowei Du wrote: > From: duguowei > > Because of 'list_move_tail', if two or more tasks are shrinking, there > will be different results for them. Thanks for the patch. Two quick questions: 1) where is the PATCH 1/2; 2) What problem is the current patch trying to resolve... > > For example: > After the first round, if shrink_run_no of entry equals run_no of task, > task will break directly at the beginning of next round; if they are > not equal, task will continue to shrink until encounter one entry > which has the same number. > > It is difficult to confirm the real results of all tasks, so add a lock > to only allow one task to shrink at the same time. 
> > How to test: > task1: > root#echo 3 > /proc/sys/vm/drop_caches > [743071.839051] Call Trace: > [743071.839052] > [743071.839054] do_shrink_slab+0x112/0x300 > [743071.839058] shrink_slab+0x211/0x2a0 > [743071.839060] drop_slab+0x72/0xe0 > [743071.839061] drop_caches_sysctl_handler+0x50/0xb0 > [743071.839063] proc_sys_call_handler+0x173/0x250 > [743071.839066] proc_sys_write+0x13/0x20 > [743071.839067] new_sync_write+0x104/0x180 > [743071.839070] ? send_command+0xe0/0x270 > [743071.839073] vfs_write+0x247/0x2a0 > [743071.839074] ksys_write+0xa7/0xe0 > [743071.839075] ? fpregs_assert_state_consistent+0x23/0x50 > [743071.839078] __x64_sys_write+0x1a/0x20 > [743071.839079] do_syscall_64+0x3a/0x80 > [743071.839081] entry_SYSCALL_64_after_hwframe+0x46/0xb0 > > task2: > root#echo 3 > /proc/sys/vm/drop_caches > [743079.843214] Call Trace: > [743079.843214] > [743079.843215] do_shrink_slab+0x112/0x300 > [743079.843219] shrink_slab+0x211/0x2a0 > [743079.843221] drop_slab+0x72/0xe0 > [743079.843222] drop_caches_sysctl_handler+0x50/0xb0 > [743079.843224] proc_sys_call_handler+0x173/0x250 > [743079.843227] proc_sys_write+0x13/0x20 > [743079.843228] new_sync_write+0x104/0x180 > [743079.843231] ? send_command+0xe0/0x270 > [743079.843233] vfs_write+0x247/0x2a0 > [743079.843234] ksys_write+0xa7/0xe0 > [743079.843235] ? 
fpregs_assert_state_consistent+0x23/0x50 > [743079.843238] __x64_sys_write+0x1a/0x20 > [743079.843239] do_syscall_64+0x3a/0x80 > [743079.843241] entry_SYSCALL_64_after_hwframe+0x46/0xb0 > > Signed-off-by: duguowei > --- > fs/erofs/utils.c | 16 ++++++++++------ > 1 file changed, 10 insertions(+), 6 deletions(-) > > diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c > index ec9a1d780dc1..9eca13a7e594 100644 > --- a/fs/erofs/utils.c > +++ b/fs/erofs/utils.c > @@ -186,6 +186,8 @@ static unsigned int shrinker_run_no; > > /* protects the mounted 'erofs_sb_list' */ > static DEFINE_SPINLOCK(erofs_sb_list_lock); > +/* sequence each shrink task */ > +static DEFINE_SPINLOCK(erofs_sb_shrink_lock); > static LIST_HEAD(erofs_sb_list); > > void erofs_shrinker_register(struct super_block *sb) > @@ -226,13 +228,14 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, > struct list_head *p; > > unsigned long nr = sc->nr_to_scan; > - unsigned int run_no; > unsigned long freed = 0; > > + spin_lock(&erofs_sb_shrink_lock); Btw, we cannot make the whole shrinker under one spin_lock. Thanks, Gao Xiang From duguoweisz at gmail.com Fri Jul 8 14:49:07 2022 From: duguoweisz at gmail.com (guowei du) Date: Fri, 8 Jul 2022 12:49:07 +0800 Subject: [PATCH 2/2] erofs: sequence each shrink task In-Reply-To: References: <20220708031155.21878-1-duguoweisz@gmail.com> Message-ID: hi, I am sorry?there is only one patch. If two or more tasks are doing a shrinking job, there are different results for each task. That is to say, the status of each task is not persistent each time, although they have the same purpose to release memory. And, If two tasks are shrinking, the erofs_sb_list will become no order, because each task will move one sbi to tail, but sbi is random, so results are more complex. Because of the use of the local variable 'run_no', it took me a long time to understand the procedure of each task when they are concurrent. 
There is the same issue for other fs, such as fs/ubifs/shrink.c, fs/f2fs/shrink.c. If scan_objects costs a long time, it will trigger a watchdog; shrinking should not make work time-consuming. It should be done ASAP. So, I add a new spin lock to let tasks shrink fs sequentially, it will just make all tasks shrink one by one. Thanks very much. On Fri, Jul 8, 2022 at 11:25 AM Gao Xiang wrote: > Hi Guowei, > > On Fri, Jul 08, 2022 at 11:11:55AM +0800, Guowei Du wrote: > > From: duguowei > > > > Because of 'list_move_tail', if two or more tasks are shrinking, there > > will be different results for them. > > Thanks for the patch. Two quick questions: > 1) where is the PATCH 1/2; > 2) What problem is the current patch trying to resolve... > > > > > For example: > > After the first round, if shrink_run_no of entry equals run_no of task, > > task will break directly at the beginning of next round; if they are > > not equal, task will continue to shrink until encounter one entry > > which has the same number. > > > > It is difficult to confirm the real results of all tasks, so add a lock > > to only allow one task to shrink at the same time. > > > > How to test: > > task1: > > root#echo 3 > /proc/sys/vm/drop_caches > > [743071.839051] Call Trace: > > [743071.839052] > > [743071.839054] do_shrink_slab+0x112/0x300 > > [743071.839058] shrink_slab+0x211/0x2a0 > > [743071.839060] drop_slab+0x72/0xe0 > > [743071.839061] drop_caches_sysctl_handler+0x50/0xb0 > > [743071.839063] proc_sys_call_handler+0x173/0x250 > > [743071.839066] proc_sys_write+0x13/0x20 > > [743071.839067] new_sync_write+0x104/0x180 > > [743071.839070] ? send_command+0xe0/0x270 > > [743071.839073] vfs_write+0x247/0x2a0 > > [743071.839074] ksys_write+0xa7/0xe0 > > [743071.839075] ?
fpregs_assert_state_consistent+0x23/0x50 > > [743071.839078] __x64_sys_write+0x1a/0x20 > > [743071.839079] do_syscall_64+0x3a/0x80 > > [743071.839081] entry_SYSCALL_64_after_hwframe+0x46/0xb0 > > > > task2: > > root#echo 3 > /proc/sys/vm/drop_caches > > [743079.843214] Call Trace: > > [743079.843214] > > [743079.843215] do_shrink_slab+0x112/0x300 > > [743079.843219] shrink_slab+0x211/0x2a0 > > [743079.843221] drop_slab+0x72/0xe0 > > [743079.843222] drop_caches_sysctl_handler+0x50/0xb0 > > [743079.843224] proc_sys_call_handler+0x173/0x250 > > [743079.843227] proc_sys_write+0x13/0x20 > > [743079.843228] new_sync_write+0x104/0x180 > > [743079.843231] ? send_command+0xe0/0x270 > > [743079.843233] vfs_write+0x247/0x2a0 > > [743079.843234] ksys_write+0xa7/0xe0 > > [743079.843235] ? fpregs_assert_state_consistent+0x23/0x50 > > [743079.843238] __x64_sys_write+0x1a/0x20 > > [743079.843239] do_syscall_64+0x3a/0x80 > > [743079.843241] entry_SYSCALL_64_after_hwframe+0x46/0xb0 > > > > Signed-off-by: duguowei > > --- > > fs/erofs/utils.c | 16 ++++++++++------ > > 1 file changed, 10 insertions(+), 6 deletions(-) > > > > diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c > > index ec9a1d780dc1..9eca13a7e594 100644 > > --- a/fs/erofs/utils.c > > +++ b/fs/erofs/utils.c > > @@ -186,6 +186,8 @@ static unsigned int shrinker_run_no; > > > > /* protects the mounted 'erofs_sb_list' */ > > static DEFINE_SPINLOCK(erofs_sb_list_lock); > > +/* sequence each shrink task */ > > +static DEFINE_SPINLOCK(erofs_sb_shrink_lock); > > static LIST_HEAD(erofs_sb_list); > > > > void erofs_shrinker_register(struct super_block *sb) > > @@ -226,13 +228,14 @@ static unsigned long erofs_shrink_scan(struct > shrinker *shrink, > > struct list_head *p; > > > > unsigned long nr = sc->nr_to_scan; > > - unsigned int run_no; > > unsigned long freed = 0; > > > > + spin_lock(&erofs_sb_shrink_lock); > > Btw, we cannot make the whole shrinker under one spin_lock. 
> > Thanks, > Gao Xiang > -------------- next part -------------- An HTML attachment was scrubbed... URL: From hsiangkao at linux.alibaba.com Fri Jul 8 15:41:47 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 8 Jul 2022 13:41:47 +0800 Subject: [PATCH 2/2] erofs: sequence each shrink task In-Reply-To: References: <20220708031155.21878-1-duguoweisz@gmail.com> Message-ID: Hi, On Fri, Jul 08, 2022 at 12:49:07PM +0800, guowei du wrote: > hi, > I am sorry?there is only one patch. > > If two or more tasks are doing a shrinking job, there are different results > for each task. > That is to say, the status of each task is not persistent each time, > although they have > the same purpose to release memory. > > And, If two tasks are shrinking, the erofs_sb_list will become no order, > because each task will > move one sbi to tail, but sbi is random, so results are more complex. Thanks for the explanation. So it doesn't sound like a real issue but seems like an improvement? If it's more like this, you could patch this to the products first and beta for more time and see if it works well.. I'm more careful about such change to shrinker since it could impact downstream users... Yes, I know this behavior, but I'm not sure if it's needed to be treated as this way, because in principle shrinker can be processed by multiple tasks since otherwise it could be stuck by some low priority task (I remember it sometimes happens in Android.) > > Because of the use of the local variable 'run_no', it took me a long time > to understand the > procedure of each task when they are concurrent. > > There is the same issue for other fs, such as > fs/ubifs/shrink.c?fs/f2fs/shrink.c. > > If scan_objects cost a long time ,it will trigger a watchdog, shrinking > should > not make work time-consuming. It should be done ASAP. > So, I add a new spin lock to let tasks shrink fs sequentially, it will just > make all tasks shrink > one by one. 
Actually such shrinker is used for managed slots (sorry, I need more work to rename workgroup to such name). But currently one of my ongoing improvements is to remove pclusters immediately from managed slots if no compressed buffer is cached, so it's used for inflight I/Os (to merge decompression requests, including ongoing deduplication requests) and cached I/O only. So in that way objects will be far fewer than now. > > > Thanks very much. Thank you. Thanks, Gao Xiang From hsiangkao at linux.alibaba.com Fri Jul 8 20:10:01 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 8 Jul 2022 18:10:01 +0800 Subject: [PATCH] erofs: avoid consecutive detection for Highmem memory Message-ID: <20220708101001.21242-1-hsiangkao@linux.alibaba.com> Currently, vmap()s are avoided if physical addresses are consecutive for decompressed buffers. I observed that it is very common for 4KiB pclusters since the numbers of decompressed pages are almost 2 or 3. However, such detection doesn't work for Highmem pages on 32-bit machines, let's fix it now.
Reported-by: Liu Jinbao Fixes: 7fc45dbc938a ("staging: erofs: introduce generic decompression backend") Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6dca1900c733..45be8f4aeb68 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -91,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (page) { __clear_bit(j, bounced); - if (kaddr) { - if (kaddr + PAGE_SIZE == page_address(page)) + if (!PageHighMem(page)) { + if (!i) { + kaddr = page_address(page); + continue; + } + if (kaddr && + kaddr + PAGE_SIZE == page_address(page)) { kaddr += PAGE_SIZE; - else - kaddr = NULL; - } else if (!i) { - kaddr = page_address(page); + continue; + } } + kaddr = NULL; continue; } kaddr = NULL; -- 2.24.4 From duguoweisz at gmail.com Fri Jul 8 23:34:38 2022 From: duguoweisz at gmail.com (guowei du) Date: Fri, 8 Jul 2022 21:34:38 +0800 Subject: [PATCH 2/2] erofs: sequence each shrink task In-Reply-To: References: <20220708031155.21878-1-duguoweisz@gmail.com> Message-ID: Got it. Thanks very much. On Fri, Jul 8, 2022 at 1:41 PM Gao Xiang wrote: > Hi, > > On Fri, Jul 08, 2022 at 12:49:07PM +0800, guowei du wrote: > > hi, > > I am sorry?there is only one patch. > > > > If two or more tasks are doing a shrinking job, there are different > results > > for each task. > > That is to say, the status of each task is not persistent each time, > > although they have > > the same purpose to release memory. > > > > And, If two tasks are shrinking, the erofs_sb_list will become no order, > > because each task will > > move one sbi to tail, but sbi is random, so results are more complex. > > Thanks for the explanation. So it doesn't sound like a real issue but seems > like an improvement? 
If it's more like this, you could patch this to the > products first and beta for more time and see if it works well.. I'm > more careful about such change to shrinker since it could impact > downstream users... > > Yes, I know this behavior, but I'm not sure if it's needed to be treated > as this way, because in principle shrinker can be processed by multiple > tasks since otherwise it could be stuck by some low priority task (I > remember it sometimes happens in Android.) > > > > > Because of the use of the local variable 'run_no', it took me a long time > > to understand the > > procedure of each task when they are concurrent. > > > > There is the same issue for other fs, such as > > fs/ubifs/shrink.c?fs/f2fs/shrink.c. > > > > If scan_objects cost a long time ,it will trigger a watchdog, shrinking > > should > > not make work time-consuming. It should be done ASAP. > > So, I add a new spin lock to let tasks shrink fs sequentially, it will > just > > make all tasks shrink > > one by one. > > Actually such shrinker is used for managed slots (sorry I needs more > work to rename workgroup to such name). But currently one of my ongoing > improvements is to remove pclusters immediately from managed slots if > no compressed buffer is cached, so it's used for inflight I/Os (to merge > decompression requests, including ongoing deduplication requests) and > cached I/O only. So in that way objects will be more fewer than now. > > > > > > > Thanks very much. > > Thank you. > > Thanks, > Gao Xiang > > -------------- next part -------------- An HTML attachment was scrubbed... URL: From dhowells at redhat.com Sat Jul 9 07:32:31 2022 From: dhowells at redhat.com (David Howells) Date: Fri, 08 Jul 2022 22:32:31 +0100 Subject: [GIT PULL] fscache: Miscellaneous fixes Message-ID: <3753787.1657315951@warthog.procyon.org.uk> Hi Linus, Could you pull these fscache/cachefiles fixes please? (1) Fix a check in fscache_wait_on_volume_collision() in which the polarity is reversed. 
It should complain if a volume is still marked acquisition-pending after 20s, but instead complains if the mark has been cleared (ie. the condition has cleared). Also switch an open-coded test of the ACQUIRE_PENDING volume flag to use the helper function for consistency. (2) Not a fix per se, but neaten the code by using a helper to check for the DROPPED state. (3) Fix cachefiles's support for erofs to only flush requests associated with a released control file, not all requests. (4) Fix a race between one process invalidating an object in the cache and another process trying to look it up. Thanks, David --- The following changes since commit 03c765b0e3b4cb5063276b086c76f7a612856a9a: Linux 5.19-rc4 (2022-06-26 14:22:10 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git tags/fscache-fixes-20220708 for you to fetch changes up to 85e4ea1049c70fb99de5c6057e835d151fb647da: fscache: Fix invalidation/lookup race (2022-07-05 16:12:55 +0100) ---------------------------------------------------------------- fscache fixes ---------------------------------------------------------------- David Howells (1): fscache: Fix invalidation/lookup race Jia Zhu (1): cachefiles: narrow the scope of flushed requests when releasing fd Yue Hu (2): fscache: Fix if condition in fscache_wait_on_volume_collision() fscache: Introduce fscache_cookie_is_dropped() fs/cachefiles/ondemand.c | 3 ++- fs/fscache/cookie.c | 26 ++++++++++++++++++++++---- fs/fscache/volume.c | 4 ++-- include/linux/fscache.h | 1 + 4 files changed, 27 insertions(+), 7 deletions(-) From pr-tracker-bot at kernel.org Sat Jul 9 09:17:32 2022 From: pr-tracker-bot at kernel.org (pr-tracker-bot at kernel.org) Date: Fri, 08 Jul 2022 23:17:32 +0000 Subject: [GIT PULL] fscache: Miscellaneous fixes In-Reply-To: <3753787.1657315951@warthog.procyon.org.uk> References: <3753787.1657315951@warthog.procyon.org.uk> Message-ID: 
<165732225221.30799.13034712144647468572.pr-tracker-bot@kernel.org> The pull request you sent on Fri, 08 Jul 2022 22:32:31 +0100: > git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git tags/fscache-fixes-20220708 has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/e5524c2a1fc4002a52e16236659e779767617a4f Thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/prtracker.html From lkp at intel.com Sat Jul 9 21:29:20 2022 From: lkp at intel.com (kernel test robot) Date: Sat, 09 Jul 2022 19:29:20 +0800 Subject: [xiang-erofs:dev-test] BUILD SUCCESS 448b5a1548d87c246c3d0c3df8480d3c6eb6c11a Message-ID: <62c96690.oyN429zTAX2sT77m%lkp@intel.com> tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test branch HEAD: 448b5a1548d87c246c3d0c3df8480d3c6eb6c11a erofs: avoid consecutive detection for Highmem memory elapsed time: 721m configs tested: 89 configs skipped: 2 The following configs have been built successfully. More configs may be tested in the coming days. 
gcc tested configs: arm64 allyesconfig arm defconfig arm allyesconfig i386 randconfig-c001 m68k apollo_defconfig arm footbridge_defconfig microblaze mmu_defconfig sh rsk7264_defconfig arc nsimosci_hs_smp_defconfig ia64 tiger_defconfig sh edosk7705_defconfig arm corgi_defconfig sh se7721_defconfig sh se7750_defconfig mips rt305x_defconfig alpha alldefconfig xtensa virt_defconfig m68k amcore_defconfig arm integrator_defconfig arm lart_defconfig mips bmips_be_defconfig arm clps711x_defconfig arm oxnas_v6_defconfig arm hisi_defconfig parisc64 defconfig nios2 alldefconfig mips maltasmvp_defconfig riscv nommu_virt_defconfig riscv rv32_defconfig riscv nommu_k210_defconfig riscv allnoconfig i386 debian-10.3-kselftests i386 debian-10.3 ia64 allmodconfig alpha allyesconfig m68k allmodconfig arc allyesconfig m68k allyesconfig powerpc allnoconfig mips allyesconfig powerpc allmodconfig sh allmodconfig i386 allyesconfig i386 defconfig x86_64 randconfig-a002 x86_64 randconfig-a004 x86_64 randconfig-a006 i386 randconfig-a001 i386 randconfig-a003 i386 randconfig-a005 x86_64 randconfig-a015 x86_64 randconfig-a013 x86_64 randconfig-a011 i386 randconfig-a012 i386 randconfig-a014 i386 randconfig-a016 arc randconfig-r043-20220707 riscv randconfig-r042-20220707 s390 randconfig-r044-20220707 x86_64 rhel-8.3-kselftests um i386_defconfig um x86_64_defconfig x86_64 defconfig x86_64 allyesconfig x86_64 rhel-8.3 x86_64 rhel-8.3-func x86_64 rhel-8.3-kunit x86_64 rhel-8.3-syz clang tested configs: hexagon alldefconfig arm milbeaut_m10v_defconfig powerpc walnut_defconfig powerpc g5_defconfig mips rbtx49xx_defconfig arm palmz72_defconfig x86_64 randconfig-k001 x86_64 randconfig-a001 x86_64 randconfig-a003 x86_64 randconfig-a005 i386 randconfig-a002 i386 randconfig-a004 i386 randconfig-a006 x86_64 randconfig-a012 x86_64 randconfig-a014 x86_64 randconfig-a016 i386 randconfig-a013 i386 randconfig-a011 i386 randconfig-a015 hexagon randconfig-r045-20220707 hexagon randconfig-r041-20220707 -- 0-DAY CI 
Kernel Test Service https://01.org/lkp From lihe at uniontech.com Mon Jul 11 12:47:17 2022 From: lihe at uniontech.com (Li He) Date: Mon, 11 Jul 2022 10:47:17 +0800 Subject: [PATCH] erofs-utils: fuse: support offset when read image Message-ID: <20220711024717.5554-1-lihe@uniontech.com> Add --offset to erofsfuse to skip bytes at the start of the image file. Signed-off-by: Li He --- fuse/main.c | 6 ++++++ include/erofs/config.h | 3 +++ lib/io.c | 2 ++ 3 files changed, 11 insertions(+) diff --git a/fuse/main.c b/fuse/main.c index f4c2476..a2a6449 100644 --- a/fuse/main.c +++ b/fuse/main.c @@ -151,6 +151,7 @@ static struct fuse_operations erofs_ops = { static struct options { const char *disk; const char *mountpoint; + u64 offset; unsigned int debug_lvl; bool show_help; bool odebug; @@ -158,6 +159,7 @@ static struct options { #define OPTION(t, p) { t, offsetof(struct options, p), 1 } static const struct fuse_opt option_spec[] = { + OPTION("--offset=%lu", offset), OPTION("--dbglevel=%u", debug_lvl), OPTION("--help", show_help), FUSE_OPT_KEY("--device=", 1), @@ -170,6 +172,7 @@ static void usage(void) fputs("usage: [options] IMAGE MOUNTPOINT\n\n" "Options:\n" + " --offset=# # bytes to skip when read IMAGE\n" " --dbglevel=# set output message level to # (maximum 9)\n" " --device=# specify an extra device to be used together\n" #if FUSE_MAJOR_VERSION < 3 @@ -190,6 +193,7 @@ static void usage(void) static void erofsfuse_dumpcfg(void) { erofs_dump("disk: %s\n", fusecfg.disk); + erofs_dump("offset: %lu\n", fusecfg.offset); erofs_dump("mountpoint: %s\n", fusecfg.mountpoint); erofs_dump("dbglevel: %u\n", cfg.c_dbg_lvl); } @@ -279,6 +283,8 @@ int main(int argc, char *argv[]) if (fusecfg.odebug && cfg.c_dbg_lvl < EROFS_DBG) cfg.c_dbg_lvl = EROFS_DBG; + cfg.c_offset = fusecfg.offset; + erofsfuse_dumpcfg(); ret = dev_open_ro(fusecfg.disk); if (ret) { diff --git a/include/erofs/config.h b/include/erofs/config.h index 0d0916c..8b6f7db 100644 --- a/include/erofs/config.h +++ 
b/include/erofs/config.h @@ -73,6 +73,9 @@ struct erofs_configure { char *fs_config_file; char *block_list_file; #endif + + /* offset when read mutli partiton image */ + u64 c_offset; }; extern struct erofs_configure cfg; diff --git a/lib/io.c b/lib/io.c index 9c663c5..524cfb4 100644 --- a/lib/io.c +++ b/lib/io.c @@ -261,6 +261,8 @@ int dev_read(int device_id, void *buf, u64 offset, size_t len) if (cfg.c_dry_run) return 0; + offset += cfg.c_offset; + if (!buf) { erofs_err("buf is NULL"); return -EINVAL; -- 2.20.1 From hsiangkao at linux.alibaba.com Mon Jul 11 13:52:46 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Mon, 11 Jul 2022 11:52:46 +0800 Subject: [PATCH] erofs-utils: fuse: support offset when read image In-Reply-To: <20220711024717.5554-1-lihe@uniontech.com> References: <20220711024717.5554-1-lihe@uniontech.com> Message-ID: Hi He, On Mon, Jul 11, 2022 at 10:47:17AM +0800, Li He wrote: > Add --offset to erofsfuse to skip bytes at the start of the image file. > > Signed-off-by: Li He Thanks for the patch! The patch roughly looks good to me, some nit as below.. > --- > fuse/main.c | 6 ++++++ > include/erofs/config.h | 3 +++ > lib/io.c | 2 ++ > 3 files changed, 11 insertions(+) > > diff --git a/fuse/main.c b/fuse/main.c > index f4c2476..a2a6449 100644 > --- a/fuse/main.c > +++ b/fuse/main.c > @@ -151,6 +151,7 @@ static struct fuse_operations erofs_ops = { > static struct options { > const char *disk; > const char *mountpoint; > + u64 offset; We can use cfg.c_offset directly instead it seems? 
> unsigned int debug_lvl; > bool show_help; > bool odebug; > @@ -158,6 +159,7 @@ static struct options { > > #define OPTION(t, p) { t, offsetof(struct options, p), 1 } > static const struct fuse_opt option_spec[] = { > + OPTION("--offset=%lu", offset), > OPTION("--dbglevel=%u", debug_lvl), > OPTION("--help", show_help), > FUSE_OPT_KEY("--device=", 1), > @@ -170,6 +172,7 @@ static void usage(void) > > fputs("usage: [options] IMAGE MOUNTPOINT\n\n" > "Options:\n" > + " --offset=# # bytes to skip when read IMAGE\n" need to update manpage as well... Thanks, Gao Xiang > " --dbglevel=# set output message level to # (maximum 9)\n" > " --device=# specify an extra device to be used together\n" > #if FUSE_MAJOR_VERSION < 3 > @@ -190,6 +193,7 @@ static void usage(void) > static void erofsfuse_dumpcfg(void) > { > erofs_dump("disk: %s\n", fusecfg.disk); > + erofs_dump("offset: %lu\n", fusecfg.offset); > erofs_dump("mountpoint: %s\n", fusecfg.mountpoint); > erofs_dump("dbglevel: %u\n", cfg.c_dbg_lvl); > } > @@ -279,6 +283,8 @@ int main(int argc, char *argv[]) > if (fusecfg.odebug && cfg.c_dbg_lvl < EROFS_DBG) > cfg.c_dbg_lvl = EROFS_DBG; > > + cfg.c_offset = fusecfg.offset; > + > erofsfuse_dumpcfg(); > ret = dev_open_ro(fusecfg.disk); > if (ret) { > diff --git a/include/erofs/config.h b/include/erofs/config.h > index 0d0916c..8b6f7db 100644 > --- a/include/erofs/config.h > +++ b/include/erofs/config.h > @@ -73,6 +73,9 @@ struct erofs_configure { > char *fs_config_file; > char *block_list_file; > #endif > + > + /* offset when read mutli partiton image */ > + u64 c_offset; > }; > > extern struct erofs_configure cfg; > diff --git a/lib/io.c b/lib/io.c > index 9c663c5..524cfb4 100644 > --- a/lib/io.c > +++ b/lib/io.c > @@ -261,6 +261,8 @@ int dev_read(int device_id, void *buf, u64 offset, size_t len) > if (cfg.c_dry_run) > return 0; > > + offset += cfg.c_offset; > + > if (!buf) { > erofs_err("buf is NULL"); > return -EINVAL; > -- > 2.20.1 > > From lihe at uniontech.com Mon Jul 11 
18:54:59 2022 From: lihe at uniontech.com (Li He) Date: Mon, 11 Jul 2022 16:54:59 +0800 Subject: [PATCH] erofs-utils: fuse: support offset when read image In-Reply-To: References: Message-ID: <20220711085459.19730-1-lihe@uniontech.com> Add --offset to erofsfuse to skip bytes at the start of the image file. Signed-off-by: Li He --- We should add offset to fuse_operations so erofsfuse can parse it from command line. fuse_opt_parse can not parse cfg directly. Changes since v0 + Add manpage for erofsfuse offset option fuse/main.c | 6 ++++++ include/erofs/config.h | 3 +++ lib/io.c | 2 ++ man/erofsfuse.1 | 3 +++ 4 files changed, 14 insertions(+) diff --git a/fuse/main.c b/fuse/main.c index f4c2476..a2a6449 100644 --- a/fuse/main.c +++ b/fuse/main.c @@ -151,6 +151,7 @@ static struct fuse_operations erofs_ops = { static struct options { const char *disk; const char *mountpoint; + u64 offset; unsigned int debug_lvl; bool show_help; bool odebug; @@ -158,6 +159,7 @@ static struct options { #define OPTION(t, p) { t, offsetof(struct options, p), 1 } static const struct fuse_opt option_spec[] = { + OPTION("--offset=%lu", offset), OPTION("--dbglevel=%u", debug_lvl), OPTION("--help", show_help), FUSE_OPT_KEY("--device=", 1), @@ -170,6 +172,7 @@ static void usage(void) fputs("usage: [options] IMAGE MOUNTPOINT\n\n" "Options:\n" + " --offset=# # bytes to skip when read IMAGE\n" " --dbglevel=# set output message level to # (maximum 9)\n" " --device=# specify an extra device to be used together\n" #if FUSE_MAJOR_VERSION < 3 @@ -190,6 +193,7 @@ static void usage(void) static void erofsfuse_dumpcfg(void) { erofs_dump("disk: %s\n", fusecfg.disk); + erofs_dump("offset: %lu\n", fusecfg.offset); erofs_dump("mountpoint: %s\n", fusecfg.mountpoint); erofs_dump("dbglevel: %u\n", cfg.c_dbg_lvl); } @@ -279,6 +283,8 @@ int main(int argc, char *argv[]) if (fusecfg.odebug && cfg.c_dbg_lvl < EROFS_DBG) cfg.c_dbg_lvl = EROFS_DBG; + cfg.c_offset = fusecfg.offset; + erofsfuse_dumpcfg(); ret = 
dev_open_ro(fusecfg.disk); if (ret) { diff --git a/include/erofs/config.h b/include/erofs/config.h index 0d0916c..8b6f7db 100644 --- a/include/erofs/config.h +++ b/include/erofs/config.h @@ -73,6 +73,9 @@ struct erofs_configure { char *fs_config_file; char *block_list_file; #endif + + /* offset when read mutli partiton image */ + u64 c_offset; }; extern struct erofs_configure cfg; diff --git a/lib/io.c b/lib/io.c index 9c663c5..524cfb4 100644 --- a/lib/io.c +++ b/lib/io.c @@ -261,6 +261,8 @@ int dev_read(int device_id, void *buf, u64 offset, size_t len) if (cfg.c_dry_run) return 0; + offset += cfg.c_offset; + if (!buf) { erofs_err("buf is NULL"); return -EINVAL; diff --git a/man/erofsfuse.1 b/man/erofsfuse.1 index 9db6827..1d47163 100644 --- a/man/erofsfuse.1 +++ b/man/erofsfuse.1 @@ -26,6 +26,9 @@ warning messages. .BI "\-\-device=" path Specify an extra device to be used together. You may give multiple `--device' options in the correct order. +.TP +.BI "\-\-offset=" # +Specify offset bytes to skip when read image file. The default is 0. .SS "FUSE options:" .TP \fB-d -o\fR debug -- 2.20.1 From zbestahu at gmail.com Mon Jul 11 19:09:55 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 11 Jul 2022 17:09:55 +0800 Subject: [RFC PATCH 0/3] erofs-utils: compressed fragments feature Message-ID: In order to achieve greater compression ratio, let's introduce compressed fragments feature which can merge tail of per-file or the whole files into one special inode to reach the target. And we can also set pcluster size to fragments inode for different compression requirments. In this patchset, we also improve the uncompressed data layout of compressed files. Just write it from 'clusterofs' instead of 0 since it can benefit from in-place I/O. For now, it only goes with fragments. The main idea above is from Xiang. 
Here is some test data of Linux 5.10.87 source code under Ubuntu 18.04: linux-5.10.87 (erofs, uncompressed) 1.1G linux-5.10.87 (erofs, lz4hc,12 4k fragments,4k) 304M linux-5.10.87 (erofs, lz4hc,12 8k fragments,8k) 271M linux-5.10.87 (erofs, lz4hc,12 16k fragments,16k) 245M linux-5.10.87 (erofs, lz4hc,12 32k fragments,32k) 228M linux-5.10.87 (erofs, lz4hc,12 64k fragments,64k) 220M linux-5.10.87 (erofs, lz4hc,12 4k vanilla) 396M linux-5.10.87 (erofs, lz4hc,12 8k vanilla) 376M linux-5.10.87 (erofs, lz4hc,12 16k vanilla) 364M linux-5.10.87 (erofs, lz4hc,12 32k vanilla) 359M linux-5.10.87 (erofs, lz4hc,12 64k vanilla) 358M Usage: mkfs.erofs -zlz4hc,12 -C65536 -Efragments,65536 foo.erofs.img foo/ Yue Hu (3): erofs-utils: lib: add support for fragments data decompression erofs-utils: lib: support on-disk offset for shifted decompression erofs-utils: introduce compressed fragments support include/erofs/compress.h | 2 +- include/erofs/config.h | 3 +- include/erofs/decompress.h | 3 ++ include/erofs/fragments.h | 25 ++++++++++ include/erofs/inode.h | 2 + include/erofs/internal.h | 9 ++++ include/erofs_fs.h | 20 ++++++-- lib/Makefile.am | 4 +- lib/compress.c | 94 ++++++++++++++++++++++++++++---------- lib/data.c | 33 ++++++++++++- lib/decompress.c | 10 +++- lib/fragments.c | 77 +++++++++++++++++++++++++++++++ lib/inode.c | 50 +++++++++++++------- lib/super.c | 1 + lib/zmap.c | 14 ++++++ mkfs/main.c | 63 ++++++++++++++++++++++--- 16 files changed, 352 insertions(+), 58 deletions(-) create mode 100644 include/erofs/fragments.h create mode 100644 lib/fragments.c -- 2.17.1 From zbestahu at gmail.com Mon Jul 11 19:09:56 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 11 Jul 2022 17:09:56 +0800 Subject: [RFC PATCH 1/3] erofs-utils: lib: add support for fragments data decompression In-Reply-To: References: Message-ID: Add compressed fragments support for erofsfuse. 
Signed-off-by: Yue Hu --- include/erofs/internal.h | 8 ++++++++ include/erofs_fs.h | 15 +++++++++++---- lib/data.c | 26 ++++++++++++++++++++++++++ lib/zmap.c | 14 ++++++++++++++ 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/include/erofs/internal.h b/include/erofs/internal.h index 6a70f11..129ea54 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -102,6 +102,7 @@ struct erofs_sb_info { u16 devt_slotoff; /* used for mkfs */ u16 device_id_mask; /* used for others */ }; + erofs_nid_t fragments_nid; }; /* global sbi */ @@ -132,6 +133,7 @@ EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) +EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) #define EROFS_I_EA_INITED (1 << 0) @@ -190,6 +192,8 @@ struct erofs_inode { void *eof_tailraw; unsigned int eof_tailrawsize; + erofs_off_t fragmentoff; + union { void *compressmeta; void *chunkindexes; @@ -201,6 +205,7 @@ struct erofs_inode { uint64_t z_tailextent_headlcn; unsigned int z_idataoff; #define z_idata_size idata_size +#define z_fragmentoff fragmentoff }; }; #ifdef WITH_ANDROID @@ -276,6 +281,7 @@ enum { BH_Mapped, BH_Encoded, BH_FullMapped, + BH_Fragments, }; /* Has a disk mapping */ @@ -286,6 +292,8 @@ enum { #define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +/* Located in fragments */ +#define EROFS_MAP_FRAGMENTS (1 << BH_Fragments) struct erofs_map_blocks { char mpage[EROFS_BLKSIZ]; diff --git a/include/erofs_fs.h b/include/erofs_fs.h index 08f9761..4a0c74b 100644 --- a/include/erofs_fs.h +++ b/include/erofs_fs.h @@ -25,13 +25,15 @@ #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define 
EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 +#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING) + EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ + EROFS_FEATURE_INCOMPAT_FRAGMENTS) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -294,16 +296,21 @@ struct z_erofs_lzma_cfgs { * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) + * bit 4 : fragment pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 +#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0010 struct z_erofs_map_header { - __le16 h_reserved1; - /* record the size of tailpacking data */ - __le16 h_idata_size; + union { + /* direct addressing for fragment offset */ + __le32 h_fragmentoff; + /* record the size of tailpacking data */ + __le16 h_idata_size; + }; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); diff --git a/lib/data.c b/lib/data.c index 6bc554d..c2ecbc9 100644 --- a/lib/data.c +++ b/lib/data.c @@ -275,6 +275,32 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer, continue; } + if (map.m_flags & EROFS_MAP_FRAGMENTS) { + struct erofs_inode fragments = { + .nid = sbi.fragments_nid, + }; + char *out; + + ret = erofs_read_inode_from_disk(&fragments); + if (ret) + break; + out = malloc(length); + if (!out) { + ret = -ENOMEM; + break; + } + ret = z_erofs_read_data(&fragments, out, length, + inode->z_fragmentoff); + if (ret < 0) { + free(out); + break; + } + memcpy(buffer + end - offset, out + 
skip, length - + skip); + free(out); + continue; + } + if (map.m_plen > bufsize) { bufsize = map.m_plen; raw = realloc(raw, bufsize); diff --git a/lib/zmap.c b/lib/zmap.c index 95745c5..2b45a4f 100644 --- a/lib/zmap.c +++ b/lib/zmap.c @@ -83,6 +83,17 @@ static int z_erofs_fill_inode_lazy(struct erofs_inode *vi) if (ret < 0) return ret; } + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) { + struct erofs_map_blocks map = { .index = UINT_MAX }; + + ret = z_erofs_do_map_blocks(vi, &map, + EROFS_GET_BLOCKS_FINDTAIL); + if (ret < 0) { + erofs_err("failed to find tail for fragment pcluster"); + return ret; + } + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + } vi->flags |= EROFS_I_Z_INITED; return 0; } @@ -546,6 +557,7 @@ static int z_erofs_do_map_blocks(struct erofs_inode *vi, int flags) { bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + bool in_fragments = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = vi, .map = map, @@ -609,6 +621,8 @@ static int z_erofs_do_map_blocks(struct erofs_inode *vi, map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; + } else if (in_fragments && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_FRAGMENTS; } else { map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); -- 2.17.1 From zbestahu at gmail.com Mon Jul 11 19:09:57 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 11 Jul 2022 17:09:57 +0800 Subject: [RFC PATCH 2/3] erofs-utils: lib: support on-disk offset for shifted decompression In-Reply-To: References: Message-ID: <3b38c6f8c6e16b74042602144997e62bdd3259da.1657530420.git.huyue2@coolpad.com> Add support to uncompressed data layout with on-disk offset for compressed files. 
Signed-off-by: Yue Hu --- include/erofs/decompress.h | 3 +++ lib/data.c | 8 +++++++- lib/decompress.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/erofs/decompress.h b/include/erofs/decompress.h index 82bf7b8..b622df5 100644 --- a/include/erofs/decompress.h +++ b/include/erofs/decompress.h @@ -23,6 +23,9 @@ struct z_erofs_decompress_req { unsigned int decodedskip; unsigned int inputsize, decodedlength; + /* head offset of uncompressed data */ + unsigned int shiftedhead; + /* indicate the algorithm will be used for decompression */ unsigned int alg; bool partial_decoding; diff --git a/lib/data.c b/lib/data.c index c2ecbc9..5e44db9 100644 --- a/lib/data.c +++ b/lib/data.c @@ -226,7 +226,7 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer, }; struct erofs_map_dev mdev; bool partial; - unsigned int bufsize = 0; + unsigned int bufsize = 0, head; char *raw = NULL; int ret = 0; @@ -313,10 +313,16 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer, if (ret < 0) break; + head = 0; + if (erofs_sb_has_fragments() && + map.m_algorithmformat == Z_EROFS_COMPRESSION_SHIFTED) + head = erofs_blkoff(end); + ret = z_erofs_decompress(&(struct z_erofs_decompress_req) { .in = raw, .out = buffer + end - offset, .decodedskip = skip, + .shiftedhead = head, .inputsize = map.m_plen, .decodedlength = length, .alg = map.m_algorithmformat, diff --git a/lib/decompress.c b/lib/decompress.c index 1661f91..08a0861 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -132,14 +132,20 @@ out: int z_erofs_decompress(struct z_erofs_decompress_req *rq) { if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { + unsigned int count, rightpart; + if (rq->inputsize > EROFS_BLKSIZ) return -EFSCORRUPTED; DBG_BUGON(rq->decodedlength > EROFS_BLKSIZ); DBG_BUGON(rq->decodedlength < rq->decodedskip); - memcpy(rq->out, rq->in + rq->decodedskip, - rq->decodedlength - rq->decodedskip); + count = rq->decodedlength - rq->decodedskip; + 
rightpart = min(EROFS_BLKSIZ - rq->shiftedhead, count); + + memcpy(rq->out, rq->in + (erofs_sb_has_fragments() ? + rq->shiftedhead : rq->decodedskip), rightpart); + memcpy(rq->out + rightpart, rq->in, count - rightpart); return 0; } -- 2.17.1 From zbestahu at gmail.com Mon Jul 11 19:09:58 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 11 Jul 2022 17:09:58 +0800 Subject: [RFC PATCH 3/3] erofs-utils: introduce compressed fragments support In-Reply-To: References: Message-ID: <732d94e6234a95c8820b1fd079ec4d7b95f1b3a1.1657530420.git.huyue2@coolpad.com> This approach can merge tail pclusters or the whole files into a special inode in order to achieve greater compression ratio. And an option of pcluster size is provided for different compression requirements. Meanwhile, we change to write the uncompressed data from 'clusterofs' when compressing files since it can benefit from in-place I/O. For now, this change goes with the fragments. Signed-off-by: Yue Hu --- include/erofs/compress.h | 2 +- include/erofs/config.h | 3 +- include/erofs/fragments.h | 25 +++++++++++ include/erofs/inode.h | 2 + include/erofs/internal.h | 1 + include/erofs_fs.h | 5 ++- lib/Makefile.am | 4 +- lib/compress.c | 94 ++++++++++++++++++++++++++++----------- lib/fragments.c | 76 +++++++++++++++++++++++++++++++ lib/inode.c | 50 ++++++++++++++------- lib/super.c | 1 + mkfs/main.c | 63 +++++++++++++++++++++++--- 12 files changed, 275 insertions(+), 51 deletions(-) create mode 100644 include/erofs/fragments.h create mode 100644 lib/fragments.c diff --git a/include/erofs/compress.h b/include/erofs/compress.h index 24f6204..fecc316 100644 --- a/include/erofs/compress.h +++ b/include/erofs/compress.h @@ -18,7 +18,7 @@ extern "C" #define EROFS_CONFIG_COMPR_MIN_SZ (32 * 1024) void z_erofs_drop_inline_pcluster(struct erofs_inode *inode); -int erofs_write_compressed_file(struct erofs_inode *inode); +int erofs_write_compressed_file_from_fd(struct erofs_inode *inode, int fd); int z_erofs_compress_init(struct 
erofs_buffer_head *bh); int z_erofs_compress_exit(void); diff --git a/include/erofs/config.h b/include/erofs/config.h index 0d0916c..5b83419 100644 --- a/include/erofs/config.h +++ b/include/erofs/config.h @@ -44,6 +44,7 @@ struct erofs_configure { char c_chunkbits; bool c_noinline_data; bool c_ztailpacking; + bool c_fragments; bool c_ignore_mtime; bool c_showprogress; @@ -62,7 +63,7 @@ struct erofs_configure { /* < 0, xattr disabled and INT_MAX, always use inline xattrs */ int c_inline_xattr_tolerance; - u32 c_pclusterblks_max, c_pclusterblks_def; + u32 c_pclusterblks_max, c_pclusterblks_def, c_pclusterblks_frags; u32 c_max_decompressed_extent_bytes; u32 c_dict_size; u64 c_unix_timestamp; diff --git a/include/erofs/fragments.h b/include/erofs/fragments.h new file mode 100644 index 0000000..89f0f18 --- /dev/null +++ b/include/erofs/fragments.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */ +/* + * Copyright (C), 2022, Coolpad Group Limited. + */ +#ifndef __EROFS_FRAGMENTS_H +#define __EROFS_FRAGMENTS_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "erofs/internal.h" + +int z_erofs_fill_fragments(struct erofs_inode *inode, void *data, + unsigned int len); +struct erofs_inode *erofs_mkfs_build_fragments(void); +int erofs_fragments_init(void); +void erofs_fragments_exit(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/erofs/inode.h b/include/erofs/inode.h index 79b39b0..0a87c58 100644 --- a/include/erofs/inode.h +++ b/include/erofs/inode.h @@ -21,6 +21,8 @@ unsigned int erofs_iput(struct erofs_inode *inode); erofs_nid_t erofs_lookupnid(struct erofs_inode *inode); struct erofs_inode *erofs_mkfs_build_tree_from_path(struct erofs_inode *parent, const char *path); +int erofs_prepare_inode_buffer(struct erofs_inode *inode); +struct erofs_inode *erofs_generate_inode(struct stat64 *st, const char *path); #ifdef __cplusplus } diff --git a/include/erofs/internal.h b/include/erofs/internal.h index 129ea54..5a7b2fa 100644 
--- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -193,6 +193,7 @@ struct erofs_inode { unsigned int eof_tailrawsize; erofs_off_t fragmentoff; + unsigned int fragment_size; union { void *compressmeta; diff --git a/include/erofs_fs.h b/include/erofs_fs.h index 4a0c74b..4fc2756 100644 --- a/include/erofs_fs.h +++ b/include/erofs_fs.h @@ -75,7 +75,9 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved2[38]; + __u8 reserved[6]; + __le64 fragments_nid; + __u8 reserved2[24]; }; /* @@ -265,6 +267,7 @@ struct erofs_inode_chunk_index { /* maximum supported size of a physical compression cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) +#define Z_EROFS_PCLUSTER_MAX_BLKS (Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) /* available compression algorithm types (for h_algorithmtype) */ enum { diff --git a/lib/Makefile.am b/lib/Makefile.am index 3fad357..95f1d55 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -22,12 +22,14 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \ $(top_srcdir)/include/erofs/trace.h \ $(top_srcdir)/include/erofs/xattr.h \ $(top_srcdir)/include/erofs/compress_hints.h \ + $(top_srcdir)/include/erofs/fragments.h \ $(top_srcdir)/lib/liberofs_private.h noinst_HEADERS += compressor.h liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \ namei.c data.c compress.c compressor.c zmap.c decompress.c \ - compress_hints.c hashmap.c sha256.c blobchunk.c dir.c + compress_hints.c hashmap.c sha256.c blobchunk.c dir.c \ + fragments.c liberofs_la_CFLAGS = -Wall -I$(top_srcdir)/include if ENABLE_LZ4 liberofs_la_CFLAGS += ${LZ4_CFLAGS} diff --git a/lib/compress.c b/lib/compress.c index ee3b856..145e83e 100644 --- a/lib/compress.c +++ b/lib/compress.c @@ -18,6 +18,7 @@ #include "compressor.h" #include "erofs/block_list.h" #include "erofs/compress_hints.h" +#include 
"erofs/fragments.h" static struct erofs_compress compresshandle; static unsigned int algorithmtype[2]; @@ -35,6 +36,11 @@ struct z_erofs_vle_compress_ctx { #define Z_EROFS_LEGACY_MAP_HEADER_SIZE \ (sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) +static inline bool erofs_has_srcpath(struct erofs_inode *inode) +{ + return !(inode->i_srcpath[0] == '\0'); +} + static unsigned int vle_compressmeta_capacity(erofs_off_t filesize) { const unsigned int indexsize = BLK_ROUND_UP(filesize) * @@ -74,9 +80,9 @@ static void vle_write_indexes(struct z_erofs_vle_compress_ctx *ctx, if (!d1) { /* * A lcluster cannot have three parts with the middle one which - * is well-compressed for !ztailpacking cases. + * is well-compressed for !ztailpacking and !fragments cases. */ - DBG_BUGON(!raw && !cfg.c_ztailpacking); + DBG_BUGON(!raw && !cfg.c_ztailpacking && !cfg.c_fragments); type = raw ? Z_EROFS_VLE_CLUSTER_TYPE_PLAIN : Z_EROFS_VLE_CLUSTER_TYPE_HEAD; advise = cpu_to_le16(type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT); @@ -143,7 +149,7 @@ static int write_uncompressed_extent(struct z_erofs_vle_compress_ctx *ctx, unsigned int *len, char *dst) { int ret; - unsigned int count; + unsigned int count, offset, rcopied, rzeroed; /* reset clusterofs to 0 if permitted */ if (!erofs_sb_has_lz4_0padding() && ctx->clusterofs && @@ -153,11 +159,21 @@ static int write_uncompressed_extent(struct z_erofs_vle_compress_ctx *ctx, ctx->clusterofs = 0; } - /* write uncompressed data */ + /* + * write uncompressed data from clusterofs which can benefit from + * in-place I/O, loop shift right when to exceed EROFS_BLKSIZ. + */ count = min(EROFS_BLKSIZ, *len); - memcpy(dst, ctx->queue + ctx->head, count); - memset(dst + count, 0, EROFS_BLKSIZ - count); + offset = cfg.c_fragments ? 
ctx->clusterofs : 0; + rcopied = min(EROFS_BLKSIZ - offset, count); + rzeroed = EROFS_BLKSIZ - offset - rcopied; + + memcpy(dst + offset, ctx->queue + ctx->head, rcopied); + memcpy(dst, ctx->queue + ctx->head + rcopied, count - rcopied); + + memset(dst + offset + rcopied, 0, rzeroed); + memset(dst + count - rcopied, 0, EROFS_BLKSIZ - count - rzeroed); erofs_dbg("Writing %u uncompressed data to block %u", count, ctx->blkaddr); @@ -169,6 +185,8 @@ static int write_uncompressed_extent(struct z_erofs_vle_compress_ctx *ctx, static unsigned int z_erofs_get_max_pclusterblks(struct erofs_inode *inode) { + if (cfg.c_fragments && !erofs_has_srcpath(inode)) + return cfg.c_pclusterblks_frags; #ifndef NDEBUG if (cfg.c_random_pclusterblks) return 1 + rand() % cfg.c_pclusterblks_max; @@ -237,12 +255,15 @@ static int vle_compress_one(struct erofs_inode *inode, unsigned int pclustersize = z_erofs_get_max_pclusterblks(inode) * EROFS_BLKSIZ; bool may_inline = (cfg.c_ztailpacking && final); + bool to_fragments = (cfg.c_fragments && final && + erofs_has_srcpath(inode)); bool raw; if (len <= pclustersize) { if (!final) break; - if (!may_inline && len <= EROFS_BLKSIZ) + if (!may_inline && len <= EROFS_BLKSIZ && + !to_fragments) goto nocompression; } @@ -294,6 +315,15 @@ nocompression: return ret; ctx->compressedblks = 1; raw = false; + } else if (to_fragments && len == count && + ret < pclustersize) { + ret = z_erofs_fill_fragments(inode, + ctx->queue + ctx->head, + len); + if (ret < 0) + return ret; + ctx->compressedblks = 0; + raw = false; } else { unsigned int tailused, padding; @@ -546,13 +576,17 @@ static void z_erofs_write_mapheader(struct erofs_inode *inode, { struct z_erofs_map_header h = { .h_advise = cpu_to_le16(inode->z_advise), - .h_idata_size = cpu_to_le16(inode->idata_size), .h_algorithmtype = inode->z_algorithmtype[1] << 4 | inode->z_algorithmtype[0], /* lclustersize */ .h_clusterbits = inode->z_logical_clusterbits - 12, }; + if (cfg.c_fragments) + h.h_fragmentoff = 
cpu_to_le32(inode->fragmentoff); + else + h.h_idata_size = cpu_to_le16(inode->idata_size); + memset(compressmeta, 0, Z_EROFS_LEGACY_MAP_HEADER_SIZE); /* write out map header */ memcpy(compressmeta, &h, sizeof(struct z_erofs_map_header)); @@ -604,30 +638,24 @@ void z_erofs_drop_inline_pcluster(struct erofs_inode *inode) inode->eof_tailraw = NULL; } -int erofs_write_compressed_file(struct erofs_inode *inode) +int erofs_write_compressed_file_from_fd(struct erofs_inode *inode, int fd) { struct erofs_buffer_head *bh; static struct z_erofs_vle_compress_ctx ctx; erofs_off_t remaining; erofs_blk_t blkaddr, compressed_blocks; unsigned int legacymetasize; - int ret, fd; + int ret; u8 *compressmeta = malloc(vle_compressmeta_capacity(inode->i_size)); if (!compressmeta) return -ENOMEM; - fd = open(inode->i_srcpath, O_RDONLY | O_BINARY); - if (fd < 0) { - ret = -errno; - goto err_free_meta; - } - /* allocate main data buffer */ bh = erofs_balloc(DATA, 0, 0, 0); if (IS_ERR(bh)) { ret = PTR_ERR(bh); - goto err_close; + goto err_free_meta; } /* initialize per-file compression setting */ @@ -648,6 +676,9 @@ int erofs_write_compressed_file(struct erofs_inode *inode) inode->z_algorithmtype[1] = algorithmtype[1]; inode->z_logical_clusterbits = LOG_BLOCK_SIZE; + inode->idata_size = 0; + inode->fragment_size = 0; + blkaddr = erofs_mapbh(bh->block); /* start_blkaddr */ ctx.blkaddr = blkaddr; ctx.metacur = compressmeta + Z_EROFS_LEGACY_MAP_HEADER_SIZE; @@ -681,19 +712,20 @@ int erofs_write_compressed_file(struct erofs_inode *inode) vle_write_indexes_final(&ctx); legacymetasize = ctx.metacur - compressmeta; /* estimate if data compression saves space or not */ - if (compressed_blocks * EROFS_BLKSIZ + inode->idata_size + + if (!inode->fragment_size && + compressed_blocks * EROFS_BLKSIZ + inode->idata_size + legacymetasize >= inode->i_size) { ret = -ENOSPC; goto err_free_idata; } z_erofs_write_mapheader(inode, compressmeta); - close(fd); if (compressed_blocks) { ret = erofs_bh_balloon(bh, 
blknr_to_addr(compressed_blocks)); DBG_BUGON(ret != EROFS_BLKSIZ); } else { - DBG_BUGON(!inode->idata_size); + if (!cfg.c_fragments) + DBG_BUGON(!inode->idata_size); } erofs_info("compressed %s (%llu bytes) into %u blocks", @@ -716,7 +748,8 @@ int erofs_write_compressed_file(struct erofs_inode *inode) DBG_BUGON(ret); } inode->compressmeta = compressmeta; - erofs_droid_blocklist_write(inode, blkaddr, compressed_blocks); + if (erofs_has_srcpath(inode)) + erofs_droid_blocklist_write(inode, blkaddr, compressed_blocks); return 0; err_free_idata: @@ -726,8 +759,6 @@ err_free_idata: } err_bdrop: erofs_bdrop(bh, true); /* revoke buffer */ -err_close: - close(fd); err_free_meta: free(compressmeta); return ret; @@ -833,14 +864,27 @@ int z_erofs_compress_init(struct erofs_buffer_head *sb_bh) * to be loaded in order to get those compressed block counts. */ if (cfg.c_pclusterblks_max > 1) { - if (cfg.c_pclusterblks_max > - Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + if (cfg.c_pclusterblks_max > Z_EROFS_PCLUSTER_MAX_BLKS) { erofs_err("unsupported clusterblks %u (too large)", cfg.c_pclusterblks_max); return -EINVAL; } + if (cfg.c_pclusterblks_frags > Z_EROFS_PCLUSTER_MAX_BLKS) { + erofs_err("unsupported clusterblks %u (too large for fragments)", + cfg.c_pclusterblks_frags); + return -EINVAL; + } + if (cfg.c_pclusterblks_frags == 1) { + erofs_err("physical cluster size of fragments should > 4096 bytes"); + return -EINVAL; + } erofs_sb_set_big_pcluster(); } + if (!erofs_sb_has_big_pcluster() && cfg.c_pclusterblks_frags > 1) { + erofs_err("invalid clusterblks %u (for fragments)", + cfg.c_pclusterblks_frags); + return -EINVAL; + } if (ret != Z_EROFS_COMPRESSION_LZ4) erofs_sb_set_compr_cfgs(); diff --git a/lib/fragments.c b/lib/fragments.c new file mode 100644 index 0000000..e74eec2 --- /dev/null +++ b/lib/fragments.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 +/* + * Copyright (C), 2022, Coolpad Group Limited. 
+ * Created by Yue Hu + */ +#define _GNU_SOURCE +#include +#include +#include +#include "erofs/err.h" +#include "erofs/inode.h" +#include "erofs/compress.h" +#include "erofs/print.h" +#include "erofs/fragments.h" + +static FILE *fragmentsfp; + +int z_erofs_fill_fragments(struct erofs_inode *inode, void *data, + unsigned int len) +{ + inode->z_advise |= Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + inode->fragmentoff = ftell(fragmentsfp); + inode->fragment_size = len; + + if (write(fileno(fragmentsfp), data, len) < 0) + return -EIO; + + erofs_sb_set_fragments(); + + erofs_dbg("Recording %u fragment data at %lu", inode->fragment_size, + inode->fragmentoff); + return len; +} + +struct erofs_inode *erofs_mkfs_build_fragments(void) +{ + struct stat64 st; + struct erofs_inode *inode; + int ret, fd = fileno(fragmentsfp); + + ret = fstat64(fd, &st); + if (ret) + return ERR_PTR(-errno); + + inode = erofs_generate_inode(&st, NULL); + if (IS_ERR(inode)) + return inode; + + fseek(fragmentsfp, 0, SEEK_SET); + ret = erofs_write_compressed_file_from_fd(inode, fd); + if (ret) { + erofs_err("write fragments file error"); + return ERR_PTR(ret); + } + + erofs_prepare_inode_buffer(inode); + return inode; +} + +void erofs_fragments_exit(void) +{ + if (fragmentsfp) + fclose(fragmentsfp); +} + +int erofs_fragments_init(void) +{ +#ifdef HAVE_TMPFILE64 + fragmentsfp = tmpfile64(); +#else + fragmentsfp = tmpfile(); +#endif + if (!fragmentsfp) + return -ENOMEM; + return 0; +} diff --git a/lib/inode.c b/lib/inode.c index f192510..86adbc4 100644 --- a/lib/inode.c +++ b/lib/inode.c @@ -405,7 +405,11 @@ int erofs_write_file(struct erofs_inode *inode) } if (cfg.c_compr_alg_master && erofs_file_is_compressible(inode)) { - ret = erofs_write_compressed_file(inode); + fd = open(inode->i_srcpath, O_RDONLY | O_BINARY); + if (fd < 0) + return -errno; + ret = erofs_write_compressed_file_from_fd(inode, fd); + close(fd); if (!ret || ret != -ENOSPC) return ret; @@ -583,7 +587,7 @@ static int 
erofs_prepare_tail_block(struct erofs_inode *inode) return 0; } -static int erofs_prepare_inode_buffer(struct erofs_inode *inode) +int erofs_prepare_inode_buffer(struct erofs_inode *inode) { unsigned int inodesize; struct erofs_buffer_head *bh, *ibh; @@ -782,6 +786,9 @@ int erofs_droid_inode_fsconfig(struct erofs_inode *inode, const char *fspath; char *decorated = NULL; + if (!path) + return 0; + inode->capabilities = 0; if (!cfg.fs_config_file && !cfg.mount_point) return 0; @@ -868,14 +875,18 @@ static int erofs_fill_inode(struct erofs_inode *inode, return -EINVAL; } - strncpy(inode->i_srcpath, path, sizeof(inode->i_srcpath) - 1); - inode->i_srcpath[sizeof(inode->i_srcpath) - 1] = '\0'; + if (path) { + strncpy(inode->i_srcpath, path, sizeof(inode->i_srcpath) - 1); + inode->i_srcpath[sizeof(inode->i_srcpath) - 1] = '\0'; + } else { + inode->i_srcpath[0] = '\0'; + } inode->dev = st->st_dev; inode->i_ino[1] = st->st_ino; if (erofs_should_use_inode_extended(inode)) { - if (cfg.c_force_inodeversion == FORCE_INODE_COMPACT) { + if (path && cfg.c_force_inodeversion == FORCE_INODE_COMPACT) { erofs_err("file %s cannot be in compact form", inode->i_srcpath); return -EINVAL; @@ -907,6 +918,23 @@ static struct erofs_inode *erofs_new_inode(void) return inode; } +struct erofs_inode *erofs_generate_inode(struct stat64 *st, const char *path) +{ + struct erofs_inode *inode; + int ret; + + inode = erofs_new_inode(); + if (IS_ERR(inode)) + return inode; + + ret = erofs_fill_inode(inode, st, path); + if (ret) { + free(inode); + return ERR_PTR(ret); + } + return inode; +} + /* get the inode from the (source) path */ static struct erofs_inode *erofs_iget_from_path(const char *path, bool is_src) { @@ -934,17 +962,7 @@ static struct erofs_inode *erofs_iget_from_path(const char *path, bool is_src) } /* cannot find in the inode cache */ - inode = erofs_new_inode(); - if (IS_ERR(inode)) - return inode; - - ret = erofs_fill_inode(inode, &st, path); - if (ret) { - free(inode); - return 
ERR_PTR(ret); - } - - return inode; + return erofs_generate_inode(&st, path); } static void erofs_fixup_meta_blkaddr(struct erofs_inode *rootdir) diff --git a/lib/super.c b/lib/super.c index f486eb7..e1cf614 100644 --- a/lib/super.c +++ b/lib/super.c @@ -100,6 +100,7 @@ int erofs_read_superblock(void) sbi.xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi.islotbits = EROFS_ISLOTBITS; sbi.root_nid = le16_to_cpu(dsb->root_nid); + sbi.fragments_nid = le64_to_cpu(dsb->fragments_nid); sbi.inos = le64_to_cpu(dsb->inos); sbi.checksum = le32_to_cpu(dsb->checksum); diff --git a/mkfs/main.c b/mkfs/main.c index d2c9830..227a9be 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -23,6 +23,7 @@ #include "erofs/block_list.h" #include "erofs/compress_hints.h" #include "erofs/blobchunk.h" +#include "erofs/fragments.h" #include "../lib/liberofs_private.h" #ifdef HAVE_LIBUUID @@ -129,9 +130,9 @@ static int parse_extended_opts(const char *opts) const char *p = strchr(token, ','); next = NULL; - if (p) + if (p) { next = p + 1; - else { + } else { p = token + strlen(token); next = p; } @@ -198,7 +199,34 @@ static int parse_extended_opts(const char *opts) return -EINVAL; cfg.c_ztailpacking = true; } + + if (MATCH_EXTENTED_OPT("fragments", token, keylen)) { + char *endptr; + u64 i; + + if (vallen || cfg.c_ztailpacking) + return -EINVAL; + cfg.c_fragments = true; + + i = strtoull(next, &endptr, 0); + if (i == 0 || (*endptr != ',' && *endptr != '\0')) { + cfg.c_pclusterblks_frags = 1; + continue; + } + if (i % EROFS_BLKSIZ) { + erofs_err("invalid physical clustersize %llu", + i); + return -EINVAL; + } + cfg.c_pclusterblks_frags = i / EROFS_BLKSIZ; + + if (*endptr == ',') + next = strchr(next, ',') + 1; + else + goto out; + } } +out: return 0; } @@ -438,7 +466,8 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) int erofs_mkfs_update_super_block(struct erofs_buffer_head *bh, erofs_nid_t root_nid, - erofs_blk_t *blocks) + erofs_blk_t *blocks, + erofs_nid_t fragments_nid) { struct 
erofs_super_block sb = { .magic = cpu_to_le32(EROFS_SUPER_MAGIC_V1), @@ -462,6 +491,7 @@ int erofs_mkfs_update_super_block(struct erofs_buffer_head *bh, *blocks = erofs_mapbh(NULL); sb.blocks = cpu_to_le32(*blocks); sb.root_nid = cpu_to_le16(root_nid); + sb.fragments_nid = cpu_to_le64(fragments_nid); memcpy(sb.uuid, sbi.uuid, sizeof(sb.uuid)); if (erofs_sb_has_compr_cfgs()) @@ -579,8 +609,8 @@ int main(int argc, char **argv) { int err = 0; struct erofs_buffer_head *sb_bh; - struct erofs_inode *root_inode; - erofs_nid_t root_nid; + struct erofs_inode *root_inode, *fragments_inode; + erofs_nid_t root_nid, fragments_nid; struct stat64 st; erofs_blk_t nblocks; struct timeval t; @@ -650,6 +680,14 @@ int main(int argc, char **argv) erofs_warn("EXPERIMENTAL chunked file feature in use. Use at your own risk!"); if (cfg.c_ztailpacking) erofs_warn("EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); + if (cfg.c_fragments) { + err = erofs_fragments_init(); + if (err) { + erofs_err("failed to initialize fragments"); + return 1; + } + erofs_warn("EXPERIMENTAL compressed fragments feature in use. 
Use at your own risk!"); + } erofs_set_fs_root(cfg.c_src_path); #ifndef NDEBUG if (cfg.c_random_pclusterblks) @@ -719,7 +757,18 @@ int main(int argc, char **argv) goto exit; } - err = erofs_mkfs_update_super_block(sb_bh, root_nid, &nblocks); + fragments_nid = 0; + if (cfg.c_fragments) { + fragments_inode = erofs_mkfs_build_fragments(); + if (IS_ERR(fragments_inode)) { + err = PTR_ERR(fragments_inode); + goto exit; + } + fragments_nid = erofs_lookupnid(fragments_inode); + } + + err = erofs_mkfs_update_super_block(sb_bh, root_nid, &nblocks, + fragments_nid); if (err) goto exit; @@ -741,6 +790,8 @@ exit: erofs_cleanup_exclude_rules(); if (cfg.c_chunkbits) erofs_blob_exit(); + if (cfg.c_fragments) + erofs_fragments_exit(); erofs_exit_configure(); if (err) { -- 2.17.1 From hsiangkao at linux.alibaba.com Mon Jul 11 19:51:48 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Mon, 11 Jul 2022 17:51:48 +0800 Subject: [PATCH] erofs-utils: fuse: support offset when read image In-Reply-To: <20220711085459.19730-1-lihe@uniontech.com> References: <20220711085459.19730-1-lihe@uniontech.com> Message-ID: On Mon, Jul 11, 2022 at 04:54:59PM +0800, Li He wrote: > Add --offset to erofsfuse to skip bytes at the start of the image file. > > Signed-off-by: Li He > --- > We should add offset to fuse_operations so erofsfuse can parse it from > command line. fuse_opt_parse can not parse cfg directly. Oh, I forgot that, looks good to me now! Reviewed-by: Gao Xiang Thanks, Gao Xiang From hsiangkao at linux.alibaba.com Mon Jul 11 20:02:43 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Mon, 11 Jul 2022 18:02:43 +0800 Subject: [RFC PATCH 0/3] erofs-utils: compressed fragments feature In-Reply-To: References: Message-ID: Hi Yue, On Mon, Jul 11, 2022 at 05:09:55PM +0800, Yue Hu wrote: > In order to achieve greater compression ratio, let's introduce > compressed fragments feature which can merge tail of per-file or the > whole files into one special inode to reach the target. 
> > And we can also set pcluster size to fragments inode for different > compression requirements. > > In this patchset, we also improve the uncompressed data layout of > compressed files. Just write it from 'clusterofs' instead of 0 since it > can benefit from in-place I/O. For now, it only goes with fragments. > > The main idea above is from Xiang. > I just took a preliminary try and it seems work, but in order to form it better, I have to postpone it for the next version of 5.20 (6.0?) since I'm still working on cleaning up and rolling hash deduplication. Thanks, Gao Xiang From zbestahu at gmail.com Mon Jul 11 20:14:30 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 11 Jul 2022 18:14:30 +0800 Subject: [RFC PATCH 0/3] erofs-utils: compressed fragments feature In-Reply-To: References: Message-ID: <20220711181430.000006bd.zbestahu@gmail.com> Hi Xiang, On Mon, 11 Jul 2022 18:02:43 +0800 Gao Xiang wrote: > Hi Yue, > > On Mon, Jul 11, 2022 at 05:09:55PM +0800, Yue Hu wrote: > > In order to achieve greater compression ratio, let's introduce > > compressed fragments feature which can merge tail of per-file or the > > whole files into one special inode to reach the target. > > > > And we can also set pcluster size to fragments inode for different > > compression requirements. > > > > In this patchset, we also improve the uncompressed data layout of > > compressed files. Just write it from 'clusterofs' instead of 0 since it > > can benefit from in-place I/O. For now, it only goes with fragments. > > > > The main idea above is from Xiang. > > > > I just took a preliminary try and it seems work, but in order to form it > better, I have to postpone it for the next version of 5.20 (6.0?) > since I'm still working on cleaning up and rolling hash deduplication. Got it. Thanks. 
> > Thanks, > Gao Xiang From contactorutempresacl066601 at sending.contact Wed Jul 13 16:38:33 2022 From: contactorutempresacl066601 at sending.contact (=?UTF-8?Q?=E2=9C=85_?=Agencia Tributaria) Date: Wed, 13 Jul 2022 06:38:33 +0000 Subject: En relacion con el Impuesto sobre Transacciones Financieras - Protocolo: JGTAMOK3JH Message-ID: <094b43a918f1bd898013fd72570e9b1d@localhost.localdomain> ssssssssssssssssssssssssssssssssssaaaaaaaaaaaaaaaaa -------------- next part -------------- An HTML attachment was scrubbed... URL: From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:38 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:38 +0800 Subject: [PATCH 03/16] erofs: introduce `z_erofs_parse_out_bvecs()' In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-4-hsiangkao@linux.alibaba.com> `z_erofs_decompress_pcluster()' is too long therefore it'd be better to introduce another helper to parse decompressed pages (or later, decompressed bvecs.) BTW, since `decompressed_bvecs' is too long as a part of the function name, `out_bvecs' is used instead. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 81 +++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c7be447ac64d..c183cd0bc42b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -778,18 +778,58 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } +static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, + struct page **pages, struct page **pagepool) +{ + struct z_erofs_pagevec_ctor ctor; + enum z_erofs_page_type page_type; + int i, err = 0; + + z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, + pcl->pagevec, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct page *page = z_erofs_pagevec_dequeue(&ctor, &page_type); + unsigned int pagenr; + + /* all pages in pagevec ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + if (z_erofs_put_shortlivedpage(pagepool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= pcl->nr_pages); + /* + * currently EROFS doesn't support multiref(dedup), + * so here erroring out one multiref page. 
+ */ + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + } + z_erofs_pagevec_ctor_exit(&ctor, true); + return err; +} + static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct z_erofs_pagevec_ctor ctor; unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; struct page **pages, **compressed_pages, *page; - enum z_erofs_page_type page_type; bool overlapped, partial; int err; @@ -823,42 +863,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, for (i = 0; i < nr_pages; ++i) pages[i] = NULL; - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); - - for (i = 0; i < pcl->vcnt; ++i) { - unsigned int pagenr; - - page = z_erofs_pagevec_dequeue(&ctor, &page_type); - - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; - - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. 
- */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; - } - z_erofs_pagevec_ctor_exit(&ctor, true); + err = z_erofs_parse_out_bvecs(pcl, pages, pagepool); overlapped = false; compressed_pages = pcl->compressed_pages; -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:43 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:43 +0800 Subject: [PATCH 08/16] erofs: rework online page handling In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-9-hsiangkao@linux.alibaba.com> Since all decompressed offsets have been integrated to bvecs[], this patch avoids all sub-indexes so that page->private only includes a part count and an eio flag, thus in the future folio->private can have the same meaning. In addition, PG_error will not be used anymore after this patch and we're heading to use page->private (later folio->private) and page->mapping (later folio->mapping) only. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 51 ++++++++++++++---------------------- fs/erofs/zdata.h | 68 ++++++++++++++---------------------------------- 2 files changed, 38 insertions(+), 81 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f2e3f07baad7..9065e160d6a6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -743,7 +743,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) - goto err_out; + goto out; } else { if (fe->pcl) goto hitted; @@ -755,7 +755,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; @@ -766,7 +766,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, err = PTR_ERR(mp); erofs_err(inode->i_sb, "failed to get inline page, err %d", err); - goto err_out; + goto out; } get_page(fe->map.buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, @@ -823,16 +823,15 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, if (err) { DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); - goto err_out; + goto out; } - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); - + z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; + /* also update nr_pages */ + index = page->index - (map->m_la >> PAGE_SHIFT); fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); next_part: /* can be used for verification */ @@ -843,16 +842,13 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing 
this page */ -err_out: - SetPageError(page); - goto out; } static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, @@ -901,7 +897,7 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, */ if (pages[pagenr]) { DBG_BUGON(1); - SetPageError(pages[pagenr]); + z_erofs_page_mark_eio(pages[pagenr]); z_erofs_onlinepage_endio(pages[pagenr]); err = -EFSCORRUPTED; } @@ -957,19 +953,13 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, DBG_BUGON(pgnr >= pcl->nr_pages); if (pages[pgnr]) { DBG_BUGON(1); - SetPageError(pages[pgnr]); + z_erofs_page_mark_eio(pages[pgnr]); z_erofs_onlinepage_endio(pages[pgnr]); err = -EFSCORRUPTED; } pages[pgnr] = page; *overlapped = true; } - - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } } if (err) { @@ -981,7 +971,7 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, - struct page **pagepool) + struct page **pagepool, int err) { struct erofs_sb_info *const sbi = EROFS_SB(sb); unsigned int pclusterpages = z_erofs_pclusterpages(pcl); @@ -990,7 +980,6 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, struct page **pages, **compressed_pages, *page; bool overlapped, partial; - int err; might_sleep(); DBG_BUGON(!READ_ONCE(pcl->nr_pages)); @@ -1090,10 +1079,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } @@ -1129,7 +1116,8 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + 
z_erofs_decompress_pcluster(io->sb, pcl, pagepool, + io->eio ? -EIO : 0); erofs_workgroup_put(&pcl->obj); } } @@ -1233,7 +1221,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, if (page->mapping == mc) { WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for @@ -1305,6 +1292,7 @@ jobqueue_init(struct super_block *sb, q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1365,15 +1353,14 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (err) - SetPageError(page); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } + if (err) + q->eio = true; z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index a70f1b73e901..852da31e2e91 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -109,6 +109,8 @@ struct z_erofs_decompressqueue { struct completion done; struct work_struct work; } u; + + bool eio; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) @@ -123,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return pcl->pclusterpages; } -#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 -#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) - /* - * waiters (aka. 
ongoing_packs): # to unlock the page - * sub-index: 0 - for partial page, >= 1 full page sub-index + * bit 31: I/O error occurred on this page + * bit 0 - 30: remaining parts to complete this page */ -typedef atomic_t z_erofs_onlinepage_t; - -/* type punning */ -union z_erofs_onlinepage_converter { - z_erofs_onlinepage_t *o; - unsigned long *v; -}; - -static inline unsigned int z_erofs_onlinepage_index(struct page *page) -{ - union z_erofs_onlinepage_converter u; - - DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; -} +#define Z_EROFS_PAGE_EIO (1 << 31) static inline void z_erofs_onlinepage_init(struct page *page) { union { - z_erofs_onlinepage_t o; + atomic_t o; unsigned long v; - /* keep from being unlocked in advance */ } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); @@ -162,45 +143,34 @@ static inline void z_erofs_onlinepage_init(struct page *page) SetPagePrivate(page); } -static inline void z_erofs_onlinepage_fixup(struct page *page, - uintptr_t index, bool down) +static inline void z_erofs_onlinepage_split(struct page *page) { - union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; - int orig, orig_index, val; - -repeat: - orig = atomic_read(u.o); - orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (orig_index) { - if (!index) - return; + atomic_inc((atomic_t *)&page->private); +} - DBG_BUGON(orig_index != index); - } +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; - val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (atomic_cmpxchg(u.o, orig, val) != orig) - goto repeat; + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); } static inline void z_erofs_onlinepage_endio(struct page *page) { - union z_erofs_onlinepage_converter u; unsigned int v; 
DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - v = atomic_dec_return(u.o); - if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); - if (!PageError(page)) + if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } - erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); } #define Z_EROFS_VMAP_ONSTACK_PAGES \ -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:35 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:35 +0800 Subject: [PATCH 00/16] erofs: prepare for folios, duplication and kill PG_error Message-ID: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Hi folks, I've been doing this for almost 2 months, the main point of this is to support large folios and rolling hash duplication for compressed data. This patchset is as a start of this work targeting for the next 5.20, it introduces a flexable range representation for (de)compressed buffers instead of too relying on page(s) directly themselves, so large folios can laterly base on this work. Also, this patchset gets rid of all PG_error flags in the decompression code. It's a cleanup as a result as well. In addition, this patchset kicks off rolling hash duplication for compressed data by introducing fully-referenced multi-reference pclusters first instead of reporting fs corruption if one pcluster is introduced by several differnt extents. The full implementation is expected to be finished in the merge window after the next. One of my colleagues is actively working on the userspace part of this feature. 
However, it's still easy to verify fully-referenced multi-reference pcluster by constructing some image by hand (see attachment): Dataset: 300M seq-read (data-duplicated, read_ahead_kb 8192): 1095MiB/s seq-read (data-duplicated, read_ahead_kb 4096): 771MiB/s seq-read (data-duplicated, read_ahead_kb 512): 577MiB/s seq-read (vanilla, read_ahead_kb 8192): 364MiB/s Finally, this patchset survives ro-fsstress on my side. Thanks, Gao Xiang Gao Xiang (16): erofs: get rid of unneeded `inode', `map' and `sb' erofs: clean up z_erofs_collector_begin() erofs: introduce `z_erofs_parse_out_bvecs()' erofs: introduce bufvec to store decompressed buffers erofs: drop the old pagevec approach erofs: introduce `z_erofs_parse_in_bvecs' erofs: switch compressed_pages[] to bufvec erofs: rework online page handling erofs: get rid of `enum z_erofs_page_type' erofs: clean up `enum z_erofs_collectmode' erofs: get rid of `z_pagemap_global' erofs: introduce struct z_erofs_decompress_backend erofs: try to leave (de)compressed_pages on stack if possible erofs: introduce z_erofs_do_decompressed_bvec() erofs: record the longest decompressed size in this round erofs: introduce multi-reference pclusters (fully-referenced) fs/erofs/compress.h | 2 +- fs/erofs/decompressor.c | 2 +- fs/erofs/zdata.c | 777 ++++++++++++++++++++++------------------ fs/erofs/zdata.h | 119 +++--- fs/erofs/zpvec.h | 159 -------- 5 files changed, 490 insertions(+), 569 deletions(-) delete mode 100644 fs/erofs/zpvec.h -- From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:36 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:36 +0800 Subject: [PATCH 01/16] erofs: get rid of unneeded `inode', `map' and `sb' In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-2-hsiangkao@linux.alibaba.com> Since commit 5c6dcc57e2e5 ("erofs: get rid of `struct z_erofs_collector'"), these arguments can be 
dropped as well. No logic changes. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 724bb57075f6..1b6816dd235f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -404,10 +404,9 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) f->mode = COLLECT_PRIMARY; } -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; struct z_erofs_pcluster *pcl = fe->pcl; unsigned int length; @@ -449,10 +448,9 @@ static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, return 0; } -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; @@ -494,7 +492,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; @@ -520,10 +518,9 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; struct erofs_workgroup *grp; int ret; @@ -541,19 +538,19 @@ static int 
z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, goto tailpacking; } - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(fe->inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { tailpacking: - ret = z_erofs_register_pcluster(fe, inode, map); + ret = z_erofs_register_pcluster(fe); if (!ret) goto out; if (ret != -EEXIST) return ret; } - ret = z_erofs_lookup_pcluster(fe, inode, map); + ret = z_erofs_lookup_pcluster(fe); if (ret) { erofs_workgroup_put(&fe->pcl->obj); return ret; @@ -663,7 +660,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(fe, inode, map); + err = z_erofs_collector_begin(fe); if (err) goto err_out; @@ -1259,13 +1256,13 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) bio_put(bio); } -static void z_erofs_submit_queue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; @@ -1317,7 +1314,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi)); + mc); if (!page) continue; @@ -1369,15 +1366,14 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); } -static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, struct page 
**pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1475,7 +1471,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) @@ -1524,7 +1520,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:40 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:40 +0800 Subject: [PATCH 05/16] erofs: drop the old pagevec approach In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-6-hsiangkao@linux.alibaba.com> Remove the old pagevec approach but keep z_erofs_page_type for now. It will be reworked in the following commits as well. Also rename Z_EROFS_NR_INLINE_PAGEVECS as Z_EROFS_INLINE_BVECS with the new value 2 since it's actually enough to bootstrap. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 17 +++-- fs/erofs/zdata.h | 9 +-- fs/erofs/zpvec.h | 159 ----------------------------------------------- 3 files changed, 16 insertions(+), 169 deletions(-) delete mode 100644 fs/erofs/zpvec.h diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f52c54058f31..e96704db106e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -27,6 +27,17 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +/* page type in pagevec for decompress subsystem */ +enum z_erofs_page_type { + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ + Z_EROFS_PAGE_TYPE_EXCLUSIVE, + + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, + + Z_EROFS_VLE_PAGE_TYPE_HEAD, + Z_EROFS_VLE_PAGE_TYPE_MAX +}; + struct z_erofs_bvec_iter { struct page *bvpage; struct z_erofs_bvset *bvset; @@ -273,7 +284,6 @@ struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; - struct z_erofs_pagevec_ctor vector; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; @@ -636,7 +646,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) return ret; } z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, - Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->vcnt); + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -871,8 +881,7 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, struct page *old_bvpage; int i, err = 0; - z_erofs_bvec_iter_begin(&biter, &pcl->bvset, - Z_EROFS_NR_INLINE_PAGEVECS, 0); + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; unsigned int pagenr; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index d03e333e4fde..a755c5a44d87 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -7,10 
+7,10 @@ #define __EROFS_FS_ZDATA_H #include "internal.h" -#include "zpvec.h" +#include "tagptr.h" #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_NR_INLINE_PAGEVECS 3 +#define Z_EROFS_INLINE_BVECS 2 #define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 #define Z_EROFS_PCLUSTER_LENGTH_BIT 1 @@ -34,7 +34,7 @@ struct name { \ struct z_erofs_bvec bvec[total]; \ }; __Z_EROFS_BVSET(z_erofs_bvset,) -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_NR_INLINE_PAGEVECS) +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS) /* * Structure fields follow one of the following exclusion rules. @@ -69,9 +69,6 @@ struct z_erofs_pcluster { unsigned short nr_pages; union { - /* L: inline a certain number of pagevecs for bootstrap */ - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; - /* L: inline a certain number of bvec for bootstrap */ struct z_erofs_bvset_inline bvset; diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h deleted file mode 100644 index b05464f4a808..000000000000 --- a/fs/erofs/zpvec.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. 
- * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZPVEC_H -#define __EROFS_FS_ZPVEC_H - -#include "tagptr.h" - -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") - __bad_page_type_exclusive(void); - -/* pagevec tagged pointer */ -typedef tagptr2_t erofs_vtptr_t; - -/* pagevec collector */ -struct z_erofs_pagevec_ctor { - struct page *curr, *next; - erofs_vtptr_t *pages; - - unsigned int nr, index; -}; - -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - if (!ctor->curr) - return; - - if (atomic) - kunmap_atomic(ctor->pages); - else - kunmap(ctor->curr); -} - -static inline struct page * -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr) -{ - unsigned int index; - - /* keep away from occupied pages */ - if (ctor->next) - return ctor->next; - - for (index = 0; index < nr; ++index) { - const erofs_vtptr_t t = ctor->pages[index]; - const unsigned int tags = tagptr_unfold_tags(t); - - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) - return tagptr_unfold_ptr(t); - } - DBG_BUGON(nr >= ctor->nr); - return NULL; -} - -static inline void -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); - - z_erofs_pagevec_ctor_exit(ctor, atomic); - - ctor->curr = next; - ctor->next = NULL; - ctor->pages = atomic ? 
- kmap_atomic(ctor->curr) : kmap(ctor->curr); - - ctor->nr = PAGE_SIZE / sizeof(struct page *); - ctor->index = 0; -} - -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr, - erofs_vtptr_t *pages, - unsigned int i) -{ - ctor->nr = nr; - ctor->curr = ctor->next = NULL; - ctor->pages = pages; - - if (i >= nr) { - i -= nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - while (i > ctor->nr) { - i -= ctor->nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - } - } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); - ctor->index = i; -} - -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool pvec_safereuse) -{ - if (!ctor->next) { - /* some pages cannot be reused as pvec safely without I/O */ - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; - - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && - ctor->index + 1 == ctor->nr) - return false; - } - - if (ctor->index >= ctor->nr) - z_erofs_pagevec_ctor_pagedown(ctor, false); - - /* exclusive page type must be 0 */ - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) - __bad_page_type_exclusive(); - - /* should remind that collector->next never equal to 1, 2 */ - if (type == (uintptr_t)ctor->next) { - ctor->next = page; - } - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); - return true; -} - -static inline struct page * -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) -{ - erofs_vtptr_t t; - - if (ctor->index >= ctor->nr) { - DBG_BUGON(!ctor->next); - z_erofs_pagevec_ctor_pagedown(ctor, true); - } - - t = ctor->pages[ctor->index]; - - *type = tagptr_unfold_tags(t); - - /* should remind that collector->next never equal to 1, 2 */ - if (*type == (uintptr_t)ctor->next) - ctor->next = tagptr_unfold_ptr(t); - - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); - 
return tagptr_unfold_ptr(t); -} -#endif -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:46 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:46 +0800 Subject: [PATCH 11/16] erofs: get rid of `z_pagemap_global' In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-12-hsiangkao@linux.alibaba.com> In order to introduce multi-reference pclusters for compressed data deduplication, let's get rid of the global page array for now since it needs to be re-designed then at least. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 28 ++++------------------------ fs/erofs/zdata.h | 1 - 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 55bcd6e5ae9a..f24b866bc975 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -291,9 +291,6 @@ struct z_erofs_decompress_frontend { .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); - static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, enum z_erofs_cache_alloctype type, struct page **pagepool) @@ -966,26 +963,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, mutex_lock(&pcl->lock); nr_pages = pcl->nr_pages; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { + if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; - - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; - + else pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); - - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - 
mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } - } + GFP_KERNEL | __GFP_NOFAIL); for (i = 0; i < nr_pages; ++i) pages[i] = NULL; @@ -1063,9 +1045,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) + if (pages != pages_onstack) kvfree(pages); pcl->nr_pages = 0; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 852da31e2e91..5964c942799e 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -175,6 +175,5 @@ static inline void z_erofs_onlinepage_endio(struct page *page) #define Z_EROFS_VMAP_ONSTACK_PAGES \ min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) -#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 #endif -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:39 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:39 +0800 Subject: [PATCH 04/16] erofs: introduce bufvec to store decompressed buffers In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-5-hsiangkao@linux.alibaba.com> For each pcluster, the total compressed buffers are determined in advance, yet the number of decompressed buffers actually vary. Too many decompressed pages can be recorded if one pcluster is highly compressed or its pcluster size is large. That takes extra memory footprints compared to uncompressed filesystems, especially a lot of I/O in flight on low-ended devices. Therefore, similar to inplace I/O, pagevec was introduced to reuse page cache to store these pointers in the time-sharing way since these pages are actually unused before decompressing. 
In order to make it more flexable, a cleaner bufvec is used to replace the old pagevec stuffs so that - Decompressed offsets can be stored inline, thus it can be used for the upcoming feature like compressed data deduplication; - Towards supporting large folios for compressed inodes since our final goal is to completely avoid page->private but use folio->private only for all page cache pages. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 177 +++++++++++++++++++++++++++++++++++------------ fs/erofs/zdata.h | 26 +++++-- 2 files changed, 153 insertions(+), 50 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c183cd0bc42b..f52c54058f31 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { 
+ .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + static void z_erofs_destroy_pcluster_pool(void) { int i; @@ -195,9 +272,10 @@ enum z_erofs_collectmode { struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; - + struct z_erofs_bvec_iter biter; struct z_erofs_pagevec_ctor vector; + struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; /* a pointer used to pick up inplace I/O pages */ struct page **icpage_ptr; @@ -358,21 +436,24 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct page *page, enum z_erofs_page_type type, - bool pvec_safereuse) + struct z_erofs_bvec *bvec, + enum z_erofs_page_type type) { int ret; - /* give priority for inplaceio */ if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(fe, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, - pvec_safereuse); - fe->pcl->vcnt += (unsigned int)ret; - return ret ? 
0 : -EAGAIN; + type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec->page)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) @@ -554,9 +635,8 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } - - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->pcl->pagevec, fe->pcl->vcnt); + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -588,9 +668,14 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&fe->vector, false); + z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } + /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
@@ -712,22 +797,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); retry: - err = z_erofs_attach_page(fe, page, page_type, - fe->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional short-lived page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - alloc_page(GFP_NOFS | __GFP_NOFAIL); - - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(fe, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), page_type); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); goto err_out; + } index = page->index - (map->m_la >> PAGE_SHIFT); @@ -781,29 +867,24 @@ static bool z_erofs_page_is_invalidated(struct page *page) static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, struct page **pages, struct page **pagepool) { - struct z_erofs_pagevec_ctor ctor; - enum z_erofs_page_type page_type; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; int i, err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, + Z_EROFS_NR_INLINE_PAGEVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { - struct page *page = z_erofs_pagevec_dequeue(&ctor, &page_type); + struct z_erofs_bvec bvec; unsigned int pagenr; - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - if 
(page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); + if (old_bvpage) + z_erofs_put_shortlivedpage(pagepool, old_bvpage); + pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pagenr >= pcl->nr_pages); + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); /* * currently EROFS doesn't support multiref(dedup), * so here erroring out one multiref page. @@ -814,9 +895,12 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, z_erofs_onlinepage_endio(pages[pagenr]); err = -EFSCORRUPTED; } - pages[pagenr] = page; + pages[pagenr] = bvec.page; } - z_erofs_pagevec_ctor_exit(&ctor, true); + + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(pagepool, old_bvpage); return err; } @@ -986,6 +1070,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, kvfree(pages); pcl->nr_pages = 0; + pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 58053bb5066f..d03e333e4fde 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -21,6 +21,21 @@ */ typedef void *z_erofs_next_pcluster_t; +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_NR_INLINE_PAGEVECS); + /* * Structure fields follow one of the following exclusion rules. 
* @@ -41,22 +56,25 @@ struct z_erofs_pcluster { /* A: lower limit of decompressed length and if full length or not */ unsigned int length; + /* L: total number of bvecs */ + unsigned int vcnt; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in pagevec[] */ + /* L: maximum relative page index in bvecs */ unsigned short nr_pages; - /* L: total number of pages in pagevec[] */ - unsigned int vcnt; - union { /* L: inline a certain number of pagevecs for bootstrap */ erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:50 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:50 +0800 Subject: [PATCH 15/16] erofs: record the longest decompressed size in this round In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-16-hsiangkao@linux.alibaba.com> Currently, `pcl->length' records the longest decompressed length as long as the pcluster itself isn't reclaimed. However, such number is unneeded for the general cases since it doesn't indicate the exact decompressed size in this round. Instead, let's record the decompressed size for this round instead, thus `pcl->nr_pages' can be completely dropped and pageofs_out is also designed to be kept in sync with `pcl->length'. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 76 +++++++++++++++++------------------------------- fs/erofs/zdata.h | 11 +++---- 2 files changed, 30 insertions(+), 57 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 391755dafecd..8dcfc2a9704e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -482,7 +482,6 @@ static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct z_erofs_pcluster *pcl = fe->pcl; - unsigned int length; /* to avoid unexpected loop formed by corrupted images */ if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { @@ -495,24 +494,6 @@ static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) return -EFSCORRUPTED; } - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -543,9 +524,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) atomic_set(&pcl->obj.refcount, 1); pcl->algorithmformat = map->m_algorithmformat; - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; @@ -703,7 +683,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -806,12 +786,17 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - index = page->index - (map->m_la >> PAGE_SHIFT); - fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) @@ -858,7 +843,7 @@ struct z_erofs_decompress_backend { struct page **compressed_pages; struct page **pagepool; - unsigned int onstack_used; + unsigned int onstack_used, nr_pages; }; static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, @@ -867,7 +852,7 @@ static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, unsigned int pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; struct page *oldpage; - DBG_BUGON(pgnr >= be->pcl->nr_pages); + DBG_BUGON(pgnr >= be->nr_pages); oldpage = be->decompressed_pages[pgnr]; be->decompressed_pages[pgnr] = bvec->page; @@ -955,23 +940,22 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct 
erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - unsigned int i, inputsize, outputsize, llen, nr_pages, err2; + unsigned int i, inputsize, err2; struct page *page; - bool overlapped, partial; + bool overlapped; - DBG_BUGON(!READ_ONCE(pcl->nr_pages)); mutex_lock(&pcl->lock); - nr_pages = pcl->nr_pages; + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; /* allocate (de)compressed page arrays if cannot be kept on stack */ be->decompressed_pages = NULL; be->compressed_pages = NULL; be->onstack_used = 0; - if (nr_pages <= Z_EROFS_ONSTACK_PAGES) { + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { be->decompressed_pages = be->onstack_pages; - be->onstack_used = nr_pages; + be->onstack_used = be->nr_pages; memset(be->decompressed_pages, 0, - sizeof(struct page *) * nr_pages); + sizeof(struct page *) * be->nr_pages); } if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) @@ -979,7 +963,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (!be->decompressed_pages) be->decompressed_pages = - kvcalloc(nr_pages, sizeof(struct page *), + kvcalloc(be->nr_pages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); if (!be->compressed_pages) be->compressed_pages = @@ -993,15 +977,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { - outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; - partial = true; - } - if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else @@ -1014,10 +989,10 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, - 
.outputsize = outputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial + .partial_decoding = pcl->partial, }, be->pagepool); out: @@ -1042,7 +1017,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) kvfree(be->compressed_pages); - for (i = 0; i < nr_pages; ++i) { + for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; if (!page) continue; @@ -1060,7 +1035,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (be->decompressed_pages != be->onstack_pages) kvfree(be->decompressed_pages); - pcl->nr_pages = 0; + pcl->length = 0; + pcl->partial = true; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index ec09ca035fbb..a7fd44d21d9e 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -12,9 +12,6 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 - /* * let's leave a type here in case of introducing * another tagged pointer later. 
@@ -53,7 +50,7 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: lower limit of decompressed length and if full length or not */ + /* L: the maximum decompression size of this round */ unsigned int length; /* L: total number of bvecs */ @@ -65,9 +62,6 @@ struct z_erofs_pcluster { /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in bvecs */ - unsigned short nr_pages; - union { /* L: inline a certain number of bvec for bootstrap */ struct z_erofs_bvset_inline bvset; @@ -87,6 +81,9 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; + /* L: whether partial decompression or not */ + bool partial; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:48 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:48 +0800 Subject: [PATCH 13/16] erofs: try to leave (de)compressed_pages on stack if possible In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-14-hsiangkao@linux.alibaba.com> For the most cases, small pclusters can be decompressed with page arrays on stack. Try to leave both (de)compressed_pages on stack if possible as before. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 7aea6bb1e018..4093d8a4ce93 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -858,6 +858,7 @@ struct z_erofs_decompress_backend { struct page **compressed_pages; struct page **pagepool; + unsigned int onstack_used; }; static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) @@ -904,14 +905,9 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct page **compressed_pages; int i, err = 0; - /* XXX: will have a better approach in the following commits */ - compressed_pages = kmalloc_array(pclusterpages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); *overlapped = false; - for (i = 0; i < pclusterpages; ++i) { struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; @@ -922,7 +918,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, DBG_BUGON(1); continue; } - compressed_pages[i] = page; + be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) @@ -953,11 +949,8 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, } } - if (err) { - kfree(compressed_pages); + if (err) return err; - } - be->compressed_pages = compressed_pages; return 0; } @@ -975,15 +968,28 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, mutex_lock(&pcl->lock); nr_pages = pcl->nr_pages; + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; if (nr_pages <= Z_EROFS_ONSTACK_PAGES) { be->decompressed_pages = be->onstack_pages; + be->onstack_used = nr_pages; memset(be->decompressed_pages, 0, sizeof(struct page *) * 
nr_pages); - } else { + } + + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) be->decompressed_pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); - } + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); err = z_erofs_parse_out_bvecs(be); err2 = z_erofs_parse_in_bvecs(be, &overlapped); @@ -1037,7 +1043,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } - kfree(be->compressed_pages); + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); for (i = 0; i < nr_pages; ++i) { page = be->decompressed_pages[i]; -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:41 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:41 +0800 Subject: [PATCH 06/16] erofs: introduce `z_erofs_parse_in_bvecs' In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-7-hsiangkao@linux.alibaba.com> `z_erofs_decompress_pcluster()' is too long therefore it'd be better to introduce another helper to parse compressed pages (or later, compressed bvecs.) BTW, since `compressed_bvecs' is too long as a part of the function name, `in_bvecs' is used here instead. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 132 ++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 52 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index e96704db106e..757d352bc2c7 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -913,6 +913,76 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, return err; } +static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl, struct page **pages, + struct page **pagepool, bool *overlapped) +{ + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + struct page **compressed_pages; + int i, err = 0; + + /* XXX: will have a better approach in the following commits */ + compressed_pages = kmalloc_array(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + *overlapped = false; + + for (i = 0; i < pclusterpages; ++i) { + unsigned int pagenr; + struct page *page = pcl->compressed_pages[i]; + + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + compressed_pages[i] = page; + + if (z_erofs_is_inline_pcluster(pcl)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + DBG_BUGON(z_erofs_page_is_invalidated(page)); + if (!z_erofs_is_shortlived_page(page)) { + if (erofs_page_is_managed(sbi, page)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + /* + * only if non-head page can be selected + * for inplace decompression + */ + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= pcl->nr_pages); + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + + *overlapped = true; + } + + /* PG_error needs checking for all non-managed pages */ + if (PageError(page)) { + DBG_BUGON(PageUptodate(page)); + err = -EIO; + } + } + + if (err) { + kfree(compressed_pages); + return ERR_PTR(err); + } + return 
compressed_pages; +} + static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, struct page **pagepool) @@ -957,54 +1027,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, pages[i] = NULL; err = z_erofs_parse_out_bvecs(pcl, pages, pagepool); - - overlapped = false; - compressed_pages = pcl->compressed_pages; - - for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; - - page = compressed_pages[i]; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); - - if (z_erofs_is_inline_pcluster(pcl)) { - if (!PageUptodate(page)) - err = -EIO; - continue; - } - - DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { - if (!PageUptodate(page)) - err = -EIO; - continue; - } - - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; - - overlapped = true; - } - - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + compressed_pages = z_erofs_parse_in_bvecs(sbi, pcl, pages, + pagepool, &overlapped); + if (IS_ERR(compressed_pages)) { + err = PTR_ERR(compressed_pages); + compressed_pages = NULL; } if (err) @@ -1040,21 +1067,22 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = compressed_pages[0]; - WRITE_ONCE(compressed_pages[0], NULL); + page = pcl->compressed_pages[0]; + WRITE_ONCE(pcl->compressed_pages[0], NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = compressed_pages[i]; + page = pcl->compressed_pages[i]; if 
(erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_pages[i], NULL); } } + kfree(compressed_pages); for (i = 0; i < nr_pages; ++i) { page = pages[i]; -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:42 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:42 +0800 Subject: [PATCH 07/16] erofs: switch compressed_pages[] to bufvec In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-8-hsiangkao@linux.alibaba.com> Convert compressed_pages[] to bufvec in order to avoid using page->private to keep onlinepage_index (decompressed offset) for inplace I/O pages. In the future, we only rely on folio->private to keep a countdown to unlock folios and set folio_uptodate. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 113 +++++++++++++++++++++++------------------------ fs/erofs/zdata.h | 4 +- 2 files changed, 57 insertions(+), 60 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 757d352bc2c7..f2e3f07baad7 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -134,7 +134,7 @@ static int z_erofs_create_pcluster_pool(void) for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { - size = struct_size(a, compressed_pages, pcs->maxpages); + size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, @@ -287,16 +287,16 @@ struct z_erofs_decompress_frontend { struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; - /* a pointer used to pick up inplace I/O pages */ - struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; - enum z_erofs_collectmode mode; bool readahead; /* used for applying cache strategy on 
the fly */ bool backmost; erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ @@ -319,24 +319,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - struct page **pages; - pgoff_t index; + unsigned int i; if (fe->mode < COLLECT_PRIMARY_FOLLOWED) return; - pages = pcl->compressed_pages; - index = pcl->obj.index; - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = tag_compressed_page_justfound(page); @@ -357,7 +354,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) @@ -388,7 +386,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * therefore no need to worry about available decompression users. 
*/ for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -401,7 +399,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } @@ -411,36 +409,39 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret = 0; /* 0 - busy */ + int ret, i; - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } - erofs_workgroup_unfreeze(&pcl->obj, 1); - - if (ret) - detach_page_private(page); } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } + } return false; } @@ -454,7 +455,7 @@ static int 
z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, if (fe->mode >= COLLECT_PRIMARY && type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec->page)) + if (z_erofs_try_inplace_io(fe, bvec)) return 0; /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && @@ -648,8 +649,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ - fe->icpage_ptr = fe->pcl->compressed_pages + - z_erofs_pclusterpages(fe->pcl); + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -769,7 +769,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto err_out; } get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ @@ -927,8 +928,9 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, *overlapped = false; for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; - struct page *page = pcl->compressed_pages[i]; + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; + unsigned int pgnr; /* compressed pages ought to be present before decompressing */ if (!page) { @@ -951,21 +953,15 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, continue; } - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= pcl->nr_pages); - if (pages[pagenr]) { + pgnr = (bvec->offset + pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= pcl->nr_pages); + if (pages[pgnr]) { DBG_BUGON(1); - 
SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); + SetPageError(pages[pgnr]); + z_erofs_onlinepage_endio(pages[pgnr]); err = -EFSCORRUPTED; } - pages[pagenr] = page; - + pages[pgnr] = page; *overlapped = true; } @@ -1067,19 +1063,19 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = pcl->compressed_pages[0]; - WRITE_ONCE(pcl->compressed_pages[0], NULL); + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_pages[i]; + page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } kfree(compressed_pages); @@ -1193,7 +1189,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) @@ -1209,7 +1205,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, * otherwise, it will go inplace I/O path instead. 
*/ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; @@ -1235,14 +1231,14 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. */ DBG_BUGON(!justfound); @@ -1271,7 +1267,8 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index a755c5a44d87..a70f1b73e901 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -87,8 +87,8 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; - /* A: compressed pages (can be cached or inplaced pages) */ - struct page *compressed_pages[]; + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; }; /* let's avoid the valid 32-bit kernel addresses */ -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:49 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:49 +0800 Subject: [PATCH 14/16] erofs: introduce z_erofs_do_decompressed_bvec() In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> 
Message-ID: <20220714132051.46012-15-hsiangkao@linux.alibaba.com> Both out_bvecs and in_bvecs share the common logic for decompressed buffers. So let's make a helper for this. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 49 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 4093d8a4ce93..391755dafecd 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -861,6 +861,26 @@ struct z_erofs_decompress_backend { unsigned int onstack_used; }; +static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + unsigned int pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + struct page *oldpage; + + DBG_BUGON(pgnr >= be->pcl->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; + + /* error out if one pcluster is refenenced multiple times. */ + if (oldpage) { + DBG_BUGON(1); + z_erofs_page_mark_eio(oldpage); + z_erofs_onlinepage_endio(oldpage); + return -EFSCORRUPTED; + } + return 0; +} + static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; @@ -871,27 +891,14 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; - unsigned int pgnr; z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - pgnr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= pcl->nr_pages); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. 
- */ - if (be->decompressed_pages[pgnr]) { - DBG_BUGON(1); - z_erofs_page_mark_eio(be->decompressed_pages[pgnr]); - z_erofs_onlinepage_endio(be->decompressed_pages[pgnr]); - err = -EFSCORRUPTED; - } - be->decompressed_pages[pgnr] = bvec.page; + err = z_erofs_do_decompressed_bvec(be, &bvec); } old_bvpage = z_erofs_bvec_iter_end(&biter); @@ -911,7 +918,6 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, for (i = 0; i < pclusterpages; ++i) { struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - unsigned int pgnr; /* compressed pages ought to be present before decompressing */ if (!page) { @@ -933,18 +939,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, err = -EIO; continue; } - - pgnr = (bvec->offset + pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= pcl->nr_pages); - if (be->decompressed_pages[pgnr]) { - DBG_BUGON(1); - z_erofs_page_mark_eio( - be->decompressed_pages[pgnr]); - z_erofs_onlinepage_endio( - be->decompressed_pages[pgnr]); - err = -EFSCORRUPTED; - } - be->decompressed_pages[pgnr] = page; + err = z_erofs_do_decompressed_bvec(be, bvec); *overlapped = true; } } -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:47 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:47 +0800 Subject: [PATCH 12/16] erofs: introduce struct z_erofs_decompress_backend In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-13-hsiangkao@linux.alibaba.com> Let's introduce struct z_erofs_decompress_backend in order to pass on the decompression backend context between helper functions more easily and avoid too many arguments.
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 142 +++++++++++++++++++++++++---------------------- fs/erofs/zdata.h | 3 +- 2 files changed, 76 insertions(+), 69 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f24b866bc975..7aea6bb1e018 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -847,9 +847,22 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, - struct page **pages, struct page **pagepool) +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; + + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; + + struct page **pagepool; +}; + +static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) { + struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; struct page *old_bvpage; int i, err = 0; @@ -857,39 +870,39 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; - unsigned int pagenr; + unsigned int pgnr; z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); if (old_bvpage) - z_erofs_put_shortlivedpage(pagepool, old_bvpage); + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pagenr >= pcl->nr_pages); + pgnr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= pcl->nr_pages); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); /* * currently EROFS doesn't support multiref(dedup), * so here erroring out one multiref page. 
*/ - if (pages[pagenr]) { + if (be->decompressed_pages[pgnr]) { DBG_BUGON(1); - z_erofs_page_mark_eio(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); + z_erofs_page_mark_eio(be->decompressed_pages[pgnr]); + z_erofs_onlinepage_endio(be->decompressed_pages[pgnr]); err = -EFSCORRUPTED; } - pages[pagenr] = bvec.page; + be->decompressed_pages[pgnr] = bvec.page; } old_bvpage = z_erofs_bvec_iter_end(&biter); if (old_bvpage) - z_erofs_put_shortlivedpage(pagepool, old_bvpage); + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); return err; } -static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, - struct z_erofs_pcluster *pcl, struct page **pages, - struct page **pagepool, bool *overlapped) +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) { + struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); struct page **compressed_pages; int i, err = 0; @@ -919,7 +932,7 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; @@ -927,59 +940,55 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, pgnr = (bvec->offset + pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= pcl->nr_pages); - if (pages[pgnr]) { + if (be->decompressed_pages[pgnr]) { DBG_BUGON(1); - z_erofs_page_mark_eio(pages[pgnr]); - z_erofs_onlinepage_endio(pages[pgnr]); + z_erofs_page_mark_eio( + be->decompressed_pages[pgnr]); + z_erofs_onlinepage_endio( + be->decompressed_pages[pgnr]); err = -EFSCORRUPTED; } - pages[pgnr] = page; + be->decompressed_pages[pgnr] = page; *overlapped = true; } } if (err) { kfree(compressed_pages); - return ERR_PTR(err); + return err; } - return compressed_pages; + be->compressed_pages = compressed_pages; + 
return 0; } -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct page **pagepool, int err) +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; - + unsigned int i, inputsize, outputsize, llen, nr_pages, err2; + struct page *page; bool overlapped, partial; - might_sleep(); DBG_BUGON(!READ_ONCE(pcl->nr_pages)); - mutex_lock(&pcl->lock); nr_pages = pcl->nr_pages; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) - pages = pages_onstack; - else - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); - - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; - - err = z_erofs_parse_out_bvecs(pcl, pages, pagepool); - compressed_pages = z_erofs_parse_in_bvecs(sbi, pcl, pages, - pagepool, &overlapped); - if (IS_ERR(compressed_pages)) { - err = PTR_ERR(compressed_pages); - compressed_pages = NULL; + if (nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * nr_pages); + } else { + be->decompressed_pages = + kvcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); } + err = z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; if (err) goto out; @@ -998,9 +1007,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, + .sb = be->sb, + .in = be->compressed_pages, + .out = 
be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, @@ -1008,7 +1017,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = partial - }, pagepool); + }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ @@ -1024,29 +1033,29 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, continue; /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); + (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } - kfree(compressed_pages); + kfree(be->compressed_pages); for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; if (err) z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); pcl->nr_pages = 0; pcl->bvset.nextpage = NULL; @@ -1061,23 +1070,23 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ 
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool, - io->eio ? -EIO : 0); - erofs_workgroup_put(&pcl->obj); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -1103,7 +1112,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); - return; } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 5964c942799e..ec09ca035fbb 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -173,7 +173,6 @@ static inline void z_erofs_onlinepage_endio(struct page *page) } } -#define Z_EROFS_VMAP_ONSTACK_PAGES \ - min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) +#define Z_EROFS_ONSTACK_PAGES 32 #endif -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:45 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:45 +0800 Subject: [PATCH 10/16] erofs: clean up `enum z_erofs_collectmode' In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-11-hsiangkao@linux.alibaba.com> `enum z_erofs_collectmode' is really ambiguous, but I'm not quite sure if there are better naming, basically it's used to judge whether inplace I/O can be used due to the current status of pclusters in the chain. Rename it as `enum z_erofs_pclustermode' instead. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 63 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index cdfb2706e4ae..55bcd6e5ae9a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -227,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void) return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pclusters was the tail of an exist chain, in addition + * that the previous processed chained pclusters are all decided to * be hooked up to it. - * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| */ - COLLECT_PRIMARY_HOOKED, + Z_EROFS_PCLUSTER_HOOKED, /* - * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. 
All related online pages cannot be reused for inplace I/O (or * pagevec) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -261,12 +260,12 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. ] */ - COLLECT_PRIMARY_FOLLOWED, + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { @@ -277,7 +276,7 @@ struct z_erofs_decompress_frontend { struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; z_erofs_next_pcluster_t owned_head; - enum z_erofs_collectmode mode; + enum z_erofs_pclustermode mode; bool readahead; /* used for applying cache strategy on the fly */ @@ -290,7 +289,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); @@ -310,7 +309,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (fe->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pcl->pclusterpages; ++i) { @@ -358,7 +357,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * managed cache since 
it can be moved to the bypass queue instead. */ if (standalone) - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -439,12 +438,12 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, { int ret; - if (fe->mode >= COLLECT_PRIMARY && exclusive) { + if (exclusive) { /* give priority for inplaceio to use file pages first */ if (z_erofs_try_inplace_io(fe, bvec)) return 0; /* otherwise, check if it can be used as a bvpage */ - if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } @@ -463,7 +462,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. */ - f->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } @@ -474,12 +473,12 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = COLLECT_PRIMARY; + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) @@ -554,7 +553,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; - fe->mode = COLLECT_PRIMARY_FOLLOWED; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others @@ -676,7 +675,7 @@ static bool 
z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. */ - if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; @@ -756,7 +755,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, get_page(fe->map.buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, fe->map.buf.page); - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, @@ -774,8 +773,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, * those chains are handled asynchronously thus the page cannot be used * for inplace I/O or pagevec (should be processed in strict order.) */ - tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && - fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -785,7 +784,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:51 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:51 +0800 Subject: [PATCH 16/16] erofs: introduce multi-reference pclusters (fully-referenced) In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: 
<20220714132051.46012-17-hsiangkao@linux.alibaba.com> Let's introduce multi-reference pclusters at runtime. In details, if one pcluster is requested by multiple extents at almost the same time (even belong to different files), the longest extent will be decompressed as representative and the other extents are actually copied from the longest one. After this patch, fully-referenced extents can be correctly handled and the full decoding check needs to be bypassed for partial-referenced extents. Signed-off-by: Gao Xiang --- fs/erofs/compress.h | 2 +- fs/erofs/decompressor.c | 2 +- fs/erofs/zdata.c | 120 +++++++++++++++++++++++++++------------- fs/erofs/zdata.h | 3 + 4 files changed, 88 insertions(+), 39 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 19e6c56a9f47..26fa170090b8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -17,7 +17,7 @@ struct z_erofs_decompress_req { /* indicate the algorithm will be used for decompression */ unsigned int alg; - bool inplace_io, partial_decoding; + bool inplace_io, partial_decoding, fillgaps; }; struct z_erofs_decompressor { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6dca1900c733..91b9bff10198 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, j = 0; /* 'valid' bounced can only be tested after a complete round */ - if (test_bit(j, bounced)) { + if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8dcfc2a9704e..601cfcb07c50 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -467,7 +467,8 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) * type 2, link to the end of an existing open chain, be careful * that its submission is controlled 
by the original attached chain. */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; f->mode = Z_EROFS_PCLUSTER_HOOKED; @@ -480,20 +481,8 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) { - struct erofs_map_blocks *map = &fe->map; struct z_erofs_pcluster *pcl = fe->pcl; - /* to avoid unexpected loop formed by corrupted images */ - if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -785,6 +774,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; + fe->pcl->multibases = + (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)); if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; @@ -842,36 +833,90 @@ struct z_erofs_decompress_backend { /* pages to keep the compressed data */ struct page **compressed_pages; + struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, nr_pages; }; -static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, - struct z_erofs_bvec *bvec) +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; + +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) { - unsigned int pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; - struct page *oldpage; + struct z_erofs_bvec_item 
*item; - DBG_BUGON(pgnr >= be->nr_pages); - oldpage = be->decompressed_pages[pgnr]; - be->decompressed_pages[pgnr] = bvec->page; + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; + struct page *oldpage; - /* error out if one pcluster is refenenced multiple times. */ - if (oldpage) { - DBG_BUGON(1); - z_erofs_page_mark_eio(oldpage); - z_erofs_onlinepage_endio(oldpage); - return -EFSCORRUPTED; + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; + + if (!oldpage) + return; + } + + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} + +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? 
-bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); } - return 0; } -static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; struct page *old_bvpage; - int i, err = 0; + int i; z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { @@ -883,13 +928,12 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); - err = z_erofs_do_decompressed_bvec(be, &bvec); + z_erofs_do_decompressed_bvec(be, &bvec); } old_bvpage = z_erofs_bvec_iter_end(&biter); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - return err; } static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, @@ -924,7 +968,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, err = -EIO; continue; } - err = z_erofs_do_decompressed_bvec(be, bvec); + z_erofs_do_decompressed_bvec(be, bvec); *overlapped = true; } } @@ -940,7 +984,7 @@ static int z_erofs_decompress_pcluster(struct 
z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - unsigned int i, inputsize, err2; + unsigned int i, inputsize; struct page *page; bool overlapped; @@ -970,10 +1014,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, kvcalloc(pclusterpages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); - err = z_erofs_parse_out_bvecs(be); - err2 = z_erofs_parse_in_bvecs(be, &overlapped); - if (err2) - err = err2; + z_erofs_parse_out_bvecs(be); + err = z_erofs_parse_in_bvecs(be, &overlapped); if (err) goto out; @@ -993,6 +1035,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, }, be->pagepool); out: @@ -1016,6 +1059,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; @@ -1052,6 +1096,8 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct z_erofs_decompress_backend be = { .sb = io->sb, .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), }; z_erofs_next_pcluster_t owned = io->head; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index a7fd44d21d9e..515fa2b28b97 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -84,6 +84,9 @@ struct z_erofs_pcluster { /* L: whether partial decompression or not */ bool partial; + /* L: indicate several pageofs_outs or not */ + bool multibases; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; -- 2.24.4 From 
hsiangkao at linux.alibaba.com Thu Jul 14 23:20:44 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:44 +0800 Subject: [PATCH 09/16] erofs: get rid of `enum z_erofs_page_type' In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-10-hsiangkao@linux.alibaba.com> Remove it since pagevec[] is no longer used. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 9065e160d6a6..cdfb2706e4ae 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -27,17 +27,6 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - struct z_erofs_bvec_iter { struct page *bvpage; struct z_erofs_bvset *bvset; @@ -429,7 +418,6 @@ int erofs_try_to_free_cached_page(struct page *page) return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { @@ -447,13 +435,11 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec, - enum z_erofs_page_type type) + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { + if (fe->mode >= COLLECT_PRIMARY && exclusive) { /* give priority for inplaceio to use file pages first */ if (z_erofs_try_inplace_io(fe, bvec)) return 0; @@ 
-718,10 +704,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; unsigned int cur, end, spiltted, index; int err = 0; @@ -798,12 +783,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto next_part; } - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - + exclusive = (!cur && (!spiltted || tight)); if (cur) tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); @@ -812,7 +792,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, .page = page, .offset = offset - map->m_la, .end = end, - }), page_type); + }), exclusive); /* should allocate an additional short-lived page for bvset */ if (err == -EAGAIN && !fe->candidate_bvpage) { fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:20:37 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Thu, 14 Jul 2022 21:20:37 +0800 Subject: [PATCH 02/16] erofs: clean up z_erofs_collector_begin() In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: <20220714132051.46012-3-hsiangkao@linux.alibaba.com> Rearrange the code and get rid of all gotos. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 1b6816dd235f..c7be447ac64d 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -521,7 +521,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; - struct erofs_workgroup *grp; + struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); @@ -530,33 +530,31 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (map->m_flags & EROFS_MAP_META) { - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - goto tailpacking; + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - grp = erofs_find_workgroup(fe->inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; } else { -tailpacking: ret = z_erofs_register_pcluster(fe); - if (!ret) - goto out; - if (ret != -EEXIST) - return ret; } - ret = z_erofs_lookup_pcluster(fe); - if (ret) { - erofs_workgroup_put(&fe->pcl->obj); + if (ret == -EEXIST) { + ret = z_erofs_lookup_pcluster(fe); + if (ret) { + erofs_workgroup_put(&fe->pcl->obj); + return ret; + } + } else if (ret) { return ret; } -out: z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->pagevec, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ -- 2.24.4 From hsiangkao at linux.alibaba.com Thu Jul 14 23:38:57 2022 From: hsiangkao at linux.alibaba.com (Gao 
Xiang) Date: Thu, 14 Jul 2022 21:38:57 +0800 Subject: [PATCH 00/16] erofs: prepare for folios, duplication and kill PG_error In-Reply-To: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> Message-ID: On Thu, Jul 14, 2022 at 09:20:35PM +0800, Gao Xiang wrote: > Hi folks, > > I've been doing this for almost 2 months, the main point of this is > to support large folios and rolling hash deduplication for compressed > data. > > This patchset is as a start of this work targeting for the next 5.20, > it introduces a flexable range representation for (de)compressed buffers > instead of too relying on page(s) directly themselves, so large folios > can laterly base on this work. Also, this patchset gets rid of all > PG_error flags in the decompression code. It's a cleanup as a result > as well. > > In addition, this patchset kicks off rolling hash deduplication for > compressed data by introducing fully-referenced multi-reference > pclusters first instead of reporting fs corruption if one pcluster > is introduced by several differnt extents. The full implementation > is expected to be finished in the merge window after the next. One > of my colleagues is actively working on the userspace part of this > feature. > > However, it's still easy to verify fully-referenced multi-reference > pcluster by constructing some image by hand (see attachment): > > Dataset: 300M > seq-read (data-deduplicated, read_ahead_kb 8192): 1095MiB/s > seq-read (data-deduplicated, read_ahead_kb 4096): 771MiB/s > seq-read (data-deduplicated, read_ahead_kb 512): 577MiB/s > seq-read (vanilla, read_ahead_kb 8192): 364MiB/s > Testdata above as attachment for reference. -------------- next part -------------- A non-text attachment was scrubbed... 
Name: pat.erofs.xz Type: application/octet-stream Size: 12212 bytes Desc: not available URL: From zbestahu at gmail.com Fri Jul 15 16:20:58 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 15 Jul 2022 14:20:58 +0800 Subject: [PATCH 01/16] erofs: get rid of unneeded `inode', `map' and `sb' In-Reply-To: <20220714132051.46012-2-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-2-hsiangkao@linux.alibaba.com> Message-ID: <20220715142058.00005f60.zbestahu@gmail.com> On Thu, 14 Jul 2022 21:20:36 +0800 Gao Xiang wrote: > Since commit 5c6dcc57e2e5 ("erofs: get rid of > `struct z_erofs_collector'"), these arguments can be dropped as well. > > No logic changes. > > Signed-off-by: Gao Xiang > --- > fs/erofs/zdata.c | 42 +++++++++++++++++++----------------------- > 1 file changed, 19 insertions(+), 23 deletions(-) > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index 724bb57075f6..1b6816dd235f 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -404,10 +404,9 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) > f->mode = COLLECT_PRIMARY; > } > > -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, > - struct inode *inode, > - struct erofs_map_blocks *map) > +static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) > { > + struct erofs_map_blocks *map = &fe->map; > struct z_erofs_pcluster *pcl = fe->pcl; > unsigned int length; > > @@ -449,10 +448,9 @@ static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, > return 0; > } > > -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, > - struct inode *inode, > - struct erofs_map_blocks *map) > +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) > { > + struct erofs_map_blocks *map = &fe->map; > bool ztailpacking = map->m_flags & EROFS_MAP_META; > struct z_erofs_pcluster *pcl; > struct erofs_workgroup *grp; > 
@@ -494,7 +492,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, > } else { > pcl->obj.index = map->m_pa >> PAGE_SHIFT; > > - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); > + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); > if (IS_ERR(grp)) { > err = PTR_ERR(grp); > goto err_out; > @@ -520,10 +518,9 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, > return err; > } > > -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, > - struct inode *inode, > - struct erofs_map_blocks *map) > +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) > { > + struct erofs_map_blocks *map = &fe->map; > struct erofs_workgroup *grp; > int ret; > > @@ -541,19 +538,19 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, > goto tailpacking; > } > > - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); > + grp = erofs_find_workgroup(fe->inode->i_sb, map->m_pa >> PAGE_SHIFT); > if (grp) { > fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); > } else { > tailpacking: > - ret = z_erofs_register_pcluster(fe, inode, map); > + ret = z_erofs_register_pcluster(fe); > if (!ret) > goto out; > if (ret != -EEXIST) > return ret; > } > > - ret = z_erofs_lookup_pcluster(fe, inode, map); > + ret = z_erofs_lookup_pcluster(fe); > if (ret) { > erofs_workgroup_put(&fe->pcl->obj); > return ret; > @@ -663,7 +660,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, > if (!(map->m_flags & EROFS_MAP_MAPPED)) > goto hitted; > > - err = z_erofs_collector_begin(fe, inode, map); > + err = z_erofs_collector_begin(fe); > if (err) > goto err_out; > > @@ -1259,13 +1256,13 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) > bio_put(bio); > } > > -static void z_erofs_submit_queue(struct super_block *sb, > - struct z_erofs_decompress_frontend *f, > +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend 
*f, > struct page **pagepool, > struct z_erofs_decompressqueue *fgq, > bool *force_fg) > { > - struct erofs_sb_info *const sbi = EROFS_SB(sb); > + struct super_block *sb = f->inode->i_sb; > + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); > z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; > struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; > void *bi_private; > @@ -1317,7 +1314,7 @@ static void z_erofs_submit_queue(struct super_block *sb, > struct page *page; > > page = pickup_page_for_submission(pcl, i++, pagepool, > - MNGD_MAPPING(sbi)); > + mc); > if (!page) > continue; > > @@ -1369,15 +1366,14 @@ static void z_erofs_submit_queue(struct super_block *sb, > z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); > } > > -static void z_erofs_runqueue(struct super_block *sb, > - struct z_erofs_decompress_frontend *f, > +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, > struct page **pagepool, bool force_fg) > { > struct z_erofs_decompressqueue io[NR_JOBQUEUES]; > > if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) > return; > - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); > + z_erofs_submit_queue(f, pagepool, io, &force_fg); > > /* handle bypass queue (no i/o pclusters) immediately */ > z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); > @@ -1475,7 +1471,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) > (void)z_erofs_collector_end(&f); > > /* if some compressed cluster ready, need submit them anyway */ > - z_erofs_runqueue(inode->i_sb, &f, &pagepool, > + z_erofs_runqueue(&f, &pagepool, > z_erofs_get_sync_decompress_policy(sbi, 0)); > > if (err) > @@ -1524,7 +1520,7 @@ static void z_erofs_readahead(struct readahead_control *rac) > z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); > (void)z_erofs_collector_end(&f); > > - z_erofs_runqueue(inode->i_sb, &f, &pagepool, > + z_erofs_runqueue(&f, &pagepool, > z_erofs_get_sync_decompress_policy(sbi, nr_pages)); > erofs_put_metabuf(&f.map.buf); > 
erofs_release_pages(&pagepool); Reviewed-by: Yue Hu From zbestahu at gmail.com Fri Jul 15 16:22:01 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 15 Jul 2022 14:22:01 +0800 Subject: [PATCH 02/16] erofs: clean up z_erofs_collector_begin() In-Reply-To: <20220714132051.46012-3-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-3-hsiangkao@linux.alibaba.com> Message-ID: <20220715142201.000030f1.zbestahu@gmail.com> On Thu, 14 Jul 2022 21:20:37 +0800 Gao Xiang wrote: > Rearrange the code and get rid of all gotos. > > Signed-off-by: Gao Xiang > --- > fs/erofs/zdata.c | 32 +++++++++++++++----------------- > 1 file changed, 15 insertions(+), 17 deletions(-) > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index 1b6816dd235f..c7be447ac64d 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -521,7 +521,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) > static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) > { > struct erofs_map_blocks *map = &fe->map; > - struct erofs_workgroup *grp; > + struct erofs_workgroup *grp = NULL; > int ret; > > DBG_BUGON(fe->pcl); > @@ -530,33 +530,31 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) > DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); > DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); > > - if (map->m_flags & EROFS_MAP_META) { > - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { > - DBG_BUGON(1); > - return -EFSCORRUPTED; > - } > - goto tailpacking; > + if (!(map->m_flags & EROFS_MAP_META)) { > + grp = erofs_find_workgroup(fe->inode->i_sb, > + map->m_pa >> PAGE_SHIFT); > + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { > + DBG_BUGON(1); > + return -EFSCORRUPTED; > } > > - grp = erofs_find_workgroup(fe->inode->i_sb, map->m_pa >> PAGE_SHIFT); > if (grp) { > fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); > + ret = 
-EEXIST; > } else { > -tailpacking: > ret = z_erofs_register_pcluster(fe); > - if (!ret) > - goto out; > - if (ret != -EEXIST) > - return ret; > } > > - ret = z_erofs_lookup_pcluster(fe); > - if (ret) { > - erofs_workgroup_put(&fe->pcl->obj); > + if (ret == -EEXIST) { > + ret = z_erofs_lookup_pcluster(fe); > + if (ret) { > + erofs_workgroup_put(&fe->pcl->obj); > + return ret; > + } > + } else if (ret) { > return ret; > } > > -out: > z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, > fe->pcl->pagevec, fe->pcl->vcnt); > /* since file-backed online pages are traversed in reverse order */ Reviewed-by: Yue Hu From zbestahu at gmail.com Fri Jul 15 16:22:53 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 15 Jul 2022 14:22:53 +0800 Subject: [PATCH 03/16] erofs: introduce `z_erofs_parse_out_bvecs()' In-Reply-To: <20220714132051.46012-4-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-4-hsiangkao@linux.alibaba.com> Message-ID: <20220715142253.00005239.zbestahu@gmail.com> On Thu, 14 Jul 2022 21:20:38 +0800 Gao Xiang wrote: > `z_erofs_decompress_pcluster()' is too long therefore it'd be better > to introduce another helper to parse decompressed pages (or laterly, > decompressed bvecs.) > > BTW, since `decompressed_bvecs' is too long as a part of the function > name, `out_bvecs' is used instead. 
> > Signed-off-by: Gao Xiang > --- > fs/erofs/zdata.c | 81 +++++++++++++++++++++++++----------------------- > 1 file changed, 43 insertions(+), 38 deletions(-) > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index c7be447ac64d..c183cd0bc42b 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -778,18 +778,58 @@ static bool z_erofs_page_is_invalidated(struct page *page) > return !page->mapping && !z_erofs_is_shortlived_page(page); > } > > +static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, > + struct page **pages, struct page **pagepool) > +{ > + struct z_erofs_pagevec_ctor ctor; > + enum z_erofs_page_type page_type; > + int i, err = 0; > + > + z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, > + pcl->pagevec, 0); > + for (i = 0; i < pcl->vcnt; ++i) { > + struct page *page = z_erofs_pagevec_dequeue(&ctor, &page_type); > + unsigned int pagenr; > + > + /* all pages in pagevec ought to be valid */ > + DBG_BUGON(!page); > + DBG_BUGON(z_erofs_page_is_invalidated(page)); > + > + if (z_erofs_put_shortlivedpage(pagepool, page)) > + continue; > + > + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) > + pagenr = 0; > + else > + pagenr = z_erofs_onlinepage_index(page); > + > + DBG_BUGON(pagenr >= pcl->nr_pages); > + /* > + * currently EROFS doesn't support multiref(dedup), > + * so here erroring out one multiref page. 
> + */ > + if (pages[pagenr]) { > + DBG_BUGON(1); > + SetPageError(pages[pagenr]); > + z_erofs_onlinepage_endio(pages[pagenr]); > + err = -EFSCORRUPTED; > + } > + pages[pagenr] = page; > + } > + z_erofs_pagevec_ctor_exit(&ctor, true); > + return err; > +} > + > static int z_erofs_decompress_pcluster(struct super_block *sb, > struct z_erofs_pcluster *pcl, > struct page **pagepool) > { > struct erofs_sb_info *const sbi = EROFS_SB(sb); > unsigned int pclusterpages = z_erofs_pclusterpages(pcl); > - struct z_erofs_pagevec_ctor ctor; > unsigned int i, inputsize, outputsize, llen, nr_pages; > struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; > struct page **pages, **compressed_pages, *page; > > - enum z_erofs_page_type page_type; > bool overlapped, partial; > int err; > > @@ -823,42 +863,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, > for (i = 0; i < nr_pages; ++i) > pages[i] = NULL; > > - err = 0; > - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, > - pcl->pagevec, 0); > - > - for (i = 0; i < pcl->vcnt; ++i) { > - unsigned int pagenr; > - > - page = z_erofs_pagevec_dequeue(&ctor, &page_type); > - > - /* all pages in pagevec ought to be valid */ > - DBG_BUGON(!page); > - DBG_BUGON(z_erofs_page_is_invalidated(page)); > - > - if (z_erofs_put_shortlivedpage(pagepool, page)) > - continue; > - > - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) > - pagenr = 0; > - else > - pagenr = z_erofs_onlinepage_index(page); > - > - DBG_BUGON(pagenr >= nr_pages); > - > - /* > - * currently EROFS doesn't support multiref(dedup), > - * so here erroring out one multiref page. 
> - */ > - if (pages[pagenr]) { > - DBG_BUGON(1); > - SetPageError(pages[pagenr]); > - z_erofs_onlinepage_endio(pages[pagenr]); > - err = -EFSCORRUPTED; > - } > - pages[pagenr] = page; > - } > - z_erofs_pagevec_ctor_exit(&ctor, true); > + err = z_erofs_parse_out_bvecs(pcl, pages, pagepool); > > overlapped = false; > compressed_pages = pcl->compressed_pages; Reviewed-by: Yue Hu From zbestahu at gmail.com Fri Jul 15 16:29:30 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 15 Jul 2022 14:29:30 +0800 Subject: [PATCH 04/16] erofs: introduce bufvec to store decompressed buffers In-Reply-To: <20220714132051.46012-5-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-5-hsiangkao@linux.alibaba.com> Message-ID: <20220715142930.00001cdd.zbestahu@gmail.com> On Thu, 14 Jul 2022 21:20:39 +0800 Gao Xiang wrote: > For each pcluster, the total compressed buffers are determined in > advance, yet the number of decompressed buffers actually vary. Too > many decompressed pages can be recorded if one pcluster is highly > compressed or its pcluster size is large. That takes extra memory > footprints compared to uncompressed filesystems, especially a lot of > I/O in flight on low-ended devices. > > Therefore, similar to inplace I/O, pagevec was introduced to reuse > page cache to store these pointers in the time-sharing way since > these pages are actually unused before decompressing. > > In order to make it more flexable, a cleaner bufvec is used to > replace the old pagevec stuffs so that > > - Decompressed offsets can be stored inline, thus it can be used > for the upcoming feature like compressed data deduplication; > > - Towards supporting large folios for compressed inodes since > our final goal is to completely avoid page->private but use > folio->private only for all page cache pages. 
> > Signed-off-by: Gao Xiang > --- > fs/erofs/zdata.c | 177 +++++++++++++++++++++++++++++++++++------------ > fs/erofs/zdata.h | 26 +++++-- > 2 files changed, 153 insertions(+), 50 deletions(-) > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index c183cd0bc42b..f52c54058f31 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -2,6 +2,7 @@ > /* > * Copyright (C) 2018 HUAWEI, Inc. > * https://www.huawei.com/ > + * Copyright (C) 2022 Alibaba Cloud > */ > #include "zdata.h" > #include "compress.h" > @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { > _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) > }; > > +struct z_erofs_bvec_iter { > + struct page *bvpage; > + struct z_erofs_bvset *bvset; > + unsigned int nr, cur; > +}; > + > +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) > +{ > + if (iter->bvpage) > + kunmap_local(iter->bvset); > + return iter->bvpage; > +} > + > +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) > +{ > + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; > + /* have to access nextpage in advance, otherwise it will be unmapped */ > + struct page *nextpage = iter->bvset->nextpage; > + struct page *oldpage; > + > + DBG_BUGON(!nextpage); > + oldpage = z_erofs_bvec_iter_end(iter); > + iter->bvpage = nextpage; > + iter->bvset = kmap_local_page(nextpage); > + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); > + iter->cur = 0; > + return oldpage; > +} > + > +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, > + struct z_erofs_bvset_inline *bvset, > + unsigned int bootstrap_nr, > + unsigned int cur) > +{ > + *iter = (struct z_erofs_bvec_iter) { > + .nr = bootstrap_nr, > + .bvset = (struct z_erofs_bvset *)bvset, > + }; > + > + while (cur > iter->nr) { > + cur -= iter->nr; > + z_erofs_bvset_flip(iter); > + } > + iter->cur = cur; > +} > + > +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, > + struct 
z_erofs_bvec *bvec, > + struct page **candidate_bvpage) > +{ > + if (iter->cur == iter->nr) { > + if (!*candidate_bvpage) > + return -EAGAIN; > + > + DBG_BUGON(iter->bvset->nextpage); > + iter->bvset->nextpage = *candidate_bvpage; > + z_erofs_bvset_flip(iter); > + > + iter->bvset->nextpage = NULL; > + *candidate_bvpage = NULL; > + } > + iter->bvset->bvec[iter->cur++] = *bvec; > + return 0; > +} > + > +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, > + struct z_erofs_bvec *bvec, > + struct page **old_bvpage) > +{ > + if (iter->cur == iter->nr) > + *old_bvpage = z_erofs_bvset_flip(iter); > + else > + *old_bvpage = NULL; > + *bvec = iter->bvset->bvec[iter->cur++]; > +} > + Touch a new file to include bufvec related code? call it zbvec.c/h? > static void z_erofs_destroy_pcluster_pool(void) > { > int i; > @@ -195,9 +272,10 @@ enum z_erofs_collectmode { > struct z_erofs_decompress_frontend { > struct inode *const inode; > struct erofs_map_blocks map; > - > + struct z_erofs_bvec_iter biter; > struct z_erofs_pagevec_ctor vector; > > + struct page *candidate_bvpage; > struct z_erofs_pcluster *pcl, *tailpcl; > /* a pointer used to pick up inplace I/O pages */ > struct page **icpage_ptr; > @@ -358,21 +436,24 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, > > /* callers must be with pcluster lock held */ > static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, > - struct page *page, enum z_erofs_page_type type, > - bool pvec_safereuse) > + struct z_erofs_bvec *bvec, > + enum z_erofs_page_type type) > { > int ret; > > - /* give priority for inplaceio */ > if (fe->mode >= COLLECT_PRIMARY && > - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && > - z_erofs_try_inplace_io(fe, page)) > - return 0; > - > - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, > - pvec_safereuse); > - fe->pcl->vcnt += (unsigned int)ret; > - return ret ? 
0 : -EAGAIN; > + type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { > + /* give priority for inplaceio to use file pages first */ > + if (z_erofs_try_inplace_io(fe, bvec->page)) > + return 0; > + /* otherwise, check if it can be used as a bvpage */ > + if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && > + !fe->candidate_bvpage) > + fe->candidate_bvpage = bvec->page; > + } > + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); > + fe->pcl->vcnt += (ret >= 0); > + return ret; > } > > static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) > @@ -554,9 +635,8 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) > } else if (ret) { > return ret; > } > - > - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, > - fe->pcl->pagevec, fe->pcl->vcnt); > + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, > + Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->vcnt); > /* since file-backed online pages are traversed in reverse order */ > fe->icpage_ptr = fe->pcl->compressed_pages + > z_erofs_pclusterpages(fe->pcl); > @@ -588,9 +668,14 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) > if (!pcl) > return false; > > - z_erofs_pagevec_ctor_exit(&fe->vector, false); > + z_erofs_bvec_iter_end(&fe->biter); > mutex_unlock(&pcl->lock); > > + if (fe->candidate_bvpage) { > + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); > + fe->candidate_bvpage = NULL; > + } > + > /* > * if all pending pages are added, don't hold its reference > * any longer if the pcluster isn't hosted by ourselves. 
> @@ -712,22 +797,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, > tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); > > retry: > - err = z_erofs_attach_page(fe, page, page_type, > - fe->mode >= COLLECT_PRIMARY_FOLLOWED); > - /* should allocate an additional short-lived page for pagevec */ > - if (err == -EAGAIN) { > - struct page *const newpage = > - alloc_page(GFP_NOFS | __GFP_NOFAIL); > - > - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); > - err = z_erofs_attach_page(fe, newpage, > - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); > - if (!err) > - goto retry; > + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { > + .page = page, > + .offset = offset - map->m_la, > + .end = end, > + }), page_type); > + /* should allocate an additional short-lived page for bvset */ > + if (err == -EAGAIN && !fe->candidate_bvpage) { > + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); > + set_page_private(fe->candidate_bvpage, > + Z_EROFS_SHORTLIVED_PAGE); > + goto retry; > } > > - if (err) > + if (err) { > + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); > goto err_out; > + } > > index = page->index - (map->m_la >> PAGE_SHIFT); > > @@ -781,29 +867,24 @@ static bool z_erofs_page_is_invalidated(struct page *page) > static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, > struct page **pages, struct page **pagepool) > { > - struct z_erofs_pagevec_ctor ctor; > - enum z_erofs_page_type page_type; > + struct z_erofs_bvec_iter biter; > + struct page *old_bvpage; > int i, err = 0; > > - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, > - pcl->pagevec, 0); > + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, > + Z_EROFS_NR_INLINE_PAGEVECS, 0); > for (i = 0; i < pcl->vcnt; ++i) { > - struct page *page = z_erofs_pagevec_dequeue(&ctor, &page_type); > + struct z_erofs_bvec bvec; > unsigned int pagenr; > > - /* all pages in pagevec ought to be valid */ > - DBG_BUGON(!page); > - DBG_BUGON(z_erofs_page_is_invalidated(page)); > - > 
- if (z_erofs_put_shortlivedpage(pagepool, page)) > - continue; > + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); > > - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) > - pagenr = 0; > - else > - pagenr = z_erofs_onlinepage_index(page); > + if (old_bvpage) > + z_erofs_put_shortlivedpage(pagepool, old_bvpage); > > + pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; > DBG_BUGON(pagenr >= pcl->nr_pages); > + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); > /* > * currently EROFS doesn't support multiref(dedup), > * so here erroring out one multiref page. > @@ -814,9 +895,12 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, > z_erofs_onlinepage_endio(pages[pagenr]); > err = -EFSCORRUPTED; > } > - pages[pagenr] = page; > + pages[pagenr] = bvec.page; > } > - z_erofs_pagevec_ctor_exit(&ctor, true); > + > + old_bvpage = z_erofs_bvec_iter_end(&biter); > + if (old_bvpage) > + z_erofs_put_shortlivedpage(pagepool, old_bvpage); > return err; > } > > @@ -986,6 +1070,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, > kvfree(pages); > > pcl->nr_pages = 0; > + pcl->bvset.nextpage = NULL; > pcl->vcnt = 0; > > /* pcluster lock MUST be taken before the following line */ > diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h > index 58053bb5066f..d03e333e4fde 100644 > --- a/fs/erofs/zdata.h > +++ b/fs/erofs/zdata.h > @@ -21,6 +21,21 @@ > */ > typedef void *z_erofs_next_pcluster_t; > > +struct z_erofs_bvec { > + struct page *page; > + int offset; > + unsigned int end; > +}; > + > +#define __Z_EROFS_BVSET(name, total) \ > +struct name { \ > + /* point to the next page which contains the following bvecs */ \ > + struct page *nextpage; \ > + struct z_erofs_bvec bvec[total]; \ > +} > +__Z_EROFS_BVSET(z_erofs_bvset,); > +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_NR_INLINE_PAGEVECS); > + > /* > * Structure fields follow one of the following exclusion rules. 
> * > @@ -41,22 +56,25 @@ struct z_erofs_pcluster { > /* A: lower limit of decompressed length and if full length or not */ > unsigned int length; > > + /* L: total number of bvecs */ > + unsigned int vcnt; > + > /* I: page offset of start position of decompression */ > unsigned short pageofs_out; > > /* I: page offset of inline compressed data */ > unsigned short pageofs_in; > > - /* L: maximum relative page index in pagevec[] */ > + /* L: maximum relative page index in bvecs */ > unsigned short nr_pages; > > - /* L: total number of pages in pagevec[] */ > - unsigned int vcnt; > - > union { > /* L: inline a certain number of pagevecs for bootstrap */ > erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; > > + /* L: inline a certain number of bvec for bootstrap */ > + struct z_erofs_bvset_inline bvset; > + > /* I: can be used to free the pcluster by RCU. */ > struct rcu_head rcu; > }; From hsiangkao at linux.alibaba.com Fri Jul 15 16:36:05 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 14:36:05 +0800 Subject: [PATCH 04/16] erofs: introduce bufvec to store decompressed buffers In-Reply-To: <20220715142930.00001cdd.zbestahu@gmail.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-5-hsiangkao@linux.alibaba.com> <20220715142930.00001cdd.zbestahu@gmail.com> Message-ID: Hi Yue, On Fri, Jul 15, 2022 at 02:29:30PM +0800, Yue Hu wrote: > On Thu, 14 Jul 2022 21:20:39 +0800 > Gao Xiang wrote: > > > For each pcluster, the total compressed buffers are determined in > > advance, yet the number of decompressed buffers actually vary. Too > > many decompressed pages can be recorded if one pcluster is highly > > compressed or its pcluster size is large. That takes extra memory > > footprints compared to uncompressed filesystems, especially a lot of > > I/O in flight on low-ended devices. 
> > > > Therefore, similar to inplace I/O, pagevec was introduced to reuse > > page cache to store these pointers in the time-sharing way since > > these pages are actually unused before decompressing. > > > > In order to make it more flexable, a cleaner bufvec is used to > > replace the old pagevec stuffs so that > > > > - Decompressed offsets can be stored inline, thus it can be used > > for the upcoming feature like compressed data deduplication; > > > > - Towards supporting large folios for compressed inodes since > > our final goal is to completely avoid page->private but use > > folio->private only for all page cache pages. > > > > Signed-off-by: Gao Xiang > > --- > > fs/erofs/zdata.c | 177 +++++++++++++++++++++++++++++++++++------------ > > fs/erofs/zdata.h | 26 +++++-- > > 2 files changed, 153 insertions(+), 50 deletions(-) > > > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > > index c183cd0bc42b..f52c54058f31 100644 > > --- a/fs/erofs/zdata.c > > +++ b/fs/erofs/zdata.c > > @@ -2,6 +2,7 @@ > > /* > > * Copyright (C) 2018 HUAWEI, Inc. 
> > * https://www.huawei.com/ > > + * Copyright (C) 2022 Alibaba Cloud > > */ > > #include "zdata.h" > > #include "compress.h" > > @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { > > _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) > > }; > > > > +struct z_erofs_bvec_iter { > > + struct page *bvpage; > > + struct z_erofs_bvset *bvset; > > + unsigned int nr, cur; > > +}; > > + > > +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) > > +{ > > + if (iter->bvpage) > > + kunmap_local(iter->bvset); > > + return iter->bvpage; > > +} > > + > > +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) > > +{ > > + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; > > + /* have to access nextpage in advance, otherwise it will be unmapped */ > > + struct page *nextpage = iter->bvset->nextpage; > > + struct page *oldpage; > > + > > + DBG_BUGON(!nextpage); > > + oldpage = z_erofs_bvec_iter_end(iter); > > + iter->bvpage = nextpage; > > + iter->bvset = kmap_local_page(nextpage); > > + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); > > + iter->cur = 0; > > + return oldpage; > > +} > > + > > +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, > > + struct z_erofs_bvset_inline *bvset, > > + unsigned int bootstrap_nr, > > + unsigned int cur) > > +{ > > + *iter = (struct z_erofs_bvec_iter) { > > + .nr = bootstrap_nr, > > + .bvset = (struct z_erofs_bvset *)bvset, > > + }; > > + > > + while (cur > iter->nr) { > > + cur -= iter->nr; > > + z_erofs_bvset_flip(iter); > > + } > > + iter->cur = cur; > > +} > > + > > +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, > > + struct z_erofs_bvec *bvec, > > + struct page **candidate_bvpage) > > +{ > > + if (iter->cur == iter->nr) { > > + if (!*candidate_bvpage) > > + return -EAGAIN; > > + > > + DBG_BUGON(iter->bvset->nextpage); > > + iter->bvset->nextpage = *candidate_bvpage; > > + z_erofs_bvset_flip(iter); > > + > > + 
iter->bvset->nextpage = NULL; > > + *candidate_bvpage = NULL; > > + } > > + iter->bvset->bvec[iter->cur++] = *bvec; > > + return 0; > > +} > > + > > +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, > > + struct z_erofs_bvec *bvec, > > + struct page **old_bvpage) > > +{ > > + if (iter->cur == iter->nr) > > + *old_bvpage = z_erofs_bvset_flip(iter); > > + else > > + *old_bvpage = NULL; > > + *bvec = iter->bvset->bvec[iter->cur++]; > > +} > > + > > Touch a new file to include bufvec related code? call it zbvec.c/h? Thanks for the suggestion. The new implementation is simple enough, so I tend to directly leave it in zdata.c instead of making more churn to add more new files and zpvec.h will be completely removed in the following patch. Thanks, Gao Xiang From zbestahu at gmail.com Fri Jul 15 17:07:37 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 15 Jul 2022 15:07:37 +0800 Subject: [PATCH 05/16] erofs: drop the old pagevec approach In-Reply-To: <20220714132051.46012-6-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-6-hsiangkao@linux.alibaba.com> Message-ID: <20220715150737.00006764.zbestahu@gmail.com> On Thu, 14 Jul 2022 21:20:40 +0800 Gao Xiang wrote: > Remove the old pagevec approach but keep z_erofs_page_type for now. > It will be reworked in the following commits as well. > > Also rename Z_EROFS_NR_INLINE_PAGEVECS as Z_EROFS_INLINE_BVECS with > the new value 2 since it's actually enough to bootstrap. I notice there are 2 comments as below which still use pagevec, should we update it as well? [1] * pagevec) since it can be directly decoded without I/O submission. [2] * for inplace I/O or pagevec (should be processed in strict order.) BTW, utils.c includes needles , we can remove it along with the patchset or remove it later. 
> > Signed-off-by: Gao Xiang > --- > fs/erofs/zdata.c | 17 +++-- > fs/erofs/zdata.h | 9 +-- > fs/erofs/zpvec.h | 159 ----------------------------------------------- > 3 files changed, 16 insertions(+), 169 deletions(-) > delete mode 100644 fs/erofs/zpvec.h > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index f52c54058f31..e96704db106e 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -27,6 +27,17 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { > _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) > }; > > +/* page type in pagevec for decompress subsystem */ > +enum z_erofs_page_type { > + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ > + Z_EROFS_PAGE_TYPE_EXCLUSIVE, > + > + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, > + > + Z_EROFS_VLE_PAGE_TYPE_HEAD, > + Z_EROFS_VLE_PAGE_TYPE_MAX > +}; > + > struct z_erofs_bvec_iter { > struct page *bvpage; > struct z_erofs_bvset *bvset; > @@ -273,7 +284,6 @@ struct z_erofs_decompress_frontend { > struct inode *const inode; > struct erofs_map_blocks map; > struct z_erofs_bvec_iter biter; > - struct z_erofs_pagevec_ctor vector; > > struct page *candidate_bvpage; > struct z_erofs_pcluster *pcl, *tailpcl; > @@ -636,7 +646,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) > return ret; > } > z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, > - Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->vcnt); > + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); > /* since file-backed online pages are traversed in reverse order */ > fe->icpage_ptr = fe->pcl->compressed_pages + > z_erofs_pclusterpages(fe->pcl); > @@ -871,8 +881,7 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, > struct page *old_bvpage; > int i, err = 0; > > - z_erofs_bvec_iter_begin(&biter, &pcl->bvset, > - Z_EROFS_NR_INLINE_PAGEVECS, 0); > + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); > for (i = 0; i < pcl->vcnt; ++i) { > struct z_erofs_bvec bvec; > unsigned int pagenr; > diff --git a/fs/erofs/zdata.h 
b/fs/erofs/zdata.h > index d03e333e4fde..a755c5a44d87 100644 > --- a/fs/erofs/zdata.h > +++ b/fs/erofs/zdata.h > @@ -7,10 +7,10 @@ > #define __EROFS_FS_ZDATA_H > > #include "internal.h" > -#include "zpvec.h" > +#include "tagptr.h" > > #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) > -#define Z_EROFS_NR_INLINE_PAGEVECS 3 > +#define Z_EROFS_INLINE_BVECS 2 > > #define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 > #define Z_EROFS_PCLUSTER_LENGTH_BIT 1 > @@ -34,7 +34,7 @@ struct name { \ > struct z_erofs_bvec bvec[total]; \ > }; > __Z_EROFS_BVSET(z_erofs_bvset,) > -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_NR_INLINE_PAGEVECS) > +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS) > > /* > * Structure fields follow one of the following exclusion rules. > @@ -69,9 +69,6 @@ struct z_erofs_pcluster { > unsigned short nr_pages; > > union { > - /* L: inline a certain number of pagevecs for bootstrap */ > - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; > - > /* L: inline a certain number of bvec for bootstrap */ > struct z_erofs_bvset_inline bvset; > > diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h > deleted file mode 100644 > index b05464f4a808..000000000000 > --- a/fs/erofs/zpvec.h > +++ /dev/null > @@ -1,159 +0,0 @@ > -/* SPDX-License-Identifier: GPL-2.0-only */ > -/* > - * Copyright (C) 2018 HUAWEI, Inc. 
> - * https://www.huawei.com/ > - */ > -#ifndef __EROFS_FS_ZPVEC_H > -#define __EROFS_FS_ZPVEC_H > - > -#include "tagptr.h" > - > -/* page type in pagevec for decompress subsystem */ > -enum z_erofs_page_type { > - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ > - Z_EROFS_PAGE_TYPE_EXCLUSIVE, > - > - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, > - > - Z_EROFS_VLE_PAGE_TYPE_HEAD, > - Z_EROFS_VLE_PAGE_TYPE_MAX > -}; > - > -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") > - __bad_page_type_exclusive(void); > - > -/* pagevec tagged pointer */ > -typedef tagptr2_t erofs_vtptr_t; > - > -/* pagevec collector */ > -struct z_erofs_pagevec_ctor { > - struct page *curr, *next; > - erofs_vtptr_t *pages; > - > - unsigned int nr, index; > -}; > - > -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, > - bool atomic) > -{ > - if (!ctor->curr) > - return; > - > - if (atomic) > - kunmap_atomic(ctor->pages); > - else > - kunmap(ctor->curr); > -} > - > -static inline struct page * > -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, > - unsigned int nr) > -{ > - unsigned int index; > - > - /* keep away from occupied pages */ > - if (ctor->next) > - return ctor->next; > - > - for (index = 0; index < nr; ++index) { > - const erofs_vtptr_t t = ctor->pages[index]; > - const unsigned int tags = tagptr_unfold_tags(t); > - > - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) > - return tagptr_unfold_ptr(t); > - } > - DBG_BUGON(nr >= ctor->nr); > - return NULL; > -} > - > -static inline void > -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, > - bool atomic) > -{ > - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); > - > - z_erofs_pagevec_ctor_exit(ctor, atomic); > - > - ctor->curr = next; > - ctor->next = NULL; > - ctor->pages = atomic ? 
> - kmap_atomic(ctor->curr) : kmap(ctor->curr); > - > - ctor->nr = PAGE_SIZE / sizeof(struct page *); > - ctor->index = 0; > -} > - > -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, > - unsigned int nr, > - erofs_vtptr_t *pages, > - unsigned int i) > -{ > - ctor->nr = nr; > - ctor->curr = ctor->next = NULL; > - ctor->pages = pages; > - > - if (i >= nr) { > - i -= nr; > - z_erofs_pagevec_ctor_pagedown(ctor, false); > - while (i > ctor->nr) { > - i -= ctor->nr; > - z_erofs_pagevec_ctor_pagedown(ctor, false); > - } > - } > - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); > - ctor->index = i; > -} > - > -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, > - struct page *page, > - enum z_erofs_page_type type, > - bool pvec_safereuse) > -{ > - if (!ctor->next) { > - /* some pages cannot be reused as pvec safely without I/O */ > - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) > - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; > - > - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && > - ctor->index + 1 == ctor->nr) > - return false; > - } > - > - if (ctor->index >= ctor->nr) > - z_erofs_pagevec_ctor_pagedown(ctor, false); > - > - /* exclusive page type must be 0 */ > - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) > - __bad_page_type_exclusive(); > - > - /* should remind that collector->next never equal to 1, 2 */ > - if (type == (uintptr_t)ctor->next) { > - ctor->next = page; > - } > - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); > - return true; > -} > - > -static inline struct page * > -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, > - enum z_erofs_page_type *type) > -{ > - erofs_vtptr_t t; > - > - if (ctor->index >= ctor->nr) { > - DBG_BUGON(!ctor->next); > - z_erofs_pagevec_ctor_pagedown(ctor, true); > - } > - > - t = ctor->pages[ctor->index]; > - > - *type = tagptr_unfold_tags(t); > - > - /* should remind that collector->next never equal to 1, 2 */ > - if 
(*type == (uintptr_t)ctor->next) > - ctor->next = tagptr_unfold_ptr(t); > - > - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); > - return tagptr_unfold_ptr(t); > -} > -#endif From hsiangkao at linux.alibaba.com Fri Jul 15 17:19:46 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 15:19:46 +0800 Subject: [PATCH 05/16] erofs: drop the old pagevec approach In-Reply-To: <20220715150737.00006764.zbestahu@gmail.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-6-hsiangkao@linux.alibaba.com> <20220715150737.00006764.zbestahu@gmail.com> Message-ID: On Fri, Jul 15, 2022 at 03:07:37PM +0800, Yue Hu wrote: > On Thu, 14 Jul 2022 21:20:40 +0800 > Gao Xiang wrote: > > > Remove the old pagevec approach but keep z_erofs_page_type for now. > > It will be reworked in the following commits as well. > > > > Also rename Z_EROFS_NR_INLINE_PAGEVECS as Z_EROFS_INLINE_BVECS with > > the new value 2 since it's actually enough to bootstrap. > > I notice there are 2 comments as below which still use pagevec, should we > update it as well? > > [1] > * pagevec) since it can be directly decoded without I/O submission. > [2] > * for inplace I/O or pagevec (should be processed in strict order.) Yeah, thanks for reminder... I will update it in this patch in the next version. > > BTW, utils.c includes needles , we can remove it along with the patchset > or remove it later. That is a completely different stuff, would you mind submitting a patch to remove it if needed? 
Thanks, Gao Xiang From zbestahu at gmail.com Fri Jul 15 17:45:31 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 15 Jul 2022 15:45:31 +0800 Subject: [PATCH 05/16] erofs: drop the old pagevec approach In-Reply-To: References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-6-hsiangkao@linux.alibaba.com> <20220715150737.00006764.zbestahu@gmail.com> Message-ID: <20220715154531.00005153.zbestahu@gmail.com> On Fri, 15 Jul 2022 15:19:46 +0800 Gao Xiang wrote: > On Fri, Jul 15, 2022 at 03:07:37PM +0800, Yue Hu wrote: > > On Thu, 14 Jul 2022 21:20:40 +0800 > > Gao Xiang wrote: > > > > > Remove the old pagevec approach but keep z_erofs_page_type for now. > > > It will be reworked in the following commits as well. > > > > > > Also rename Z_EROFS_NR_INLINE_PAGEVECS as Z_EROFS_INLINE_BVECS with > > > the new value 2 since it's actually enough to bootstrap. > > > > I notice there are 2 comments as below which still use pagevec, should we > > update it as well? > > > > [1] > > * pagevec) since it can be directly decoded without I/O submission. > > [2] > > * for inplace I/O or pagevec (should be processed in strict order.) > > Yeah, thanks for reminder... I will update it in this patch in the next > version. > > > > > BTW, utils.c includes needles , we can remove it along with the patchset > > or remove it later. > > That is a completely different stuff, would you mind submitting a patch > to remove it if needed? ok, may submit later. 
> > Thanks, > Gao Xiang From zbestahu at gmail.com Fri Jul 15 17:53:23 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 15 Jul 2022 15:53:23 +0800 Subject: [PATCH 07/16] erofs: switch compressed_pages[] to bufvec In-Reply-To: <20220714132051.46012-8-hsiangkao@linux.alibaba.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-8-hsiangkao@linux.alibaba.com> Message-ID: <20220715155323.00005df4.zbestahu@gmail.com> On Thu, 14 Jul 2022 21:20:42 +0800 Gao Xiang wrote: > Convert compressed_pages[] to bufvec in order to avoid using > page->private to keep onlinepage_index (decompressed offset) > for inplace I/O pages. > > In the future, we only rely on folio->private to keep a countdown > to unlock folios and set folio_uptodate. > > Signed-off-by: Gao Xiang > --- > fs/erofs/zdata.c | 113 +++++++++++++++++++++++------------------------ > fs/erofs/zdata.h | 4 +- > 2 files changed, 57 insertions(+), 60 deletions(-) > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index 757d352bc2c7..f2e3f07baad7 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -134,7 +134,7 @@ static int z_erofs_create_pcluster_pool(void) > > for (pcs = pcluster_pool; > pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { > - size = struct_size(a, compressed_pages, pcs->maxpages); > + size = struct_size(a, compressed_bvecs, pcs->maxpages); > > sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); > pcs->slab = kmem_cache_create(pcs->name, size, 0, > @@ -287,16 +287,16 @@ struct z_erofs_decompress_frontend { > > struct page *candidate_bvpage; > struct z_erofs_pcluster *pcl, *tailpcl; > - /* a pointer used to pick up inplace I/O pages */ > - struct page **icpage_ptr; > z_erofs_next_pcluster_t owned_head; > - > enum z_erofs_collectmode mode; > > bool readahead; > /* used for applying cache strategy on the fly */ > bool backmost; > erofs_off_t headoffset; > + > + /* a pointer used to pick up inplace I/O pages */ > + unsigned int icur; not a 
pointer? > }; > > #define DECOMPRESS_FRONTEND_INIT(__i) { \ > @@ -319,24 +319,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, > */ > gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | > __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; > - struct page **pages; > - pgoff_t index; > + unsigned int i; > > if (fe->mode < COLLECT_PRIMARY_FOLLOWED) > return; > > - pages = pcl->compressed_pages; > - index = pcl->obj.index; > - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { > + for (i = 0; i < pcl->pclusterpages; ++i) { > struct page *page; > compressed_page_t t; > struct page *newpage = NULL; > > /* the compressed page was loaded before */ > - if (READ_ONCE(*pages)) > + if (READ_ONCE(pcl->compressed_bvecs[i].page)) > continue; > > - page = find_get_page(mc, index); > + page = find_get_page(mc, pcl->obj.index + i); > > if (page) { > t = tag_compressed_page_justfound(page); > @@ -357,7 +354,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, > } > } > > - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) > + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, > + tagptr_cast_ptr(t))) > continue; > > if (page) > @@ -388,7 +386,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, > * therefore no need to worry about available decompression users. 
> */ > for (i = 0; i < pcl->pclusterpages; ++i) { > - struct page *page = pcl->compressed_pages[i]; > + struct page *page = pcl->compressed_bvecs[i].page; > > if (!page) > continue; > @@ -401,7 +399,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, > continue; > > /* barrier is implied in the following 'unlock_page' */ > - WRITE_ONCE(pcl->compressed_pages[i], NULL); > + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); > detach_page_private(page); > unlock_page(page); > } > @@ -411,36 +409,39 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, > int erofs_try_to_free_cached_page(struct page *page) > { > struct z_erofs_pcluster *const pcl = (void *)page_private(page); > - int ret = 0; /* 0 - busy */ > + int ret, i; > > - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { > - unsigned int i; > + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) > + return 0; > > - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); > - for (i = 0; i < pcl->pclusterpages; ++i) { > - if (pcl->compressed_pages[i] == page) { > - WRITE_ONCE(pcl->compressed_pages[i], NULL); > - ret = 1; > - break; > - } > + ret = 0; > + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); > + for (i = 0; i < pcl->pclusterpages; ++i) { > + if (pcl->compressed_bvecs[i].page == page) { > + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); > + ret = 1; > + break; > } > - erofs_workgroup_unfreeze(&pcl->obj, 1); > - > - if (ret) > - detach_page_private(page); > } > + erofs_workgroup_unfreeze(&pcl->obj, 1); > + if (ret) > + detach_page_private(page); > return ret; > } > > /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ > static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, > - struct page *page) > + struct z_erofs_bvec *bvec) > { > struct z_erofs_pcluster *const pcl = fe->pcl; > > - while (fe->icpage_ptr > pcl->compressed_pages) > - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) > + while (fe->icur > 0) { > + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, > + NULL, 
bvec->page)) { > + pcl->compressed_bvecs[fe->icur] = *bvec; > return true; > + } > + } > return false; > } > > @@ -454,7 +455,7 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, > if (fe->mode >= COLLECT_PRIMARY && > type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { > /* give priority for inplaceio to use file pages first */ > - if (z_erofs_try_inplace_io(fe, bvec->page)) > + if (z_erofs_try_inplace_io(fe, bvec)) > return 0; > /* otherwise, check if it can be used as a bvpage */ > if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && > @@ -648,8 +649,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) > z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, > Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); > /* since file-backed online pages are traversed in reverse order */ > - fe->icpage_ptr = fe->pcl->compressed_pages + > - z_erofs_pclusterpages(fe->pcl); > + fe->icur = z_erofs_pclusterpages(fe->pcl); > return 0; > } > > @@ -769,7 +769,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, > goto err_out; > } > get_page(fe->map.buf.page); > - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); > + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, > + fe->map.buf.page); > fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; > } else { > /* bind cache first when cached decompression is preferred */ > @@ -927,8 +928,9 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, > *overlapped = false; > > for (i = 0; i < pclusterpages; ++i) { > - unsigned int pagenr; > - struct page *page = pcl->compressed_pages[i]; > + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; > + struct page *page = bvec->page; > + unsigned int pgnr; > > /* compressed pages ought to be present before decompressing */ > if (!page) { > @@ -951,21 +953,15 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, > continue; > } > > - /* > - * only if non-head page can be selected > - * for inplace decompression > - */ > 
- pagenr = z_erofs_onlinepage_index(page); > - > - DBG_BUGON(pagenr >= pcl->nr_pages); > - if (pages[pagenr]) { > + pgnr = (bvec->offset + pcl->pageofs_out) >> PAGE_SHIFT; > + DBG_BUGON(pgnr >= pcl->nr_pages); > + if (pages[pgnr]) { > DBG_BUGON(1); > - SetPageError(pages[pagenr]); > - z_erofs_onlinepage_endio(pages[pagenr]); > + SetPageError(pages[pgnr]); > + z_erofs_onlinepage_endio(pages[pgnr]); > err = -EFSCORRUPTED; > } > - pages[pagenr] = page; > - > + pages[pgnr] = page; > *overlapped = true; > } > > @@ -1067,19 +1063,19 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, > out: > /* must handle all compressed pages before actual file pages */ > if (z_erofs_is_inline_pcluster(pcl)) { > - page = pcl->compressed_pages[0]; > - WRITE_ONCE(pcl->compressed_pages[0], NULL); > + page = pcl->compressed_bvecs[0].page; > + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); > put_page(page); > } else { > for (i = 0; i < pclusterpages; ++i) { > - page = pcl->compressed_pages[i]; > + page = pcl->compressed_bvecs[i].page; > > if (erofs_page_is_managed(sbi, page)) > continue; > > /* recycle all individual short-lived pages */ > (void)z_erofs_put_shortlivedpage(pagepool, page); > - WRITE_ONCE(pcl->compressed_pages[i], NULL); > + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); > } > } > kfree(compressed_pages); > @@ -1193,7 +1189,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, > int justfound; > > repeat: > - page = READ_ONCE(pcl->compressed_pages[nr]); > + page = READ_ONCE(pcl->compressed_bvecs[nr].page); > oldpage = page; > > if (!page) > @@ -1209,7 +1205,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, > * otherwise, it will go inplace I/O path instead. 
> */ > if (page->private == Z_EROFS_PREALLOCATED_PAGE) { > - WRITE_ONCE(pcl->compressed_pages[nr], page); > + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); > set_page_private(page, 0); > tocache = true; > goto out_tocache; > @@ -1235,14 +1231,14 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, > > /* the page is still in manage cache */ > if (page->mapping == mc) { > - WRITE_ONCE(pcl->compressed_pages[nr], page); > + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); > > ClearPageError(page); > if (!PagePrivate(page)) { > /* > * impossible to be !PagePrivate(page) for > * the current restriction as well if > - * the page is already in compressed_pages[]. > + * the page is already in compressed_bvecs[]. > */ > DBG_BUGON(!justfound); > > @@ -1271,7 +1267,8 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, > put_page(page); > out_allocpage: > page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); > - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { > + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, > + oldpage, page)) { > erofs_pagepool_add(pagepool, page); > cond_resched(); > goto repeat; > diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h > index a755c5a44d87..a70f1b73e901 100644 > --- a/fs/erofs/zdata.h > +++ b/fs/erofs/zdata.h > @@ -87,8 +87,8 @@ struct z_erofs_pcluster { > /* I: compression algorithm format */ > unsigned char algorithmformat; > > - /* A: compressed pages (can be cached or inplaced pages) */ > - struct page *compressed_pages[]; > + /* A: compressed bvecs (can be cached or inplaced pages) */ > + struct z_erofs_bvec compressed_bvecs[]; > }; > > /* let's avoid the valid 32-bit kernel addresses */ From hsiangkao at linux.alibaba.com Fri Jul 15 17:59:17 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 15:59:17 +0800 Subject: [PATCH 07/16] erofs: switch compressed_pages[] to bufvec In-Reply-To: 
<20220715155323.00005df4.zbestahu@gmail.com> References: <20220714132051.46012-1-hsiangkao@linux.alibaba.com> <20220714132051.46012-8-hsiangkao@linux.alibaba.com> <20220715155323.00005df4.zbestahu@gmail.com> Message-ID: On Fri, Jul 15, 2022 at 03:53:23PM +0800, Yue Hu wrote: > On Thu, 14 Jul 2022 21:20:42 +0800 > Gao Xiang wrote: > > > Convert compressed_pages[] to bufvec in order to avoid using > > page->private to keep onlinepage_index (decompressed offset) > > for inplace I/O pages. > > > > In the future, we only rely on folio->private to keep a countdown > > to unlock folios and set folio_uptodate. > > > > Signed-off-by: Gao Xiang > > --- > > fs/erofs/zdata.c | 113 +++++++++++++++++++++++------------------------ > > fs/erofs/zdata.h | 4 +- > > 2 files changed, 57 insertions(+), 60 deletions(-) > > > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > > index 757d352bc2c7..f2e3f07baad7 100644 > > --- a/fs/erofs/zdata.c > > +++ b/fs/erofs/zdata.c > > @@ -134,7 +134,7 @@ static int z_erofs_create_pcluster_pool(void) > > > > for (pcs = pcluster_pool; > > pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { > > - size = struct_size(a, compressed_pages, pcs->maxpages); > > + size = struct_size(a, compressed_bvecs, pcs->maxpages); > > > > sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); > > pcs->slab = kmem_cache_create(pcs->name, size, 0, > > @@ -287,16 +287,16 @@ struct z_erofs_decompress_frontend { > > > > struct page *candidate_bvpage; > > struct z_erofs_pcluster *pcl, *tailpcl; > > - /* a pointer used to pick up inplace I/O pages */ > > - struct page **icpage_ptr; > > z_erofs_next_pcluster_t owned_head; > > - > > enum z_erofs_collectmode mode; > > > > bool readahead; > > /* used for applying cache strategy on the fly */ > > bool backmost; > > erofs_off_t headoffset; > > + > > + /* a pointer used to pick up inplace I/O pages */ > > + unsigned int icur; > > not a pointer? Here `pointer' means a cursor or called sub-index. 
Thanks, Gao Xiang From jnhuang at linux.alibaba.com Fri Jul 15 19:53:59 2022 From: jnhuang at linux.alibaba.com (Huang Jianan) Date: Fri, 15 Jul 2022 17:53:59 +0800 Subject: [PATCH] erofs-utils: fuse: introduce xattr support Message-ID: <20220715095359.37534-1-jnhuang@linux.alibaba.com> This implements xattr functionalities for erofsfuse. A large amount of code was adapted from Linux kernel. Signed-off-by: Huang Jianan --- fuse/main.c | 32 +++ include/erofs/internal.h | 8 + include/erofs/xattr.h | 21 ++ lib/xattr.c | 508 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 569 insertions(+) diff --git a/fuse/main.c b/fuse/main.c index f4c2476..30a0bed 100644 --- a/fuse/main.c +++ b/fuse/main.c @@ -139,7 +139,39 @@ static int erofsfuse_readlink(const char *path, char *buffer, size_t size) return 0; } +static int erofsfuse_getxattr(const char *path, const char *name, char *value, + size_t size) +{ + int ret; + struct erofs_inode vi; + + erofs_dbg("getxattr(%s): name=%s size=%llu", path, name, size); + + ret = erofs_ilookup(path, &vi); + if (ret) + return ret; + + return erofs_getxattr(&vi, name, value, size); +} + +static int erofsfuse_listxattr(const char *path, char *list, size_t size) +{ + int ret; + struct erofs_inode vi; + int i; + + erofs_dbg("listxattr(%s): size=%llu", path, size); + + ret = erofs_ilookup(path, &vi); + if (ret) + return ret; + + return erofs_listxattr(&vi, list, size); +} + static struct fuse_operations erofs_ops = { + .getxattr = erofsfuse_getxattr, + .listxattr = erofsfuse_listxattr, .readlink = erofsfuse_readlink, .getattr = erofsfuse_getattr, .readdir = erofsfuse_readdir, diff --git a/include/erofs/internal.h b/include/erofs/internal.h index 6a70f11..991635f 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -180,6 +180,9 @@ struct erofs_inode { unsigned int xattr_isize; unsigned int extent_isize; + unsigned int xattr_shared_count; + unsigned int *xattr_shared_xattrs; + erofs_nid_t nid; struct erofs_buffer_head 
*bh; struct erofs_buffer_head *bh_inline, *bh_data; @@ -351,6 +354,11 @@ static inline int erofs_get_occupied_size(const struct erofs_inode *inode, return 0; } +/* data.c */ +int erofs_getxattr(struct erofs_inode *vi, const char *name, char *buffer, + size_t buffer_size); +int erofs_listxattr(struct erofs_inode *vi, char *buffer, size_t buffer_size); + /* zmap.c */ int z_erofs_fill_inode(struct erofs_inode *vi); int z_erofs_map_blocks_iter(struct erofs_inode *vi, diff --git a/include/erofs/xattr.h b/include/erofs/xattr.h index 226e984..a0528c0 100644 --- a/include/erofs/xattr.h +++ b/include/erofs/xattr.h @@ -14,6 +14,27 @@ extern "C" #include "internal.h" +#ifndef ENOATTR +#define ENOATTR ENODATA +#endif + +static inline unsigned int inlinexattr_header_size(struct erofs_inode *vi) +{ + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; +} + +static inline erofs_blk_t xattrblock_addr(unsigned int xattr_id) +{ + return sbi.xattr_blkaddr + + xattr_id * sizeof(__u32) / EROFS_BLKSIZ; +} + +static inline unsigned int xattrblock_offset(unsigned int xattr_id) +{ + return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; +} + #define EROFS_INODE_XATTR_ICOUNT(_size) ({\ u32 __size = le16_to_cpu(_size); \ ((__size) == 0) ? 0 : \ diff --git a/lib/xattr.c b/lib/xattr.c index 71ffe3e..aa35ca1 100644 --- a/lib/xattr.c +++ b/lib/xattr.c @@ -716,3 +716,511 @@ char *erofs_export_xattr_ibody(struct list_head *ixattrs, unsigned int size) DBG_BUGON(p > size); return buf; } + +struct xattr_iter { + char page[EROFS_BLKSIZ]; + + void *kaddr; + + erofs_blk_t blkaddr; + unsigned int ofs; +}; + +static int init_inode_xattrs(struct erofs_inode *vi) +{ + struct xattr_iter it; + unsigned int i; + struct erofs_xattr_ibody_header *ih; + int ret = 0; + + /* the most case is that xattrs of this inode are initialized. 
*/ + if (vi->flags & EROFS_I_EA_INITED) + return ret; + + /* + * bypass all xattr operations if ->xattr_isize is not greater than + * sizeof(struct erofs_xattr_ibody_header), in detail: + * 1) it is not enough to contain erofs_xattr_ibody_header then + * ->xattr_isize should be 0 (it means no xattr); + * 2) it is just to contain erofs_xattr_ibody_header, which is on-disk + * undefined right now (maybe use later with some new sb feature). + */ + if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { + erofs_err("xattr_isize %d of nid %llu is not supported yet", + vi->xattr_isize, vi->nid); + return -EOPNOTSUPP; + } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { + if (vi->xattr_isize) { + erofs_err("bogus xattr ibody @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; /* xattr ondisk layout error */ + } + return -ENOATTR; + } + + it.blkaddr = erofs_blknr(iloc(vi->nid) + vi->inode_isize); + it.ofs = erofs_blkoff(iloc(vi->nid) + vi->inode_isize); + + ret = blk_read(0, it.page, it.blkaddr, 1); + if (ret < 0) + return -EIO; + + it.kaddr = it.page; + ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = malloc(vi->xattr_shared_count * sizeof(uint)); + if (!vi->xattr_shared_xattrs) + return -ENOMEM; + + /* let's skip ibody header */ + it.ofs += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + if (it.ofs >= EROFS_BLKSIZ) { + /* cannot be unaligned */ + DBG_BUGON(it.ofs != EROFS_BLKSIZ); + + ret = blk_read(0, it.page, ++it.blkaddr, 1); + if (ret < 0) { + free(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + return -EIO; + } + + it.kaddr = it.page; + it.ofs = 0; + } + vi->xattr_shared_xattrs[i] = + le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); + it.ofs += sizeof(__le32); + } + + vi->flags |= EROFS_I_EA_INITED; + + return ret; +} + +/* + * the general idea for these return values is + * if 0 is 
returned, go on processing the current xattr; + * 1 (> 0) is returned, skip this round to process the next xattr; + * -err (< 0) is returned, an error (maybe ENOXATTR) occurred + * and need to be handled + */ +struct xattr_iter_handlers { + int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); + int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); + int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); + void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); +}; + +static inline int xattr_iter_fixup(struct xattr_iter *it) +{ + int ret; + + if (it->ofs < EROFS_BLKSIZ) + return 0; + + it->blkaddr += erofs_blknr(it->ofs); + + ret = blk_read(0, it->page, it->blkaddr, 1); + if (ret < 0) + return -EIO; + + it->kaddr = it->page; + it->ofs = erofs_blkoff(it->ofs); + return 0; +} + +static int inline_xattr_iter_pre(struct xattr_iter *it, + struct erofs_inode *vi) +{ + unsigned int xattr_header_sz, inline_xattr_ofs; + int ret; + + xattr_header_sz = inlinexattr_header_size(vi); + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + inline_xattr_ofs = vi->inode_isize + xattr_header_sz; + + it->blkaddr = erofs_blknr(iloc(vi->nid) + inline_xattr_ofs); + it->ofs = erofs_blkoff(iloc(vi->nid) + inline_xattr_ofs); + + ret = blk_read(0, it->page, it->blkaddr, 1); + if (ret < 0) + return -EIO; + + it->kaddr = it->page; + return vi->xattr_isize - xattr_header_sz; +} + +/* + * Regardless of success or failure, `xattr_foreach' will end up with + * `ofs' pointing to the next xattr item rather than an arbitrary position. + */ +static int xattr_foreach(struct xattr_iter *it, + const struct xattr_iter_handlers *op, + unsigned int *tlimit) +{ + struct erofs_xattr_entry entry; + unsigned int value_sz, processed, slice; + int err; + + /* 0. 
fixup blkaddr, ofs, ipage */ + err = xattr_iter_fixup(it); + if (err) + return err; + + /* + * 1. read xattr entry to the memory, + * since we do EROFS_XATTR_ALIGN + * therefore entry should be in the page + */ + entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); + if (tlimit) { + unsigned int entry_sz = erofs_xattr_entry_size(&entry); + + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (*tlimit < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + *tlimit -= entry_sz; + } + + it->ofs += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* handle entry */ + err = op->entry(it, &entry); + if (err) { + it->ofs += entry.e_name_len + value_sz; + goto out; + } + + /* 2. handle xattr name (ofs will finally be at the end of name) */ + processed = 0; + + while (processed < entry.e_name_len) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); + + /* handle name */ + err = op->name(it, processed, it->kaddr + it->ofs, slice); + if (err) { + it->ofs += entry.e_name_len - processed + value_sz; + goto out; + } + + it->ofs += slice; + processed += slice; + } + + /* 3. handle xattr value */ + processed = 0; + + if (op->alloc_buffer) { + err = op->alloc_buffer(it, value_sz); + if (err) { + it->ofs += value_sz; + goto out; + } + } + + while (processed < value_sz) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + value_sz - processed); + op->value(it, processed, it->kaddr + it->ofs, slice); + it->ofs += slice; + processed += slice; + } + +out: + /* xattrs should be 4-byte aligned (on-disk constraint) */ + it->ofs = EROFS_XATTR_ALIGN(it->ofs); + return err < 0 ? 
err : 0; +} + +struct getxattr_iter { + struct xattr_iter it; + + int buffer_size, index; + char *buffer; + const char *name; + size_t len; +}; + +static int xattr_entrymatch(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return (it->index != entry->e_name_index || + it->len != entry->e_name_len) ? -ENOATTR : 0; +} + +static int xattr_namematch(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + + return memcmp(buf, it->name + processed, len) ? -ENOATTR : 0; +} + +static int xattr_checkbuffer(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + int err = it->buffer_size < value_sz ? -ERANGE : 0; + + it->buffer_size = value_sz; + return !it->buffer ? 1 : err; +} + +static void xattr_copyvalue(struct xattr_iter *_it, + unsigned int processed, + char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + memcpy(it->buffer + processed, buf, len); +} + +static const struct xattr_iter_handlers find_xattr_handlers = { + .entry = xattr_entrymatch, + .name = xattr_namematch, + .alloc_buffer = xattr_checkbuffer, + .value = xattr_copyvalue +}; + +static int inline_getxattr(struct erofs_inode *vi, struct getxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_pre(&it->it, vi); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); + if (ret != -ENOATTR) + break; + } + + return ret ? 
ret : it->buffer_size; +} + +static int shared_getxattr(struct erofs_inode *vi, struct getxattr_iter *it) +{ + unsigned int i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(vi->xattr_shared_xattrs[i]); + + if (!i || blkaddr != it->it.blkaddr) { + ret = blk_read(0, it->it.page, blkaddr, 1); + if (ret < 0) + return -EIO; + + it->it.kaddr = it->it.page; + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); + if (ret != -ENOATTR) + break; + } + + return ret ? ret : it->buffer_size; +} + +int erofs_getxattr(struct erofs_inode *vi, const char *name, char *buffer, + size_t buffer_size) +{ + int ret; + u8 prefix; + u16 prefixlen; + struct getxattr_iter it; + + if (!name) + return -EINVAL; + + ret = init_inode_xattrs(vi); + if (ret) + return ret; + + if (!match_prefix(name, &prefix, &prefixlen)) + return -ENODATA; + + it.index = prefix; + it.name = name + prefixlen; + it.len = strlen(it.name); + if (it.len > EROFS_NAME_LEN) + return -ERANGE; + + it.buffer = buffer; + it.buffer_size = buffer_size; + + ret = inline_getxattr(vi, &it); + if (ret == -ENOATTR) + ret = shared_getxattr(vi, &it); + return ret; +} + +struct listxattr_iter { + struct xattr_iter it; + + char *buffer; + int buffer_size, buffer_ofs; +}; + +static int xattr_entrylist(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + unsigned int prefix_len; + const char *prefix; + + prefix = xattr_types[entry->e_name_index].prefix; + prefix_len = xattr_types[entry->e_name_index].prefix_len; + + if (!it->buffer) { + it->buffer_ofs += prefix_len + entry->e_name_len + 1; + return 1; + } + + if (it->buffer_ofs + prefix_len + + entry->e_name_len + 1 > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + it->buffer_ofs += 
prefix_len; + return 0; +} + +static int xattr_namelist(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + memcpy(it->buffer + it->buffer_ofs, buf, len); + it->buffer_ofs += len; + return 0; +} + +static int xattr_skipvalue(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + it->buffer[it->buffer_ofs++] = '\0'; + return 1; +} + +static const struct xattr_iter_handlers list_xattr_handlers = { + .entry = xattr_entrylist, + .name = xattr_namelist, + .alloc_buffer = xattr_skipvalue, + .value = NULL +}; + +static int inline_listxattr(struct erofs_inode *vi, struct listxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_pre(&it->it, vi); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); + if (ret) + break; + } + + return ret ? ret : it->buffer_ofs; +} + +static int shared_listxattr(struct erofs_inode *vi, struct listxattr_iter *it) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(vi->xattr_shared_xattrs[i]); + if (!i || blkaddr != it->it.blkaddr) { + ret = blk_read(0, it->it.page, blkaddr, 1); + if (ret < 0) + return -EIO; + + it->it.kaddr = it->it.page; + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); + if (ret) + break; + } + + return ret ? 
ret : it->buffer_ofs; +} + +int erofs_listxattr(struct erofs_inode *vi, char *buffer, size_t buffer_size) +{ + int ret; + struct listxattr_iter it; + + ret = init_inode_xattrs(vi); + if (ret == -ENOATTR) + return 0; + if (ret) + return ret; + + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = inline_listxattr(vi, &it); + if (ret < 0 && ret != -ENOATTR) + return ret; + return shared_listxattr(vi, &it); +} -- 2.34.0 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:47 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:47 +0800 Subject: [PATCH v2 00/16] erofs: prepare for folios, deduplication and kill PG_error Message-ID: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Hi folks, I've been doing this for almost 2 months, the main point of this is to support large folios and rolling hash deduplication for compressed data. This patchset is as a start of this work targeting for the next 5.20, it introduces a flexable range representation for (de)compressed buffers instead of too relying on page(s) directly themselves, so large folios can laterly base on this work. Also, this patchset gets rid of all PG_error flags in the decompression code. It's a cleanup as a result as well. In addition, this patchset kicks off rolling hash deduplication for compressed data by introducing fully-referenced multi-reference pclusters first instead of reporting fs corruption if one pcluster is introduced by several differnt extents. The full implementation is expected to be finished in the merge window after the next. One of my colleagues is actively working on the userspace part of this feature. 
However, it's still easy to verify fully-referenced multi-reference pcluster by constructing some image by hand (see attachment): Dataset: 300M seq-read (data-deduplicated, read_ahead_kb 8192): 1095MiB/s seq-read (data-deduplicated, read_ahead_kb 4096): 771MiB/s seq-read (data-deduplicated, read_ahead_kb 512): 577MiB/s seq-read (vanilla, read_ahead_kb 8192): 364MiB/s Finally, this patchset survives ro-fsstress on my side. Thanks, Gao Xiang Changes since v1: - rename left pagevec words to bvpage (Yue Hu); Gao Xiang (16): erofs: get rid of unneeded `inode', `map' and `sb' erofs: clean up z_erofs_collector_begin() erofs: introduce `z_erofs_parse_out_bvecs()' erofs: introduce bufvec to store decompressed buffers erofs: drop the old pagevec approach erofs: introduce `z_erofs_parse_in_bvecs' erofs: switch compressed_pages[] to bufvec erofs: rework online page handling erofs: get rid of `enum z_erofs_page_type' erofs: clean up `enum z_erofs_collectmode' erofs: get rid of `z_pagemap_global' erofs: introduce struct z_erofs_decompress_backend erofs: try to leave (de)compressed_pages on stack if possible erofs: introduce z_erofs_do_decompressed_bvec() erofs: record the longest decompressed size in this round erofs: introduce multi-reference pclusters (fully-referenced) fs/erofs/compress.h | 2 +- fs/erofs/decompressor.c | 2 +- fs/erofs/zdata.c | 785 +++++++++++++++++++++++----------------- fs/erofs/zdata.h | 119 +++--- fs/erofs/zpvec.h | 159 -------- 5 files changed, 496 insertions(+), 571 deletions(-) delete mode 100644 fs/erofs/zpvec.h -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:52 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:52 +0800 Subject: [PATCH v2 05/16] erofs: drop the old pagevec approach In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-6-hsiangkao@linux.alibaba.com> Remove the old pagevec 
approach but keep z_erofs_page_type for now. It will be reworked in the following commits as well. Also rename Z_EROFS_NR_INLINE_PAGEVECS as Z_EROFS_INLINE_BVECS with the new value 2 since it's actually enough to bootstrap. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 21 +++++-- fs/erofs/zdata.h | 9 +-- fs/erofs/zpvec.h | 159 ----------------------------------------------- 3 files changed, 18 insertions(+), 171 deletions(-) delete mode 100644 fs/erofs/zpvec.h diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f52c54058f31..6295f3312f6f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -27,6 +27,17 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +/* (obsoleted) page type for online pages */ +enum z_erofs_page_type { + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ + Z_EROFS_PAGE_TYPE_EXCLUSIVE, + + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, + + Z_EROFS_VLE_PAGE_TYPE_HEAD, + Z_EROFS_VLE_PAGE_TYPE_MAX +}; + struct z_erofs_bvec_iter { struct page *bvpage; struct z_erofs_bvset *bvset; @@ -248,7 +259,7 @@ enum z_erofs_collectmode { * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or - * pagevec) since it can be directly decoded without I/O submission. + * bvpage) since it can be directly decoded without I/O submission. 
*/ COLLECT_PRIMARY_FOLLOWED_NOINPLACE, /* @@ -273,7 +284,6 @@ struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; - struct z_erofs_pagevec_ctor vector; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; @@ -636,7 +646,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) return ret; } z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, - Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->vcnt); + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -776,7 +786,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or pagevec (should be processed in strict order.) + * for inplace I/O or bvpage (should be processed in a strict order.) 
*/ tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); @@ -871,8 +881,7 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, struct page *old_bvpage; int i, err = 0; - z_erofs_bvec_iter_begin(&biter, &pcl->bvset, - Z_EROFS_NR_INLINE_PAGEVECS, 0); + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; unsigned int pagenr; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index f8daadb19e37..468f6308fc90 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -7,10 +7,10 @@ #define __EROFS_FS_ZDATA_H #include "internal.h" -#include "zpvec.h" +#include "tagptr.h" #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_NR_INLINE_PAGEVECS 3 +#define Z_EROFS_INLINE_BVECS 2 #define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 #define Z_EROFS_PCLUSTER_LENGTH_BIT 1 @@ -34,7 +34,7 @@ struct name { \ struct z_erofs_bvec bvec[total]; \ } __Z_EROFS_BVSET(z_erofs_bvset,); -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_NR_INLINE_PAGEVECS); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); /* * Structure fields follow one of the following exclusion rules. @@ -69,9 +69,6 @@ struct z_erofs_pcluster { unsigned short nr_pages; union { - /* L: inline a certain number of pagevecs for bootstrap */ - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; - /* L: inline a certain number of bvec for bootstrap */ struct z_erofs_bvset_inline bvset; diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h deleted file mode 100644 index b05464f4a808..000000000000 --- a/fs/erofs/zpvec.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. 
- * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZPVEC_H -#define __EROFS_FS_ZPVEC_H - -#include "tagptr.h" - -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") - __bad_page_type_exclusive(void); - -/* pagevec tagged pointer */ -typedef tagptr2_t erofs_vtptr_t; - -/* pagevec collector */ -struct z_erofs_pagevec_ctor { - struct page *curr, *next; - erofs_vtptr_t *pages; - - unsigned int nr, index; -}; - -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - if (!ctor->curr) - return; - - if (atomic) - kunmap_atomic(ctor->pages); - else - kunmap(ctor->curr); -} - -static inline struct page * -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr) -{ - unsigned int index; - - /* keep away from occupied pages */ - if (ctor->next) - return ctor->next; - - for (index = 0; index < nr; ++index) { - const erofs_vtptr_t t = ctor->pages[index]; - const unsigned int tags = tagptr_unfold_tags(t); - - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) - return tagptr_unfold_ptr(t); - } - DBG_BUGON(nr >= ctor->nr); - return NULL; -} - -static inline void -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); - - z_erofs_pagevec_ctor_exit(ctor, atomic); - - ctor->curr = next; - ctor->next = NULL; - ctor->pages = atomic ? 
- kmap_atomic(ctor->curr) : kmap(ctor->curr); - - ctor->nr = PAGE_SIZE / sizeof(struct page *); - ctor->index = 0; -} - -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr, - erofs_vtptr_t *pages, - unsigned int i) -{ - ctor->nr = nr; - ctor->curr = ctor->next = NULL; - ctor->pages = pages; - - if (i >= nr) { - i -= nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - while (i > ctor->nr) { - i -= ctor->nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - } - } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); - ctor->index = i; -} - -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool pvec_safereuse) -{ - if (!ctor->next) { - /* some pages cannot be reused as pvec safely without I/O */ - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; - - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && - ctor->index + 1 == ctor->nr) - return false; - } - - if (ctor->index >= ctor->nr) - z_erofs_pagevec_ctor_pagedown(ctor, false); - - /* exclusive page type must be 0 */ - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) - __bad_page_type_exclusive(); - - /* should remind that collector->next never equal to 1, 2 */ - if (type == (uintptr_t)ctor->next) { - ctor->next = page; - } - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); - return true; -} - -static inline struct page * -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) -{ - erofs_vtptr_t t; - - if (ctor->index >= ctor->nr) { - DBG_BUGON(!ctor->next); - z_erofs_pagevec_ctor_pagedown(ctor, true); - } - - t = ctor->pages[ctor->index]; - - *type = tagptr_unfold_tags(t); - - /* should remind that collector->next never equal to 1, 2 */ - if (*type == (uintptr_t)ctor->next) - ctor->next = tagptr_unfold_ptr(t); - - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); - 
return tagptr_unfold_ptr(t); -} -#endif -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:48 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:48 +0800 Subject: [PATCH v2 01/16] erofs: get rid of unneeded `inode', `map' and `sb' In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-2-hsiangkao@linux.alibaba.com> Since commit 5c6dcc57e2e5 ("erofs: get rid of `struct z_erofs_collector'"), these arguments can be dropped as well. No logic changes. Reviewed-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 724bb57075f6..1b6816dd235f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -404,10 +404,9 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) f->mode = COLLECT_PRIMARY; } -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; struct z_erofs_pcluster *pcl = fe->pcl; unsigned int length; @@ -449,10 +448,9 @@ static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, return 0; } -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; @@ -494,7 +492,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; - grp = 
erofs_insert_workgroup(inode->i_sb, &pcl->obj); + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; @@ -520,10 +518,9 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; struct erofs_workgroup *grp; int ret; @@ -541,19 +538,19 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, goto tailpacking; } - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(fe->inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { tailpacking: - ret = z_erofs_register_pcluster(fe, inode, map); + ret = z_erofs_register_pcluster(fe); if (!ret) goto out; if (ret != -EEXIST) return ret; } - ret = z_erofs_lookup_pcluster(fe, inode, map); + ret = z_erofs_lookup_pcluster(fe); if (ret) { erofs_workgroup_put(&fe->pcl->obj); return ret; @@ -663,7 +660,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(fe, inode, map); + err = z_erofs_collector_begin(fe); if (err) goto err_out; @@ -1259,13 +1256,13 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) bio_put(bio); } -static void z_erofs_submit_queue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; 
struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; @@ -1317,7 +1314,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi)); + mc); if (!page) continue; @@ -1369,15 +1366,14 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); } -static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1475,7 +1471,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) @@ -1524,7 +1520,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:49 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:49 +0800 Subject: [PATCH v2 02/16] erofs: clean up z_erofs_collector_begin() In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: 
<20220715154203.48093-3-hsiangkao@linux.alibaba.com> Rearrange the code and get rid of all gotos. Reviewed-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 1b6816dd235f..c7be447ac64d 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -521,7 +521,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; - struct erofs_workgroup *grp; + struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); @@ -530,33 +530,31 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (map->m_flags & EROFS_MAP_META) { - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - goto tailpacking; + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - grp = erofs_find_workgroup(fe->inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; } else { -tailpacking: ret = z_erofs_register_pcluster(fe); - if (!ret) - goto out; - if (ret != -EEXIST) - return ret; } - ret = z_erofs_lookup_pcluster(fe); - if (ret) { - erofs_workgroup_put(&fe->pcl->obj); + if (ret == -EEXIST) { + ret = z_erofs_lookup_pcluster(fe); + if (ret) { + erofs_workgroup_put(&fe->pcl->obj); + return ret; + } + } else if (ret) { return ret; } -out: z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->pagevec, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order 
*/ -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:50 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:50 +0800 Subject: [PATCH v2 03/16] erofs: introduce `z_erofs_parse_out_bvecs()' In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-4-hsiangkao@linux.alibaba.com> `z_erofs_decompress_pcluster()' is too long therefore it'd be better to introduce another helper to parse decompressed pages (or laterly, decompressed bvecs.) BTW, since `decompressed_bvecs' is too long as a part of the function name, `out_bvecs' is used instead. Reviewed-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 81 +++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c7be447ac64d..c183cd0bc42b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -778,18 +778,58 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } +static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, + struct page **pages, struct page **pagepool) +{ + struct z_erofs_pagevec_ctor ctor; + enum z_erofs_page_type page_type; + int i, err = 0; + + z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, + pcl->pagevec, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct page *page = z_erofs_pagevec_dequeue(&ctor, &page_type); + unsigned int pagenr; + + /* all pages in pagevec ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + if (z_erofs_put_shortlivedpage(pagepool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= pcl->nr_pages); + /* + * currently EROFS doesn't support multiref(dedup), + * so here erroring out one multiref page. 
+ */ + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + } + z_erofs_pagevec_ctor_exit(&ctor, true); + return err; +} + static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct z_erofs_pagevec_ctor ctor; unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; struct page **pages, **compressed_pages, *page; - enum z_erofs_page_type page_type; bool overlapped, partial; int err; @@ -823,42 +863,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, for (i = 0; i < nr_pages; ++i) pages[i] = NULL; - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); - - for (i = 0; i < pcl->vcnt; ++i) { - unsigned int pagenr; - - page = z_erofs_pagevec_dequeue(&ctor, &page_type); - - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; - - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. 
- */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; - } - z_erofs_pagevec_ctor_exit(&ctor, true); + err = z_erofs_parse_out_bvecs(pcl, pages, pagepool); overlapped = false; compressed_pages = pcl->compressed_pages; -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:54 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:54 +0800 Subject: [PATCH v2 07/16] erofs: switch compressed_pages[] to bufvec In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-8-hsiangkao@linux.alibaba.com> Convert compressed_pages[] to bufvec in order to avoid using page->private to keep onlinepage_index (decompressed offset) for inplace I/O pages. In the future, we only rely on folio->private to keep a countdown to unlock folios and set folio_uptodate. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 113 +++++++++++++++++++++++------------------------ fs/erofs/zdata.h | 4 +- 2 files changed, 57 insertions(+), 60 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 423d4daf7ed9..2ea8c97be5b6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -134,7 +134,7 @@ static int z_erofs_create_pcluster_pool(void) for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { - size = struct_size(a, compressed_pages, pcs->maxpages); + size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, @@ -287,16 +287,16 @@ struct z_erofs_decompress_frontend { struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; - /* a pointer used to pick up inplace I/O pages */ - struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; - enum z_erofs_collectmode mode; bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ @@ -319,24 +319,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - struct page **pages; - pgoff_t index; + unsigned int i; if (fe->mode < COLLECT_PRIMARY_FOLLOWED) return; - pages = pcl->compressed_pages; - index = pcl->obj.index; - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = 
tag_compressed_page_justfound(page); @@ -357,7 +354,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) @@ -388,7 +386,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * therefore no need to worry about available decompression users. */ for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -401,7 +399,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } @@ -411,36 +409,39 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret = 0; /* 0 - busy */ + int ret, i; - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } - erofs_workgroup_unfreeze(&pcl->obj, 1); - - if (ret) - detach_page_private(page); } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct 
z_erofs_decompress_frontend *fe, - struct page *page) + struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } + } return false; } @@ -454,7 +455,7 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, if (fe->mode >= COLLECT_PRIMARY && type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec->page)) + if (z_erofs_try_inplace_io(fe, bvec)) return 0; /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && @@ -648,8 +649,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ - fe->icpage_ptr = fe->pcl->compressed_pages + - z_erofs_pclusterpages(fe->pcl); + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -769,7 +769,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto err_out; } get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ @@ -927,8 +928,9 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, *overlapped = false; for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; - struct page *page = pcl->compressed_pages[i]; + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; + unsigned int pgnr; /* compressed pages ought to be present before decompressing */ 
if (!page) { @@ -951,21 +953,15 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, continue; } - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= pcl->nr_pages); - if (pages[pagenr]) { + pgnr = (bvec->offset + pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= pcl->nr_pages); + if (pages[pgnr]) { DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); + SetPageError(pages[pgnr]); + z_erofs_onlinepage_endio(pages[pgnr]); err = -EFSCORRUPTED; } - pages[pagenr] = page; - + pages[pgnr] = page; *overlapped = true; } @@ -1067,19 +1063,19 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = pcl->compressed_pages[0]; - WRITE_ONCE(pcl->compressed_pages[0], NULL); + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_pages[i]; + page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } kfree(compressed_pages); @@ -1193,7 +1189,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) @@ -1209,7 +1205,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, * otherwise, it will go inplace I/O path instead. 
*/ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; @@ -1235,14 +1231,14 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. */ DBG_BUGON(!justfound); @@ -1271,7 +1267,8 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 468f6308fc90..5d236c8b40c5 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -87,8 +87,8 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; - /* A: compressed pages (can be cached or inplaced pages) */ - struct page *compressed_pages[]; + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; }; /* let's avoid the valid 32-bit kernel addresses */ -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:57 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:57 +0800 Subject: [PATCH v2 10/16] erofs: clean up `enum z_erofs_collectmode' In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> 
Message-ID: <20220715154203.48093-11-hsiangkao@linux.alibaba.com> `enum z_erofs_collectmode' is really ambiguous, but I'm not quite sure if there is a better name, basically it's used to judge whether inplace I/O can be used due to the current status of pclusters in the chain. Rename it as `enum z_erofs_pclustermode' instead. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 63 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f2a513299d82..d1f907f4757d 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -227,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void) return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pclusters was the tail of an exist chain, in addition + * that the previous processed chained pclusters are all decided to * be hooked up to it. 
- * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| */ - COLLECT_PRIMARY_HOOKED, + Z_EROFS_PCLUSTER_HOOKED, /* - * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or * bvpage) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -261,12 +260,12 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. 
] */ - COLLECT_PRIMARY_FOLLOWED, + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { @@ -277,7 +276,7 @@ struct z_erofs_decompress_frontend { struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; z_erofs_next_pcluster_t owned_head; - enum z_erofs_collectmode mode; + enum z_erofs_pclustermode mode; bool readahead; /* used for applying cache strategy on the fly */ @@ -290,7 +289,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); @@ -310,7 +309,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (fe->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pcl->pclusterpages; ++i) { @@ -358,7 +357,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * managed cache since it can be moved to the bypass queue instead. 
*/ if (standalone) - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -439,12 +438,12 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, { int ret; - if (fe->mode >= COLLECT_PRIMARY && exclusive) { + if (exclusive) { /* give priority for inplaceio to use file pages first */ if (z_erofs_try_inplace_io(fe, bvec)) return 0; /* otherwise, check if it can be used as a bvpage */ - if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } @@ -463,7 +462,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. */ - f->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } @@ -474,12 +473,12 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = COLLECT_PRIMARY; + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) @@ -554,7 +553,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; - fe->mode = COLLECT_PRIMARY_FOLLOWED; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others @@ -676,7 +675,7 @@ static bool z_erofs_collector_end(struct 
z_erofs_decompress_frontend *fe) * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. */ - if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; @@ -756,7 +755,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, get_page(fe->map.buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, fe->map.buf.page); - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, @@ -774,8 +773,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, * those chains are handled asynchronously thus the page cannot be used * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && - fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -785,7 +784,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:53 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:53 +0800 Subject: [PATCH v2 06/16] erofs: introduce `z_erofs_parse_in_bvecs' In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-7-hsiangkao@linux.alibaba.com> 
`z_erofs_decompress_pcluster()' is too long therefore it'd be better to introduce another helper to parse compressed pages (or later, compressed bvecs.) BTW, since `compressed_bvecs' is too long as a part of the function name, `in_bvecs' is used here instead. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 132 ++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 52 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6295f3312f6f..423d4daf7ed9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -913,6 +913,76 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, return err; } +static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl, struct page **pages, + struct page **pagepool, bool *overlapped) +{ + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + struct page **compressed_pages; + int i, err = 0; + + /* XXX: will have a better approach in the following commits */ + compressed_pages = kmalloc_array(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + *overlapped = false; + + for (i = 0; i < pclusterpages; ++i) { + unsigned int pagenr; + struct page *page = pcl->compressed_pages[i]; + + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + compressed_pages[i] = page; + + if (z_erofs_is_inline_pcluster(pcl)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + DBG_BUGON(z_erofs_page_is_invalidated(page)); + if (!z_erofs_is_shortlived_page(page)) { + if (erofs_page_is_managed(sbi, page)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + /* + * only if non-head page can be selected + * for inplace decompression + */ + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= pcl->nr_pages); + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = 
page; + + *overlapped = true; + } + + /* PG_error needs checking for all non-managed pages */ + if (PageError(page)) { + DBG_BUGON(PageUptodate(page)); + err = -EIO; + } + } + + if (err) { + kfree(compressed_pages); + return ERR_PTR(err); + } + return compressed_pages; +} + static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, struct page **pagepool) @@ -957,54 +1027,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, pages[i] = NULL; err = z_erofs_parse_out_bvecs(pcl, pages, pagepool); - - overlapped = false; - compressed_pages = pcl->compressed_pages; - - for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; - - page = compressed_pages[i]; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); - - if (z_erofs_is_inline_pcluster(pcl)) { - if (!PageUptodate(page)) - err = -EIO; - continue; - } - - DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { - if (!PageUptodate(page)) - err = -EIO; - continue; - } - - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; - - overlapped = true; - } - - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + compressed_pages = z_erofs_parse_in_bvecs(sbi, pcl, pages, + pagepool, &overlapped); + if (IS_ERR(compressed_pages)) { + err = PTR_ERR(compressed_pages); + compressed_pages = NULL; } if (err) @@ -1040,21 +1067,22 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = compressed_pages[0]; - 
WRITE_ONCE(compressed_pages[0], NULL); + page = pcl->compressed_pages[0]; + WRITE_ONCE(pcl->compressed_pages[0], NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = compressed_pages[i]; + page = pcl->compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_pages[i], NULL); } } + kfree(compressed_pages); for (i = 0; i < nr_pages; ++i) { page = pages[i]; -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:51 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:51 +0800 Subject: [PATCH v2 04/16] erofs: introduce bufvec to store decompressed buffers In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-5-hsiangkao@linux.alibaba.com> For each pcluster, the total compressed buffers are determined in advance, yet the number of decompressed buffers actually varies. Too many decompressed pages can be recorded if one pcluster is highly compressed or its pcluster size is large. That takes extra memory footprints compared to uncompressed filesystems, especially a lot of I/O in flight on low-end devices. Therefore, similar to inplace I/O, pagevec was introduced to reuse page cache to store these pointers in the time-sharing way since these pages are actually unused before decompressing. In order to make it more flexible, a cleaner bufvec is used to replace the old pagevec stuff so that - Decompressed offsets can be stored inline, thus it can be used for the upcoming feature like compressed data deduplication. It's calculated by `page_offset(page) - map->m_la'; - Towards supporting large folios for compressed inodes since our final goal is to completely avoid page->private but use folio->private only for all page cache pages. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 177 +++++++++++++++++++++++++++++++++++------------ fs/erofs/zdata.h | 26 +++++-- 2 files changed, 153 insertions(+), 50 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c183cd0bc42b..f52c54058f31 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + 
DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + static void z_erofs_destroy_pcluster_pool(void) { int i; @@ -195,9 +272,10 @@ enum z_erofs_collectmode { struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; - + struct z_erofs_bvec_iter biter; struct z_erofs_pagevec_ctor vector; + struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; /* a pointer used to pick up inplace I/O pages */ struct page **icpage_ptr; @@ -358,21 +436,24 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct page *page, enum z_erofs_page_type type, - bool pvec_safereuse) + struct z_erofs_bvec *bvec, + enum z_erofs_page_type type) { int ret; - /* give priority for inplaceio */ if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(fe, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, - pvec_safereuse); - fe->pcl->vcnt += (unsigned int)ret; - return ret ? 
0 : -EAGAIN; + type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec->page)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= COLLECT_PRIMARY_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) @@ -554,9 +635,8 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } - - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->pcl->pagevec, fe->pcl->vcnt); + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_NR_INLINE_PAGEVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -588,9 +668,14 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&fe->vector, false); + z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } + /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
@@ -712,22 +797,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); retry: - err = z_erofs_attach_page(fe, page, page_type, - fe->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional short-lived page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - alloc_page(GFP_NOFS | __GFP_NOFAIL); - - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(fe, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), page_type); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); goto err_out; + } index = page->index - (map->m_la >> PAGE_SHIFT); @@ -781,29 +867,24 @@ static bool z_erofs_page_is_invalidated(struct page *page) static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, struct page **pages, struct page **pagepool) { - struct z_erofs_pagevec_ctor ctor; - enum z_erofs_page_type page_type; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; int i, err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, + Z_EROFS_NR_INLINE_PAGEVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { - struct page *page = z_erofs_pagevec_dequeue(&ctor, &page_type); + struct z_erofs_bvec bvec; unsigned int pagenr; - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - if 
(page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); + if (old_bvpage) + z_erofs_put_shortlivedpage(pagepool, old_bvpage); + pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pagenr >= pcl->nr_pages); + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); /* * currently EROFS doesn't support multiref(dedup), * so here erroring out one multiref page. @@ -814,9 +895,12 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, z_erofs_onlinepage_endio(pages[pagenr]); err = -EFSCORRUPTED; } - pages[pagenr] = page; + pages[pagenr] = bvec.page; } - z_erofs_pagevec_ctor_exit(&ctor, true); + + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(pagepool, old_bvpage); return err; } @@ -986,6 +1070,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, kvfree(pages); pcl->nr_pages = 0; + pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 58053bb5066f..f8daadb19e37 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -21,6 +21,21 @@ */ typedef void *z_erofs_next_pcluster_t; +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_NR_INLINE_PAGEVECS); + /* * Structure fields follow one of the following exclusion rules. 
* @@ -41,22 +56,25 @@ struct z_erofs_pcluster { /* A: lower limit of decompressed length and if full length or not */ unsigned int length; + /* L: total number of bvecs */ + unsigned int vcnt; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in pagevec[] */ + /* L: maximum relative page index in bvecs */ unsigned short nr_pages; - /* L: total number of pages in pagevec[] */ - unsigned int vcnt; - union { /* L: inline a certain number of pagevecs for bootstrap */ erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:56 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:56 +0800 Subject: [PATCH v2 09/16] erofs: get rid of `enum z_erofs_page_type' In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-10-hsiangkao@linux.alibaba.com> Remove it since pagevec[] is no longer used. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2d5e2ed3e5f5..f2a513299d82 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -27,17 +27,6 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; -/* (obsoleted) page type for online pages */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - struct z_erofs_bvec_iter { struct page *bvpage; struct z_erofs_bvset *bvset; @@ -429,7 +418,6 @@ int erofs_try_to_free_cached_page(struct page *page) return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { @@ -447,13 +435,11 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec, - enum z_erofs_page_type type) + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) { + if (fe->mode >= COLLECT_PRIMARY && exclusive) { /* give priority for inplaceio to use file pages first */ if (z_erofs_try_inplace_io(fe, bvec)) return 0; @@ -718,10 +704,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; unsigned int cur, end, spiltted, index; int err = 0; @@ -798,12 +783,7 @@ static int 
z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto next_part; } - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - + exclusive = (!cur && (!spiltted || tight)); if (cur) tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); @@ -812,7 +792,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, .page = page, .offset = offset - map->m_la, .end = end, - }), page_type); + }), exclusive); /* should allocate an additional short-lived page for bvset */ if (err == -EAGAIN && !fe->candidate_bvpage) { fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:42:01 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:42:01 +0800 Subject: [PATCH v2 14/16] erofs: introduce z_erofs_do_decompressed_bvec() In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-15-hsiangkao@linux.alibaba.com> Both out_bvecs and in_bvecs share the common logic for decompressed buffers. So let's make a helper for this. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 49 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d93ba0adcf9e..d4db2c1d53a6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -861,6 +861,26 @@ struct z_erofs_decompress_backend { unsigned int onstack_used; }; +static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + unsigned int pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + struct page *oldpage; + + DBG_BUGON(pgnr >= be->pcl->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; + + /* error out if one pcluster is refenenced multiple times. */ + if (oldpage) { + DBG_BUGON(1); + z_erofs_page_mark_eio(oldpage); + z_erofs_onlinepage_endio(oldpage); + return -EFSCORRUPTED; + } + return 0; +} + static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; @@ -871,27 +891,14 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; - unsigned int pgnr; z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - pgnr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= pcl->nr_pages); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. 
- */ - if (be->decompressed_pages[pgnr]) { - DBG_BUGON(1); - z_erofs_page_mark_eio(be->decompressed_pages[pgnr]); - z_erofs_onlinepage_endio(be->decompressed_pages[pgnr]); - err = -EFSCORRUPTED; - } - be->decompressed_pages[pgnr] = bvec.page; + err = z_erofs_do_decompressed_bvec(be, &bvec); } old_bvpage = z_erofs_bvec_iter_end(&biter); @@ -911,7 +918,6 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, for (i = 0; i < pclusterpages; ++i) { struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - unsigned int pgnr; /* compressed pages ought to be present before decompressing */ if (!page) { @@ -933,18 +939,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, err = -EIO; continue; } - - pgnr = (bvec->offset + pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= pcl->nr_pages); - if (be->decompressed_pages[pgnr]) { - DBG_BUGON(1); - z_erofs_page_mark_eio( - be->decompressed_pages[pgnr]); - z_erofs_onlinepage_endio( - be->decompressed_pages[pgnr]); - err = -EFSCORRUPTED; - } - be->decompressed_pages[pgnr] = page; + err = z_erofs_do_decompressed_bvec(be, bvec); *overlapped = true; } } -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:59 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:59 +0800 Subject: [PATCH v2 12/16] erofs: introduce struct z_erofs_decompress_backend In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-13-hsiangkao@linux.alibaba.com> Let's introduce struct z_erofs_decompress_backend in order to pass on the decompression backend context between helper functions more easier and avoid too many arguments. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 140 +++++++++++++++++++++++++---------------------- fs/erofs/zdata.h | 3 +- 2 files changed, 76 insertions(+), 67 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3f735ca0415e..1cf377ed1452 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -847,9 +847,22 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, - struct page **pages, struct page **pagepool) +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; + + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; + + struct page **pagepool; +}; + +static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) { + struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; struct page *old_bvpage; int i, err = 0; @@ -857,39 +870,39 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; - unsigned int pagenr; + unsigned int pgnr; z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); if (old_bvpage) - z_erofs_put_shortlivedpage(pagepool, old_bvpage); + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pagenr >= pcl->nr_pages); + pgnr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= pcl->nr_pages); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); /* * currently EROFS doesn't support multiref(dedup), * so here erroring out one multiref page. 
*/ - if (pages[pagenr]) { + if (be->decompressed_pages[pgnr]) { DBG_BUGON(1); - z_erofs_page_mark_eio(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); + z_erofs_page_mark_eio(be->decompressed_pages[pgnr]); + z_erofs_onlinepage_endio(be->decompressed_pages[pgnr]); err = -EFSCORRUPTED; } - pages[pagenr] = bvec.page; + be->decompressed_pages[pgnr] = bvec.page; } old_bvpage = z_erofs_bvec_iter_end(&biter); if (old_bvpage) - z_erofs_put_shortlivedpage(pagepool, old_bvpage); + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); return err; } -static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, - struct z_erofs_pcluster *pcl, struct page **pages, - struct page **pagepool, bool *overlapped) +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) { + struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); struct page **compressed_pages; int i, err = 0; @@ -919,7 +932,7 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; @@ -927,60 +940,58 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, pgnr = (bvec->offset + pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= pcl->nr_pages); - if (pages[pgnr]) { + if (be->decompressed_pages[pgnr]) { DBG_BUGON(1); - z_erofs_page_mark_eio(pages[pgnr]); - z_erofs_onlinepage_endio(pages[pgnr]); + z_erofs_page_mark_eio( + be->decompressed_pages[pgnr]); + z_erofs_onlinepage_endio( + be->decompressed_pages[pgnr]); err = -EFSCORRUPTED; } - pages[pgnr] = page; + be->decompressed_pages[pgnr] = page; *overlapped = true; } } if (err) { kfree(compressed_pages); - return ERR_PTR(err); + return err; } - return compressed_pages; + be->compressed_pages = compressed_pages; + 
return 0; } -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct page **pagepool, int err) +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; + struct page *page; int err2; bool overlapped, partial; - might_sleep(); DBG_BUGON(!READ_ONCE(pcl->nr_pages)); - mutex_lock(&pcl->lock); nr_pages = pcl->nr_pages; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) - pages = pages_onstack; - else - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); - - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; + if (nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * nr_pages); + } else { + be->decompressed_pages = + kvcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + } - err2 = z_erofs_parse_out_bvecs(pcl, pages, pagepool); + err2 = z_erofs_parse_out_bvecs(be); + if (err2) + err = err2; + err2 = z_erofs_parse_in_bvecs(be, &overlapped); if (err2) err = err2; - compressed_pages = z_erofs_parse_in_bvecs(sbi, pcl, pages, - pagepool, &overlapped); - if (IS_ERR(compressed_pages)) { - err = PTR_ERR(compressed_pages); - compressed_pages = NULL; - } if (err) goto out; @@ -1000,9 +1011,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, .pageofs_in = 
pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, @@ -1010,7 +1021,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = partial - }, pagepool); + }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ @@ -1026,29 +1037,29 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, continue; /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); + (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } - kfree(compressed_pages); + kfree(be->compressed_pages); for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; if (err) z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); pcl->nr_pages = 0; pcl->bvset.nextpage = NULL; @@ -1063,23 +1074,23 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - 
pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool, - io->eio ? -EIO : 0); - erofs_workgroup_put(&pcl->obj); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -1105,7 +1116,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); - return; } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 43c91bd2d84f..be0f19aa0d2d 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -173,7 +173,6 @@ static inline void z_erofs_onlinepage_endio(struct page *page) } } -#define Z_EROFS_VMAP_ONSTACK_PAGES \ - min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) +#define Z_EROFS_ONSTACK_PAGES 32 #endif -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:42:00 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:42:00 +0800 Subject: [PATCH v2 13/16] erofs: try to leave (de)compressed_pages on stack if possible In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-14-hsiangkao@linux.alibaba.com> For the most cases, small pclusters can be decompressed with page arrays on stack. Try to leave both (de)compressed_pages on stack if possible as before. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 1cf377ed1452..d93ba0adcf9e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -858,6 +858,7 @@ struct z_erofs_decompress_backend { struct page **compressed_pages; struct page **pagepool; + unsigned int onstack_used; }; static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) @@ -904,14 +905,9 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct page **compressed_pages; int i, err = 0; - /* XXX: will have a better approach in the following commits */ - compressed_pages = kmalloc_array(pclusterpages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); *overlapped = false; - for (i = 0; i < pclusterpages; ++i) { struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; @@ -922,7 +918,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, DBG_BUGON(1); continue; } - compressed_pages[i] = page; + be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) @@ -953,11 +949,8 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, } } - if (err) { - kfree(compressed_pages); + if (err) return err; - } - be->compressed_pages = compressed_pages; return 0; } @@ -976,15 +969,28 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, mutex_lock(&pcl->lock); nr_pages = pcl->nr_pages; + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; if (nr_pages <= Z_EROFS_ONSTACK_PAGES) { be->decompressed_pages = be->onstack_pages; + be->onstack_used = nr_pages; memset(be->decompressed_pages, 0, sizeof(struct page *) * 
nr_pages); - } else { + } + + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) be->decompressed_pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); - } + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); err2 = z_erofs_parse_out_bvecs(be); if (err2) @@ -1041,7 +1047,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } - kfree(be->compressed_pages); + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); for (i = 0; i < nr_pages; ++i) { page = be->decompressed_pages[i]; -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:42:03 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:42:03 +0800 Subject: [PATCH v2 16/16] erofs: introduce multi-reference pclusters (fully-referenced) In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-17-hsiangkao@linux.alibaba.com> Let's introduce multi-reference pclusters at runtime. In details, if one pcluster is requested by multiple extents at almost the same time (even belong to different files), the longest extent will be decompressed as representative and the other extents are actually copied from the longest one. After this patch, fully-referenced extents can be correctly handled and the full decoding check needs to be bypassed for partial-referenced extents. 
Signed-off-by: Gao Xiang --- fs/erofs/compress.h | 2 +- fs/erofs/decompressor.c | 2 +- fs/erofs/zdata.c | 118 ++++++++++++++++++++++++++++------------ fs/erofs/zdata.h | 3 + 4 files changed, 87 insertions(+), 38 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 19e6c56a9f47..26fa170090b8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -17,7 +17,7 @@ struct z_erofs_decompress_req { /* indicate the algorithm will be used for decompression */ unsigned int alg; - bool inplace_io, partial_decoding; + bool inplace_io, partial_decoding, fillgaps; }; struct z_erofs_decompressor { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 45be8f4aeb68..2d55569f96ac 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, j = 0; /* 'valid' bounced can only be tested after a complete round */ - if (test_bit(j, bounced)) { + if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 0ef672372a69..d2b8bc20965c 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -467,7 +467,8 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) * type 2, link to the end of an existing open chain, be careful * that its submission is controlled by the original attached chain. 
*/ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; f->mode = Z_EROFS_PCLUSTER_HOOKED; @@ -480,20 +481,8 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) { - struct erofs_map_blocks *map = &fe->map; struct z_erofs_pcluster *pcl = fe->pcl; - /* to avoid unexpected loop formed by corrupted images */ - if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -785,6 +774,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && fe->pcl->length == map->m_llen) @@ -842,36 +833,90 @@ struct z_erofs_decompress_backend { /* pages to keep the compressed data */ struct page **compressed_pages; + struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, nr_pages; }; -static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, - struct z_erofs_bvec *bvec) +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; + +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) { - unsigned int pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; - struct page *oldpage; + struct z_erofs_bvec_item *item; - DBG_BUGON(pgnr >= be->nr_pages); - 
oldpage = be->decompressed_pages[pgnr]; - be->decompressed_pages[pgnr] = bvec->page; + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; + struct page *oldpage; - /* error out if one pcluster is refenenced multiple times. */ - if (oldpage) { - DBG_BUGON(1); - z_erofs_page_mark_eio(oldpage); - z_erofs_onlinepage_endio(oldpage); - return -EFSCORRUPTED; + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; + + if (!oldpage) + return; + } + + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} + +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? 
-bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); } - return 0; } -static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; struct page *old_bvpage; - int i, err = 0; + int i; z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { @@ -883,13 +928,12 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); - err = z_erofs_do_decompressed_bvec(be, &bvec); + z_erofs_do_decompressed_bvec(be, &bvec); } old_bvpage = z_erofs_bvec_iter_end(&biter); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - return err; } static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, @@ -924,7 +968,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, err = -EIO; continue; } - err = z_erofs_do_decompressed_bvec(be, bvec); + z_erofs_do_decompressed_bvec(be, bvec); *overlapped = true; } } @@ -971,13 +1015,10 @@ static int z_erofs_decompress_pcluster(struct 
z_erofs_decompress_backend *be, kvcalloc(pclusterpages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); - err2 = z_erofs_parse_out_bvecs(be); - if (err2) - err = err2; + z_erofs_parse_out_bvecs(be); err2 = z_erofs_parse_in_bvecs(be, &overlapped); if (err2) err = err2; - if (err) goto out; @@ -997,6 +1038,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, }, be->pagepool); out: @@ -1020,6 +1062,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; @@ -1041,6 +1084,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->length = 0; pcl->partial = true; + pcl->multibases = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; @@ -1056,6 +1100,8 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct z_erofs_decompress_backend be = { .sb = io->sb, .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), }; z_erofs_next_pcluster_t owned = io->head; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 4ae3b763bc27..e7f04c4fbb81 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -84,6 +84,9 @@ struct z_erofs_pcluster { /* L: whether partial decompression or not */ bool partial; + /* L: indicate several pageofs_outs or not */ + bool multibases; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:55 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:55 +0800 Subject: [PATCH v2 08/16] 
erofs: rework online page handling In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-9-hsiangkao@linux.alibaba.com> Since all decompressed offsets have been integrated to bvecs[], this patch avoids all sub-indexes so that page->private only includes a part count and an eio flag, thus in the future folio->private can have the same meaning. In addition, PG_error will not be used anymore after this patch and we're heading to use page->private (later folio->private) and page->mapping (later folio->mapping) only. Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 57 ++++++++++++++++------------------------ fs/erofs/zdata.h | 68 ++++++++++++++---------------------------------- 2 files changed, 42 insertions(+), 83 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2ea8c97be5b6..2d5e2ed3e5f5 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -743,7 +743,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) - goto err_out; + goto out; } else { if (fe->pcl) goto hitted; @@ -755,7 +755,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; @@ -766,7 +766,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, err = PTR_ERR(mp); erofs_err(inode->i_sb, "failed to get inline page, err %d", err); - goto err_out; + goto out; } get_page(fe->map.buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, @@ -823,16 +823,15 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, if (err) { DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); - goto err_out; + goto out; } - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); - + 
z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; + /* also update nr_pages */ + index = page->index - (map->m_la >> PAGE_SHIFT); fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); next_part: /* can be used for verification */ @@ -843,16 +842,13 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; } static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, @@ -901,7 +897,7 @@ static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl, */ if (pages[pagenr]) { DBG_BUGON(1); - SetPageError(pages[pagenr]); + z_erofs_page_mark_eio(pages[pagenr]); z_erofs_onlinepage_endio(pages[pagenr]); err = -EFSCORRUPTED; } @@ -957,19 +953,13 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, DBG_BUGON(pgnr >= pcl->nr_pages); if (pages[pgnr]) { DBG_BUGON(1); - SetPageError(pages[pgnr]); + z_erofs_page_mark_eio(pages[pgnr]); z_erofs_onlinepage_endio(pages[pgnr]); err = -EFSCORRUPTED; } pages[pgnr] = page; *overlapped = true; } - - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } } if (err) { @@ -981,16 +971,15 @@ static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi, static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, - struct page **pagepool) + struct page **pagepool, int err) { struct erofs_sb_info *const sbi = EROFS_SB(sb); unsigned int pclusterpages = z_erofs_pclusterpages(pcl); unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; struct page **pages, 
**compressed_pages, *page; - + int err2; bool overlapped, partial; - int err; might_sleep(); DBG_BUGON(!READ_ONCE(pcl->nr_pages)); @@ -1022,7 +1011,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, for (i = 0; i < nr_pages; ++i) pages[i] = NULL; - err = z_erofs_parse_out_bvecs(pcl, pages, pagepool); + err2 = z_erofs_parse_out_bvecs(pcl, pages, pagepool); + if (err2) + err = err2; compressed_pages = z_erofs_parse_in_bvecs(sbi, pcl, pages, pagepool, &overlapped); if (IS_ERR(compressed_pages)) { @@ -1090,10 +1081,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } @@ -1129,7 +1118,8 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + z_erofs_decompress_pcluster(io->sb, pcl, pagepool, + io->eio ? 
-EIO : 0); erofs_workgroup_put(&pcl->obj); } } @@ -1233,7 +1223,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, if (page->mapping == mc) { WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for @@ -1305,6 +1294,7 @@ jobqueue_init(struct super_block *sb, q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = false; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1365,15 +1355,14 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (err) - SetPageError(page); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } + if (err) + q->eio = true; z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 5d236c8b40c5..75f6fd435388 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -109,6 +109,8 @@ struct z_erofs_decompressqueue { struct completion done; struct work_struct work; } u; + + bool eio; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) @@ -123,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return pcl->pclusterpages; } -#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 -#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) - /* - * waiters (aka. 
ongoing_packs): # to unlock the page - * sub-index: 0 - for partial page, >= 1 full page sub-index + * bit 31: I/O error occurred on this page + * bit 0 - 30: remaining parts to complete this page */ -typedef atomic_t z_erofs_onlinepage_t; - -/* type punning */ -union z_erofs_onlinepage_converter { - z_erofs_onlinepage_t *o; - unsigned long *v; -}; - -static inline unsigned int z_erofs_onlinepage_index(struct page *page) -{ - union z_erofs_onlinepage_converter u; - - DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; -} +#define Z_EROFS_PAGE_EIO (1 << 31) static inline void z_erofs_onlinepage_init(struct page *page) { union { - z_erofs_onlinepage_t o; + atomic_t o; unsigned long v; - /* keep from being unlocked in advance */ } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); @@ -162,45 +143,34 @@ static inline void z_erofs_onlinepage_init(struct page *page) SetPagePrivate(page); } -static inline void z_erofs_onlinepage_fixup(struct page *page, - uintptr_t index, bool down) +static inline void z_erofs_onlinepage_split(struct page *page) { - union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; - int orig, orig_index, val; - -repeat: - orig = atomic_read(u.o); - orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (orig_index) { - if (!index) - return; + atomic_inc((atomic_t *)&page->private); +} - DBG_BUGON(orig_index != index); - } +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; - val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (atomic_cmpxchg(u.o, orig, val) != orig) - goto repeat; + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); } static inline void z_erofs_onlinepage_endio(struct page *page) { - union z_erofs_onlinepage_converter u; unsigned int v; 
DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - v = atomic_dec_return(u.o); - if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); - if (!PageError(page)) + if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } - erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); } #define Z_EROFS_VMAP_ONSTACK_PAGES \ -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:42:02 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:42:02 +0800 Subject: [PATCH v2 15/16] erofs: record the longest decompressed size in this round In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-16-hsiangkao@linux.alibaba.com> Currently, `pcl->length' records the longest decompressed length as long as the pcluster itself isn't reclaimed. However, such number is unneeded for the general cases since it doesn't indicate the exact decompressed size in this round. Instead, let's record the decompressed size for this round instead, thus `pcl->nr_pages' can be completely dropped and pageofs_out is also designed to be kept in sync with `pcl->length'. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 78 +++++++++++++++++------------------------------- fs/erofs/zdata.h | 11 +++---- 2 files changed, 31 insertions(+), 58 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d4db2c1d53a6..0ef672372a69 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -482,7 +482,6 @@ static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct z_erofs_pcluster *pcl = fe->pcl; - unsigned int length; /* to avoid unexpected loop formed by corrupted images */ if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { @@ -495,24 +494,6 @@ static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe) return -EFSCORRUPTED; } - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -543,9 +524,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) atomic_set(&pcl->obj.refcount, 1); pcl->algorithmformat = map->m_algorithmformat; - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; @@ -703,7 +683,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -806,12 +786,17 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - index = page->index - (map->m_la >> PAGE_SHIFT); - fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) @@ -858,7 +843,7 @@ struct z_erofs_decompress_backend { struct page **compressed_pages; struct page **pagepool; - unsigned int onstack_used; + unsigned int onstack_used, nr_pages; }; static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, @@ -867,7 +852,7 @@ static int z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, unsigned int pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; struct page *oldpage; - DBG_BUGON(pgnr >= be->pcl->nr_pages); + DBG_BUGON(pgnr >= be->nr_pages); oldpage = be->decompressed_pages[pgnr]; be->decompressed_pages[pgnr] = bvec->page; @@ -955,24 +940,23 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct 
erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *page; + unsigned int i, inputsize; int err2; - bool overlapped, partial; + struct page *page; + bool overlapped; - DBG_BUGON(!READ_ONCE(pcl->nr_pages)); mutex_lock(&pcl->lock); - nr_pages = pcl->nr_pages; + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; /* allocate (de)compressed page arrays if cannot be kept on stack */ be->decompressed_pages = NULL; be->compressed_pages = NULL; be->onstack_used = 0; - if (nr_pages <= Z_EROFS_ONSTACK_PAGES) { + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { be->decompressed_pages = be->onstack_pages; - be->onstack_used = nr_pages; + be->onstack_used = be->nr_pages; memset(be->decompressed_pages, 0, - sizeof(struct page *) * nr_pages); + sizeof(struct page *) * be->nr_pages); } if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) @@ -980,7 +964,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (!be->decompressed_pages) be->decompressed_pages = - kvcalloc(nr_pages, sizeof(struct page *), + kvcalloc(be->nr_pages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); if (!be->compressed_pages) be->compressed_pages = @@ -997,15 +981,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { - outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; - partial = true; - } - if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else @@ -1018,10 +993,10 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, 
.inputsize = inputsize, - .outputsize = outputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial + .partial_decoding = pcl->partial, }, be->pagepool); out: @@ -1046,7 +1021,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) kvfree(be->compressed_pages); - for (i = 0; i < nr_pages; ++i) { + for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; if (!page) continue; @@ -1064,7 +1039,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (be->decompressed_pages != be->onstack_pages) kvfree(be->decompressed_pages); - pcl->nr_pages = 0; + pcl->length = 0; + pcl->partial = true; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index be0f19aa0d2d..4ae3b763bc27 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -12,9 +12,6 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 - /* * let's leave a type here in case of introducing * another tagged pointer later. 
@@ -53,7 +50,7 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: lower limit of decompressed length and if full length or not */ + /* L: the maximum decompression size of this round */ unsigned int length; /* L: total number of bvecs */ @@ -65,9 +62,6 @@ struct z_erofs_pcluster { /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in bvecs */ - unsigned short nr_pages; - union { /* L: inline a certain number of bvec for bootstrap */ struct z_erofs_bvset_inline bvset; @@ -87,6 +81,9 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; + /* L: whether partial decompression or not */ + bool partial; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; -- 2.24.4 From hsiangkao at linux.alibaba.com Sat Jul 16 01:41:58 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 15 Jul 2022 23:41:58 +0800 Subject: [PATCH v2 11/16] erofs: get rid of `z_pagemap_global' In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: <20220715154203.48093-12-hsiangkao@linux.alibaba.com> In order to introduce multi-reference pclusters for compressed data deduplication, let's get rid of the global page array for now since it needs to be re-designed then at least. 
Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 28 ++++------------------------ fs/erofs/zdata.h | 1 - 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d1f907f4757d..3f735ca0415e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -291,9 +291,6 @@ struct z_erofs_decompress_frontend { .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); - static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, enum z_erofs_cache_alloctype type, struct page **pagepool) @@ -966,26 +963,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, mutex_lock(&pcl->lock); nr_pages = pcl->nr_pages; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { + if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; - - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; - + else pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); - - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } - } + GFP_KERNEL | __GFP_NOFAIL); for (i = 0; i < nr_pages; ++i) pages[i] = NULL; @@ -1065,9 +1047,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) + if (pages != pages_onstack) kvfree(pages); pcl->nr_pages = 0; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 75f6fd435388..43c91bd2d84f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -175,6 +175,5 @@ static inline void z_erofs_onlinepage_endio(struct page 
*page) #define Z_EROFS_VMAP_ONSTACK_PAGES \ min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) -#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 #endif -- 2.24.4 From lkp at intel.com Sat Jul 16 14:01:21 2022 From: lkp at intel.com (kernel test robot) Date: Sat, 16 Jul 2022 12:01:21 +0800 Subject: [xiang-erofs:dev] BUILD SUCCESS 410bae521617656ad1acf84e0b68c643bcb1beab Message-ID: <62d23811.ncnAyaTN94FW6AC8%lkp@intel.com> tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev branch HEAD: 410bae521617656ad1acf84e0b68c643bcb1beab erofs: introduce multi-reference pclusters (fully-referenced) elapsed time: 723m configs tested: 114 configs skipped: 3 The following configs have been built successfully. More configs may be tested in the coming days. gcc tested configs: arm64 allyesconfig arm defconfig arm allyesconfig i386 randconfig-c001 arm footbridge_defconfig xtensa defconfig openrisc or1ksim_defconfig arm mvebu_v7_defconfig powerpc mpc85xx_cds_defconfig parisc generic-32bit_defconfig arm u8500_defconfig sh urquell_defconfig s390 zfcpdump_defconfig m68k alldefconfig mips jazz_defconfig arm simpad_defconfig arm multi_v4t_defconfig powerpc ep8248e_defconfig sh sh7763rdp_defconfig mips maltaup_xpa_defconfig mips loongson3_defconfig arm keystone_defconfig powerpc tqm8xx_defconfig xtensa xip_kc705_defconfig powerpc tqm8555_defconfig sh sh7785lcr_defconfig mips ar7_defconfig sh ecovec24_defconfig m68k m5307c3_defconfig ia64 alldefconfig openrisc simple_smp_defconfig riscv nommu_virt_defconfig riscv rv32_defconfig riscv nommu_k210_defconfig riscv allnoconfig i386 debian-10.3-kselftests i386 debian-10.3 ia64 allmodconfig csky allnoconfig alpha allnoconfig arc allnoconfig m68k allyesconfig m68k allmodconfig arc allyesconfig alpha allyesconfig powerpc allnoconfig mips allyesconfig powerpc allmodconfig sh allmodconfig i386 allyesconfig i386 defconfig x86_64 randconfig-a004 x86_64 randconfig-a002 x86_64 randconfig-a006 i386 randconfig-a001 i386 
randconfig-a003 i386 randconfig-a005 x86_64 randconfig-a011 x86_64 randconfig-a013 x86_64 randconfig-a015 i386 randconfig-a012 i386 randconfig-a014 i386 randconfig-a016 arc randconfig-r043-20220716 arc randconfig-r043-20220715 s390 randconfig-r044-20220716 riscv randconfig-r042-20220716 x86_64 rhel-8.3-kselftests x86_64 allyesconfig um i386_defconfig um x86_64_defconfig x86_64 defconfig x86_64 rhel-8.3 x86_64 rhel-8.3-func x86_64 rhel-8.3-syz x86_64 rhel-8.3-kunit clang tested configs: mips pic32mzda_defconfig mips mtx1_defconfig powerpc ppc64e_defconfig arm aspeed_g4_defconfig arm moxart_defconfig powerpc ppa8548_defconfig mips maltaaprp_defconfig arm mvebu_v5_defconfig arm bcm2835_defconfig powerpc socrates_defconfig powerpc mpc834x_itxgp_defconfig arm netwinder_defconfig mips lemote2f_defconfig arm milbeaut_m10v_defconfig powerpc ge_imp3a_defconfig arm collie_defconfig mips cu1830-neo_defconfig powerpc obs600_defconfig arm dove_defconfig x86_64 randconfig-k001 x86_64 randconfig-a001 x86_64 randconfig-a003 x86_64 randconfig-a005 i386 randconfig-a002 i386 randconfig-a006 i386 randconfig-a004 x86_64 randconfig-a012 x86_64 randconfig-a014 x86_64 randconfig-a016 i386 randconfig-a015 i386 randconfig-a011 i386 randconfig-a013 riscv randconfig-r042-20220715 s390 randconfig-r044-20220715 hexagon randconfig-r045-20220715 hexagon randconfig-r041-20220715 hexagon randconfig-r045-20220716 hexagon randconfig-r041-20220716 -- 0-DAY CI Kernel Test Service https://01.org/lkp From jnyk18230111 at ares.eonet.ne.jp Mon Jul 18 09:36:11 2022 From: jnyk18230111 at ares.eonet.ne.jp (=?ISO-2022-JP?B?GyRCS0w7M0AwOXwxIRsoQg==?=) Date: Sun, 17 Jul 2022 23:36:11 +0000 Subject: =?ISO-2022-JP?B?GyRCIVpLTDszQDA5fDEhIVskKkxkJCQ5ZyRvJDskIiRqJCwkSCQmJDQkNiQkJF4kNyQ/GyhC?= Message-ID: <309b4cd3cb26b10ce1e530d78f8e750a@taiyodo3.heteml.net> ? You have unread messages (2) from Debbie! View Messages: https://letsg0dancing.page.link/go?ygjh ?? ?????????????????? ????????????????????????????? 
???????: linux-erofs at lists.ozlabs.org ????: 420193377095 ???????: 8gs4v6 ---------------------------------------- ?????? ?????????????????????????? http://kitayama-s.com From dan.carpenter at oracle.com Mon Jul 18 21:20:08 2022 From: dan.carpenter at oracle.com (Dan Carpenter) Date: Mon, 18 Jul 2022 14:20:08 +0300 Subject: [PATCH] erofs: clean up a loop Message-ID: It's easier to see what this loop is doing when the decrement is in the normal place. Signed-off-by: Dan Carpenter --- fs/erofs/zdata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 601cfcb07c50..2691100eb231 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -419,8 +419,8 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + while (fe->icur--) { + if (!cmpxchg(&pcl->compressed_bvecs[fe->icur].page, NULL, bvec->page)) { pcl->compressed_bvecs[fe->icur] = *bvec; return true; -- 2.35.1 From hsiangkao at linux.alibaba.com Mon Jul 18 21:36:14 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Mon, 18 Jul 2022 19:36:14 +0800 Subject: [PATCH] erofs: clean up a loop In-Reply-To: References: Message-ID: Hi Dan, On Mon, Jul 18, 2022 at 02:20:08PM +0300, Dan Carpenter wrote: > It's easier to see what this loop is doing when the decrement is in > the normal place. 
> > Signed-off-by: Dan Carpenter > --- > fs/erofs/zdata.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index 601cfcb07c50..2691100eb231 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -419,8 +419,8 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, > { > struct z_erofs_pcluster *const pcl = fe->pcl; > > - while (fe->icur > 0) { > - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, > + while (fe->icur--) { Thanks for your patch! Yet at a quick glance, on my side, that doesn't equal to be honest... .. What we're trying to do here is to find a free slot for inplace i/o, but also need to leave fe->icur as 0 when going out the loop since z_erofs_try_inplace_io() can be called again the next time when attaching another page but it will overflow then... Thanks, Gao Xiang > + if (!cmpxchg(&pcl->compressed_bvecs[fe->icur].page, > NULL, bvec->page)) { > pcl->compressed_bvecs[fe->icur] = *bvec; > return true; > -- > 2.35.1 From dan.carpenter at oracle.com Mon Jul 18 22:23:13 2022 From: dan.carpenter at oracle.com (Dan Carpenter) Date: Mon, 18 Jul 2022 15:23:13 +0300 Subject: [PATCH] erofs: clean up a loop In-Reply-To: References: Message-ID: <20220718122313.GX2316@kadam> On Mon, Jul 18, 2022 at 07:36:14PM +0800, Gao Xiang wrote: > Hi Dan, > > On Mon, Jul 18, 2022 at 02:20:08PM +0300, Dan Carpenter wrote: > > It's easier to see what this loop is doing when the decrement is in > > the normal place. 
> > > > Signed-off-by: Dan Carpenter > > --- > > fs/erofs/zdata.c | 4 ++-- > > 1 file changed, 2 insertions(+), 2 deletions(-) > > > > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > > index 601cfcb07c50..2691100eb231 100644 > > --- a/fs/erofs/zdata.c > > +++ b/fs/erofs/zdata.c > > @@ -419,8 +419,8 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, > > { > > struct z_erofs_pcluster *const pcl = fe->pcl; > > > > - while (fe->icur > 0) { > > - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, > > + while (fe->icur--) { > > Thanks for your patch! > Yet at a quick glance, on my side, that doesn't equal > to be honest... > > .. What we're trying to do here is to find a free slot > for inplace i/o, but also need to leave fe->icur as 0 > when going out the loop since z_erofs_try_inplace_io() > can be called again the next time when attaching > another page but it will overflow then... Ah. Sorry. I never thought about it being called twice in a row. regards, dan carpenter From hsiangkao at linux.alibaba.com Tue Jul 19 20:21:30 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Tue, 19 Jul 2022 18:21:30 +0800 Subject: [PATCH] erofs-utils: fuse: introduce xattr support In-Reply-To: <20220715095359.37534-1-jnhuang@linux.alibaba.com> References: <20220715095359.37534-1-jnhuang@linux.alibaba.com> Message-ID: Hi Jianan, On Fri, Jul 15, 2022 at 05:53:59PM +0800, Huang Jianan wrote: > This implements xattr functionalities for erofsfuse. A large amount > of code was adapted from Linux kernel. 
> > Signed-off-by: Huang Jianan > --- > fuse/main.c | 32 +++ > include/erofs/internal.h | 8 + > include/erofs/xattr.h | 21 ++ > lib/xattr.c | 508 +++++++++++++++++++++++++++++++++++++++ > 4 files changed, 569 insertions(+) > > diff --git a/fuse/main.c b/fuse/main.c > index f4c2476..30a0bed 100644 > --- a/fuse/main.c > +++ b/fuse/main.c > @@ -139,7 +139,39 @@ static int erofsfuse_readlink(const char *path, char *buffer, size_t size) > return 0; > } > > +static int erofsfuse_getxattr(const char *path, const char *name, char *value, > + size_t size) > +{ > + int ret; > + struct erofs_inode vi; > + > + erofs_dbg("getxattr(%s): name=%s size=%llu", path, name, size); > + > + ret = erofs_ilookup(path, &vi); > + if (ret) > + return ret; > + > + return erofs_getxattr(&vi, name, value, size); > +} > + > +static int erofsfuse_listxattr(const char *path, char *list, size_t size) > +{ > + int ret; > + struct erofs_inode vi; > + int i; As we discussed offline, this line should be unneeded. Thanks, Gao Xiang From hsiangkao at linux.alibaba.com Wed Jul 20 18:22:29 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Wed, 20 Jul 2022 16:22:29 +0800 Subject: [PATCH v2] erofs: get rid of erofs_prepare_dio() helper Message-ID: <20220720082229.12172-1-hsiangkao@linux.alibaba.com> Fold in erofs_prepare_dio() in order to simplify the code. 
Signed-off-by: Gao Xiang --- v1: https://lore.kernel.org/r/20220506194612.117120-1-hsiangkao at linux.alibaba.com fs/erofs/data.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fbb037ba326e..fe8ac0e163f7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return iomap_bmap(mapping, block, &erofs_iomap_ops); } -static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - loff_t align = iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; - else - blksize_mask = (1 << inode->i_blkbits) - 1; - if (align & blksize_mask) - return -EINVAL; - return 0; -} - -static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; #ifdef CONFIG_FS_DAX - if (IS_DAX(iocb->ki_filp->f_mapping->host)) + if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if (iocb->ki_flags & IOCB_DIRECT) { - int err = erofs_prepare_dio(iocb, to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = bdev_logical_block_size(bdev) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if ((iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to)) & blksize_mask) + return -EINVAL; - if (!err) - return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0, NULL, 0); - if (err < 0) - return err; + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0, NULL, 0); } return filemap_read(iocb, to, 0); } -- 
2.24.4 From linux-erofs at lists.ozlabs.org Thu Jul 21 04:20:36 2022 From: linux-erofs at lists.ozlabs.org (2022-07-21 02:20:44) Date: Thu, 21 Jul 2022 02:20:36 +0800 Subject: =?utf-8?B?5Zyo5a6F5Yuk5YuZ44Gr6YGp44GX44Gf44Ki44Or44OQ44Kk44OI44Gn44GZYw==?= Message-ID: <20220721022044735881@myqhix.shop> ????LINE?ID??a9585 ????????????????OK [????]???????????????????? ????????????? ?????[????]00:00?00:00 ????????????? ???????????????????? ??????????? ????????????? ??????????????OK??? ?????? ??????????????????????? ???????????????? ??????????? ???????? ???????W?????? 20??40??????????????????? ??????????????????????????????? -------------- next part -------------- An HTML attachment was scrubbed... URL: From mail at aparcar.org Thu Jul 21 06:38:15 2022 From: mail at aparcar.org (Paul Spooren) Date: Wed, 20 Jul 2022 22:38:15 +0200 Subject: Tail packing feature on 5.15 Kernel Message-ID: Hi all, I?m currently in the process[1] to evaluate erofs as a replacement of squashfs on OpenWrt. Since 5.15 will be our next Kernel release but tail packing is only available starting from 5.17, did anyone already do the work of back porting the required patches? If not, could anyone please give me pointers which patches are required? Thank you very much for all further advice! Best, Paul [1]: https://github.com/openwrt/openwrt/pull/9968 -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 833 bytes Desc: Message signed with OpenPGP URL: From xiang at kernel.org Thu Jul 21 10:58:10 2022 From: xiang at kernel.org (Gao Xiang) Date: Thu, 21 Jul 2022 08:58:10 +0800 Subject: Tail packing feature on 5.15 Kernel In-Reply-To: References: Message-ID: Hi Paul, On Wed, Jul 20, 2022 at 10:38:15PM +0200, Paul Spooren wrote: > Hi all, > > I?m currently in the process[1] to evaluate erofs as a replacement of squashfs on OpenWrt. 
> > Since 5.15 will be our next Kernel release but tail packing is only available starting from 5.17, did anyone already do the work of back porting the required patches? If not, could anyone please give me pointers which patches are required? > > Thank you very much for all further advice! Thanks for your interest. EROFS is now actively developing so you could see new features on each new Linux version (I believe many active in-kernel features behave like this, for example iouring.) The initial EROFS version was formed as an optimized solution to compress in 4KiB pcluster so it has minimized memory footprints and best random performance on Android smartphones, for now the optimized and recommended configuration is still this one (4KiB, lz4hc) even though things are quickly changing since recent features add more possibility but most of these are still quite new and need to go with the next LTS version (maybe 6.0?). Also if you'd like to maximize the compression ratio you probably need `fragments` features which is still under development by Yue Hu [1]. As I said to you before [2], I still suggest that Openwrt takes EROFS as an _alternative approach_ instead of a replacement of Squashfs at least this year since we still need time to optimize the maximum compression ratio scenarios in addition to 4KiB, lz4hc (we also need to wait the next stable XZ-utils version first.) 
[1] https://lore.kernel.org/r/YpXnhI8gBlSgHEBW at B-P7TQMD6M-0146.local [2] https://lore.kernel.org/r/cover.1657528899.git.huyue2 at coolpad.com Thanks, Gao Xiang > > Best, > Paul > > [1]: https://github.com/openwrt/openwrt/pull/9968 From lkp at intel.com Thu Jul 21 13:40:24 2022 From: lkp at intel.com (kernel test robot) Date: Thu, 21 Jul 2022 11:40:24 +0800 Subject: [xiang-erofs:dev] BUILD SUCCESS cd084cb3db6e65535fd13f933a15747cdb6240c1 Message-ID: <62d8caa8.AZfyIAF45V4wu5Dq%lkp@intel.com> tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev branch HEAD: cd084cb3db6e65535fd13f933a15747cdb6240c1 erofs: get rid of erofs_prepare_dio() helper elapsed time: 732m configs tested: 114 configs skipped: 6 The following configs have been built successfully. More configs may be tested in the coming days. gcc tested configs: arm64 allyesconfig arm defconfig arm allyesconfig i386 randconfig-c001 arm iop32x_defconfig sh rts7751r2d1_defconfig arm nhk8815_defconfig xtensa audio_kc705_defconfig powerpc adder875_defconfig m68k m5272c3_defconfig mips bigsur_defconfig csky allnoconfig um i386_defconfig arm integrator_defconfig arm mvebu_v7_defconfig m68k virt_defconfig sh se7206_defconfig sh shx3_defconfig sh defconfig powerpc cell_defconfig nios2 defconfig openrisc simple_smp_defconfig arc nsimosci_hs_smp_defconfig arm badge4_defconfig sparc allnoconfig arm mini2440_defconfig arc nsimosci_defconfig sparc allyesconfig mips maltasmvp_eva_defconfig sh secureedge5410_defconfig m68k mvme147_defconfig nios2 10m50_defconfig xtensa defconfig m68k mac_defconfig arm viper_defconfig powerpc ep88xc_defconfig powerpc pcm030_defconfig alpha allyesconfig ia64 bigsur_defconfig arm exynos_defconfig s390 defconfig s390 allmodconfig arc defconfig alpha defconfig s390 allyesconfig riscv nommu_virt_defconfig riscv rv32_defconfig riscv nommu_k210_defconfig riscv allnoconfig i386 debian-10.3-kselftests i386 debian-10.3 loongarch defconfig loongarch allnoconfig arm 
randconfig-c002-20220718 i386 randconfig-c001-20220718 x86_64 randconfig-c001-20220718 alpha allnoconfig arc allnoconfig m68k allyesconfig m68k allmodconfig arc allyesconfig powerpc allnoconfig mips allyesconfig powerpc allmodconfig sh allmodconfig i386 allyesconfig i386 defconfig x86_64 randconfig-a012-20220718 x86_64 randconfig-a011-20220718 x86_64 randconfig-a014-20220718 x86_64 randconfig-a016-20220718 x86_64 randconfig-a013-20220718 x86_64 randconfig-a015-20220718 i386 randconfig-a015-20220718 i386 randconfig-a011-20220718 i386 randconfig-a012-20220718 i386 randconfig-a014-20220718 i386 randconfig-a016-20220718 i386 randconfig-a013-20220718 i386 randconfig-a012 i386 randconfig-a016 s390 randconfig-r044-20220718 riscv randconfig-r042-20220718 arc randconfig-r043-20220718 x86_64 rhel-8.3-kselftests um x86_64_defconfig x86_64 defconfig x86_64 allyesconfig x86_64 rhel-8.3 x86_64 rhel-8.3-func x86_64 rhel-8.3-syz x86_64 rhel-8.3-kunit clang tested configs: arm tct_hammer_defconfig mips ip28_defconfig powerpc mpc8315_rdb_defconfig powerpc mpc832x_mds_defconfig powerpc gamecube_defconfig mips ath25_defconfig powerpc acadia_defconfig x86_64 randconfig-k001 x86_64 randconfig-a005 x86_64 randconfig-a003 x86_64 randconfig-a001 i386 randconfig-a002 i386 randconfig-a006 i386 randconfig-a004 i386 randconfig-a001-20220718 i386 randconfig-a006-20220718 i386 randconfig-a002-20220718 i386 randconfig-a004-20220718 i386 randconfig-a005-20220718 i386 randconfig-a003-20220718 hexagon randconfig-r041-20220718 hexagon randconfig-r045-20220718 -- 0-DAY CI Kernel Test Service https://01.org/lkp From chao at kernel.org Fri Jul 22 00:31:53 2022 From: chao at kernel.org (Chao Yu) Date: Thu, 21 Jul 2022 22:31:53 +0800 Subject: [PATCH v2 00/16] erofs: prepare for folios, deduplication and kill PG_error In-Reply-To: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> References: <20220715154203.48093-1-hsiangkao@linux.alibaba.com> Message-ID: 
<3801a538-f059-2b62-6dbb-cbe67478371b@kernel.org> On 2022/7/15 23:41, Gao Xiang wrote: > Hi folks, > > I've been doing this for almost 2 months, the main point of this is > to support large folios and rolling hash deduplication for compressed > data. > > This patchset is as a start of this work targeting for the next 5.20, > it introduces a flexable range representation for (de)compressed buffers > instead of too relying on page(s) directly themselves, so large folios > can laterly base on this work. Also, this patchset gets rid of all > PG_error flags in the decompression code. It's a cleanup as a result > as well. > > In addition, this patchset kicks off rolling hash deduplication for > compressed data by introducing fully-referenced multi-reference > pclusters first instead of reporting fs corruption if one pcluster > is introduced by several differnt extents. The full implementation > is expected to be finished in the merge window after the next. One > of my colleagues is actively working on the userspace part of this > feature. > > However, it's still easy to verify fully-referenced multi-reference > pcluster by constructing some image by hand (see attachment): > > Dataset: 300M > seq-read (data-deduplicated, read_ahead_kb 8192): 1095MiB/s > seq-read (data-deduplicated, read_ahead_kb 4096): 771MiB/s > seq-read (data-deduplicated, read_ahead_kb 512): 577MiB/s > seq-read (vanilla, read_ahead_kb 8192): 364MiB/s > > Finally, this patchset survives ro-fsstress on my side. 
For this patchset, Acked-by: Chao Yu Thanks, > > Thanks, > Gao Xiang > > Changes since v1: > - rename left pagevec words to bvpage (Yue Hu); > > Gao Xiang (16): > erofs: get rid of unneeded `inode', `map' and `sb' > erofs: clean up z_erofs_collector_begin() > erofs: introduce `z_erofs_parse_out_bvecs()' > erofs: introduce bufvec to store decompressed buffers > erofs: drop the old pagevec approach > erofs: introduce `z_erofs_parse_in_bvecs' > erofs: switch compressed_pages[] to bufvec > erofs: rework online page handling > erofs: get rid of `enum z_erofs_page_type' > erofs: clean up `enum z_erofs_collectmode' > erofs: get rid of `z_pagemap_global' > erofs: introduce struct z_erofs_decompress_backend > erofs: try to leave (de)compressed_pages on stack if possible > erofs: introduce z_erofs_do_decompressed_bvec() > erofs: record the longest decompressed size in this round > erofs: introduce multi-reference pclusters (fully-referenced) > > fs/erofs/compress.h | 2 +- > fs/erofs/decompressor.c | 2 +- > fs/erofs/zdata.c | 785 +++++++++++++++++++++++----------------- > fs/erofs/zdata.h | 119 +++--- > fs/erofs/zpvec.h | 159 -------- > 5 files changed, 496 insertions(+), 571 deletions(-) > delete mode 100644 fs/erofs/zpvec.h > From huyue2 at coolpad.com Fri Jul 22 12:49:37 2022 From: huyue2 at coolpad.com (Yue Hu) Date: Fri, 22 Jul 2022 10:49:37 +0800 Subject: [PATCH] erofs-utils: fix a memory leak of multiple devices Message-ID: <20220722024903.21550-1-huyue2@coolpad.com> The memory allocated for multiple devices should be freed when to exit. Let's add a helper to fix it since there is more than one to use it. 
Signed-off-by: Yue Hu --- dump/main.c | 7 ++++--- fsck/main.c | 7 ++++--- fuse/main.c | 5 +++-- include/erofs/internal.h | 1 + lib/super.c | 6 ++++++ 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/dump/main.c b/dump/main.c index 40e850a..c9b3a8f 100644 --- a/dump/main.c +++ b/dump/main.c @@ -615,7 +615,7 @@ int main(int argc, char **argv) err = erofs_read_superblock(); if (err) { erofs_err("failed to read superblock"); - goto exit_dev_close; + goto exit_put_super; } if (!dumpcfg.totalshow) { @@ -630,13 +630,14 @@ int main(int argc, char **argv) if (dumpcfg.show_extent && !dumpcfg.show_inode) { usage(); - goto exit_dev_close; + goto exit_put_super; } if (dumpcfg.show_inode) erofsdump_show_fileinfo(dumpcfg.show_extent); -exit_dev_close: +exit_put_super: + erofs_put_super(); dev_close(); exit: blob_closeall(); diff --git a/fsck/main.c b/fsck/main.c index 5a2f659..a8f0e24 100644 --- a/fsck/main.c +++ b/fsck/main.c @@ -813,12 +813,12 @@ int main(int argc, char **argv) err = erofs_read_superblock(); if (err) { erofs_err("failed to read superblock"); - goto exit_dev_close; + goto exit_put_super; } if (erofs_sb_has_sb_chksum() && erofs_check_sb_chksum()) { erofs_err("failed to verify superblock checksum"); - goto exit_dev_close; + goto exit_put_super; } err = erofsfsck_check_inode(sbi.root_nid, sbi.root_nid); @@ -843,7 +843,8 @@ int main(int argc, char **argv) } } -exit_dev_close: +exit_put_super: + erofs_put_super(); dev_close(); exit: blob_closeall(); diff --git a/fuse/main.c b/fuse/main.c index 95f939e..95f7abc 100644 --- a/fuse/main.c +++ b/fuse/main.c @@ -295,11 +295,12 @@ int main(int argc, char *argv[]) ret = erofs_read_superblock(); if (ret) { fprintf(stderr, "failed to read erofs super block\n"); - goto err_dev_close; + goto err_put_super; } ret = fuse_main(args.argc, args.argv, &erofs_ops, NULL); -err_dev_close: +err_put_super: + erofs_put_super(); blob_closeall(); dev_close(); err_fuse_free_args: diff --git a/include/erofs/internal.h 
b/include/erofs/internal.h index 6a70f11..48498fe 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -318,6 +318,7 @@ struct erofs_map_dev { /* super.c */ int erofs_read_superblock(void); +void erofs_put_super(void); /* namei.c */ int erofs_read_inode_from_disk(struct erofs_inode *vi); diff --git a/lib/super.c b/lib/super.c index f486eb7..913d2fb 100644 --- a/lib/super.c +++ b/lib/super.c @@ -109,3 +109,9 @@ int erofs_read_superblock(void) memcpy(&sbi.uuid, dsb->uuid, sizeof(dsb->uuid)); return erofs_init_devices(&sbi, dsb); } + +void erofs_put_super(void) +{ + if (sbi.devs) + free(sbi.devs); +} -- 2.17.1 -------------- next part -------------- An HTML attachment was scrubbed... URL: From lkp at intel.com Fri Jul 22 13:09:05 2022 From: lkp at intel.com (kernel test robot) Date: Fri, 22 Jul 2022 11:09:05 +0800 Subject: [xiang-erofs:dev] BUILD SUCCESS d0e5ad05996ec1f0813feb4a5058af50dd1a2a60 Message-ID: <62da14d1.sEZSZi308cTZdVR8%lkp@intel.com> tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev branch HEAD: d0e5ad05996ec1f0813feb4a5058af50dd1a2a60 erofs: get rid of the leftover PAGE_SIZE in dir.c elapsed time: 724m configs tested: 82 configs skipped: 2 The following configs have been built successfully. More configs may be tested in the coming days. 
gcc tested configs: arm defconfig arm allyesconfig arm64 allyesconfig i386 randconfig-c001 sh j2_defconfig arc axs101_defconfig alpha defconfig mips cobalt_defconfig arm jornada720_defconfig sh ul2_defconfig powerpc cell_defconfig riscv nommu_k210_sdcard_defconfig mips xway_defconfig mips rb532_defconfig powerpc mpc834x_mds_defconfig m68k mvme147_defconfig mips loongson1b_defconfig arm at91_dt_defconfig nios2 10m50_defconfig sh alldefconfig powerpc tqm8555_defconfig arm pxa_defconfig powerpc tqm8xx_defconfig mips gcw0_defconfig arm oxnas_v6_defconfig csky allnoconfig alpha allnoconfig arc allnoconfig riscv allnoconfig m68k allyesconfig m68k allmodconfig arc allyesconfig alpha allyesconfig sh allmodconfig powerpc allnoconfig mips allyesconfig powerpc allmodconfig i386 defconfig i386 allyesconfig x86_64 randconfig-a004 x86_64 randconfig-a002 x86_64 randconfig-a006 i386 randconfig-a005 i386 randconfig-a001 i386 randconfig-a003 x86_64 randconfig-a015 x86_64 randconfig-a013 x86_64 randconfig-a011 i386 randconfig-a012 i386 randconfig-a014 i386 randconfig-a016 arc randconfig-r043-20220721 x86_64 rhel-8.3-kselftests um i386_defconfig um x86_64_defconfig x86_64 defconfig x86_64 rhel-8.3 x86_64 allyesconfig x86_64 rhel-8.3-func x86_64 rhel-8.3-syz x86_64 rhel-8.3-kunit clang tested configs: arm ixp4xx_defconfig powerpc mpc866_ads_defconfig mips malta_kvm_defconfig arm defconfig arm spitz_defconfig x86_64 randconfig-k001 x86_64 randconfig-a005 x86_64 randconfig-a001 x86_64 randconfig-a003 i386 randconfig-a004 i386 randconfig-a006 i386 randconfig-a002 x86_64 randconfig-a016 x86_64 randconfig-a014 x86_64 randconfig-a012 i386 randconfig-a013 i386 randconfig-a011 i386 randconfig-a015 riscv randconfig-r042-20220721 s390 randconfig-r044-20220721 hexagon randconfig-r041-20220721 hexagon randconfig-r045-20220721 -- 0-DAY CI Kernel Test Service https://01.org/lkp From zbestahu at gmail.com Fri Jul 22 13:10:08 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 22 Jul 2022 11:10:08 
+0800 Subject: [PATCH RESEND] erofs-utils: fix a memory leak of multiple devices Message-ID: <20220722031008.21819-1-huyue2@coolpad.com> The memory allocated for multiple devices should be freed when to exit. Let's add a helper to fix it since there is more than one to use it. Signed-off-by: Yue Hu --- dump/main.c | 7 ++++--- fsck/main.c | 7 ++++--- fuse/main.c | 5 +++-- include/erofs/internal.h | 1 + lib/super.c | 6 ++++++ 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/dump/main.c b/dump/main.c index 40e850a..c9b3a8f 100644 --- a/dump/main.c +++ b/dump/main.c @@ -615,7 +615,7 @@ int main(int argc, char **argv) err = erofs_read_superblock(); if (err) { erofs_err("failed to read superblock"); - goto exit_dev_close; + goto exit_put_super; } if (!dumpcfg.totalshow) { @@ -630,13 +630,14 @@ int main(int argc, char **argv) if (dumpcfg.show_extent && !dumpcfg.show_inode) { usage(); - goto exit_dev_close; + goto exit_put_super; } if (dumpcfg.show_inode) erofsdump_show_fileinfo(dumpcfg.show_extent); -exit_dev_close: +exit_put_super: + erofs_put_super(); dev_close(); exit: blob_closeall(); diff --git a/fsck/main.c b/fsck/main.c index 5a2f659..a8f0e24 100644 --- a/fsck/main.c +++ b/fsck/main.c @@ -813,12 +813,12 @@ int main(int argc, char **argv) err = erofs_read_superblock(); if (err) { erofs_err("failed to read superblock"); - goto exit_dev_close; + goto exit_put_super; } if (erofs_sb_has_sb_chksum() && erofs_check_sb_chksum()) { erofs_err("failed to verify superblock checksum"); - goto exit_dev_close; + goto exit_put_super; } err = erofsfsck_check_inode(sbi.root_nid, sbi.root_nid); @@ -843,7 +843,8 @@ int main(int argc, char **argv) } } -exit_dev_close: +exit_put_super: + erofs_put_super(); dev_close(); exit: blob_closeall(); diff --git a/fuse/main.c b/fuse/main.c index 95f939e..95f7abc 100644 --- a/fuse/main.c +++ b/fuse/main.c @@ -295,11 +295,12 @@ int main(int argc, char *argv[]) ret = erofs_read_superblock(); if (ret) { fprintf(stderr, "failed to read 
erofs super block\n"); - goto err_dev_close; + goto err_put_super; } ret = fuse_main(args.argc, args.argv, &erofs_ops, NULL); -err_dev_close: +err_put_super: + erofs_put_super(); blob_closeall(); dev_close(); err_fuse_free_args: diff --git a/include/erofs/internal.h b/include/erofs/internal.h index 6a70f11..48498fe 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -318,6 +318,7 @@ struct erofs_map_dev { /* super.c */ int erofs_read_superblock(void); +void erofs_put_super(void); /* namei.c */ int erofs_read_inode_from_disk(struct erofs_inode *vi); diff --git a/lib/super.c b/lib/super.c index f486eb7..913d2fb 100644 --- a/lib/super.c +++ b/lib/super.c @@ -109,3 +109,9 @@ int erofs_read_superblock(void) memcpy(&sbi.uuid, dsb->uuid, sizeof(dsb->uuid)); return erofs_init_devices(&sbi, dsb); } + +void erofs_put_super(void) +{ + if (sbi.devs) + free(sbi.devs); +} -- 2.17.1 From hsiangkao at linux.alibaba.com Fri Jul 22 13:28:27 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Fri, 22 Jul 2022 11:28:27 +0800 Subject: [PATCH RESEND] erofs-utils: fix a memory leak of multiple devices In-Reply-To: <20220722031008.21819-1-huyue2@coolpad.com> References: <20220722031008.21819-1-huyue2@coolpad.com> Message-ID: Hi Yue, On Fri, Jul 22, 2022 at 11:10:08AM +0800, Yue Hu wrote: > The memory allocated for multiple devices should be freed when to exit. > Let's add a helper to fix it since there is more than one to use it. 
> > Signed-off-by: Yue Hu > --- > dump/main.c | 7 ++++--- > fsck/main.c | 7 ++++--- > fuse/main.c | 5 +++-- > include/erofs/internal.h | 1 + > lib/super.c | 6 ++++++ > 5 files changed, 18 insertions(+), 8 deletions(-) > > diff --git a/dump/main.c b/dump/main.c > index 40e850a..c9b3a8f 100644 > --- a/dump/main.c > +++ b/dump/main.c > @@ -615,7 +615,7 @@ int main(int argc, char **argv) > err = erofs_read_superblock(); > if (err) { > erofs_err("failed to read superblock"); > - goto exit_dev_close; > + goto exit_put_super; > } > > if (!dumpcfg.totalshow) { > @@ -630,13 +630,14 @@ int main(int argc, char **argv) > > if (dumpcfg.show_extent && !dumpcfg.show_inode) { > usage(); > - goto exit_dev_close; > + goto exit_put_super; > } > > if (dumpcfg.show_inode) > erofsdump_show_fileinfo(dumpcfg.show_extent); > > -exit_dev_close: > +exit_put_super: > + erofs_put_super(); > dev_close(); > exit: > blob_closeall(); > diff --git a/fsck/main.c b/fsck/main.c > index 5a2f659..a8f0e24 100644 > --- a/fsck/main.c > +++ b/fsck/main.c > @@ -813,12 +813,12 @@ int main(int argc, char **argv) > err = erofs_read_superblock(); > if (err) { > erofs_err("failed to read superblock"); > - goto exit_dev_close; > + goto exit_put_super; Why do we call erofs_put_super() again here? I think we don't need to call erofs_put_super for all failed paths. Thanks, Gao Xiang From zbestahu at gmail.com Fri Jul 22 14:31:19 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 22 Jul 2022 12:31:19 +0800 Subject: [PATCH RESEND] erofs-utils: fix a memory leak of multiple devices In-Reply-To: References: <20220722031008.21819-1-huyue2@coolpad.com> Message-ID: <20220722123119.00003b90.zbestahu@gmail.com> On Fri, 22 Jul 2022 11:28:27 +0800 Gao Xiang wrote: > Hi Yue, > > On Fri, Jul 22, 2022 at 11:10:08AM +0800, Yue Hu wrote: > > The memory allocated for multiple devices should be freed when to exit. > > Let's add a helper to fix it since there is more than one to use it. 
> > > > Signed-off-by: Yue Hu > > --- > > dump/main.c | 7 ++++--- > > fsck/main.c | 7 ++++--- > > fuse/main.c | 5 +++-- > > include/erofs/internal.h | 1 + > > lib/super.c | 6 ++++++ > > 5 files changed, 18 insertions(+), 8 deletions(-) > > > > diff --git a/dump/main.c b/dump/main.c > > index 40e850a..c9b3a8f 100644 > > --- a/dump/main.c > > +++ b/dump/main.c > > @@ -615,7 +615,7 @@ int main(int argc, char **argv) > > err = erofs_read_superblock(); > > if (err) { > > erofs_err("failed to read superblock"); > > - goto exit_dev_close; > > + goto exit_put_super; > > } > > > > if (!dumpcfg.totalshow) { > > @@ -630,13 +630,14 @@ int main(int argc, char **argv) > > > > if (dumpcfg.show_extent && !dumpcfg.show_inode) { > > usage(); > > - goto exit_dev_close; > > + goto exit_put_super; > > } > > > > if (dumpcfg.show_inode) > > erofsdump_show_fileinfo(dumpcfg.show_extent); > > > > -exit_dev_close: > > +exit_put_super: > > + erofs_put_super(); > > dev_close(); > > exit: > > blob_closeall(); > > diff --git a/fsck/main.c b/fsck/main.c > > index 5a2f659..a8f0e24 100644 > > --- a/fsck/main.c > > +++ b/fsck/main.c > > @@ -813,12 +813,12 @@ int main(int argc, char **argv) > > err = erofs_read_superblock(); > > if (err) { > > erofs_err("failed to read superblock"); > > - goto exit_dev_close; > > + goto exit_put_super; > > Why do we call erofs_put_super() again here? I think we don't need to > call erofs_put_super for all failed paths. There is a call to dev_read() which may fails after allocating memory. Let me send v2 for this. > > Thanks, > Gao Xiang From zbestahu at gmail.com Fri Jul 22 15:36:10 2022 From: zbestahu at gmail.com (Yue Hu) Date: Fri, 22 Jul 2022 13:36:10 +0800 Subject: [PATCH v2] erofs-utils: fix a memory leak of multiple devices Message-ID: <20220722053610.23912-1-huyue2@coolpad.com> The memory allocated for multiple devices should be freed after use. Let's add a helper to fix it since there is more than one to use it. 
Signed-off-by: Yue Hu --- dump/main.c | 4 +++- fsck/main.c | 4 +++- fuse/main.c | 2 ++ include/erofs/internal.h | 1 + lib/super.c | 12 +++++++++++- 5 files changed, 20 insertions(+), 3 deletions(-) diff --git a/dump/main.c b/dump/main.c index 40e850a..f2a09b6 100644 --- a/dump/main.c +++ b/dump/main.c @@ -630,12 +630,14 @@ int main(int argc, char **argv) if (dumpcfg.show_extent && !dumpcfg.show_inode) { usage(); - goto exit_dev_close; + goto exit_put_super; } if (dumpcfg.show_inode) erofsdump_show_fileinfo(dumpcfg.show_extent); +exit_put_super: + erofs_put_super(); exit_dev_close: dev_close(); exit: diff --git a/fsck/main.c b/fsck/main.c index 5a2f659..8ed3fc5 100644 --- a/fsck/main.c +++ b/fsck/main.c @@ -818,7 +818,7 @@ int main(int argc, char **argv) if (erofs_sb_has_sb_chksum() && erofs_check_sb_chksum()) { erofs_err("failed to verify superblock checksum"); - goto exit_dev_close; + goto exit_put_super; } err = erofsfsck_check_inode(sbi.root_nid, sbi.root_nid); @@ -843,6 +843,8 @@ int main(int argc, char **argv) } } +exit_put_super: + erofs_put_super(); exit_dev_close: dev_close(); exit: diff --git a/fuse/main.c b/fuse/main.c index 95f939e..3e55bb8 100644 --- a/fuse/main.c +++ b/fuse/main.c @@ -299,6 +299,8 @@ int main(int argc, char *argv[]) } ret = fuse_main(args.argc, args.argv, &erofs_ops, NULL); + + erofs_put_super(); err_dev_close: blob_closeall(); dev_close(); diff --git a/include/erofs/internal.h b/include/erofs/internal.h index 6a70f11..48498fe 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -318,6 +318,7 @@ struct erofs_map_dev { /* super.c */ int erofs_read_superblock(void); +void erofs_put_super(void); /* namei.c */ int erofs_read_inode_from_disk(struct erofs_inode *vi); diff --git a/lib/super.c b/lib/super.c index f486eb7..b267412 100644 --- a/lib/super.c +++ b/lib/super.c @@ -46,14 +46,18 @@ static int erofs_init_devices(struct erofs_sb_info *sbi, sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; sbi->devs 
= calloc(ondisk_extradevs, sizeof(*sbi->devs)); + if (!sbi->devs) + return -ENOMEM; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; for (i = 0; i < ondisk_extradevs; ++i) { struct erofs_deviceslot dis; int ret; ret = dev_read(0, &dis, pos, sizeof(dis)); - if (ret < 0) + if (ret < 0) { + free(sbi->devs); return ret; + } sbi->devs[i].mapped_blkaddr = dis.mapped_blkaddr; sbi->total_blocks += dis.blocks; @@ -109,3 +113,9 @@ int erofs_read_superblock(void) memcpy(&sbi.uuid, dsb->uuid, sizeof(dsb->uuid)); return erofs_init_devices(&sbi, dsb); } + +void erofs_put_super(void) +{ + if (sbi.devs) + free(sbi.devs); +} -- 2.17.1 From jefflexu at linux.alibaba.com Fri Jul 22 16:10:35 2022 From: jefflexu at linux.alibaba.com (JeffleXu) Date: Fri, 22 Jul 2022 14:10:35 +0800 Subject: [PATCH] erofs: get rid of the leftover PAGE_SIZE in dir.c In-Reply-To: <20220619150940.121005-1-hsiangkao@linux.alibaba.com> References: <20220619150940.121005-1-hsiangkao@linux.alibaba.com> Message-ID: <528c0378-90c2-8bcd-032c-837fc82bb321@linux.alibaba.com> On 6/19/22 11:09 PM, Gao Xiang wrote: > Convert the last hardcoded PAGE_SIZEs of uncompressed cases. 
> > Signed-off-by: Gao Xiang > --- > fs/erofs/dir.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c > index 18e59821c597..723f5223a4fa 100644 > --- a/fs/erofs/dir.c > +++ b/fs/erofs/dir.c > @@ -90,7 +90,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) > > nameoff = le16_to_cpu(de->nameoff); > if (nameoff < sizeof(struct erofs_dirent) || > - nameoff >= PAGE_SIZE) { > + nameoff >= EROFS_BLKSIZ) { > erofs_err(dir->i_sb, > "invalid de[0].nameoff %u @ nid %llu", > nameoff, EROFS_I(dir)->nid); > @@ -99,7 +99,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) > } > > maxsize = min_t(unsigned int, > - dirsize - ctx->pos + ofs, PAGE_SIZE); > + dirsize - ctx->pos + ofs, EROFS_BLKSIZ); > > /* search dirents at the arbitrary position */ > if (initial) { LGTM. Reviewed-by: Jeffle Xu -- Thanks, Jeffle From jefflexu at linux.alibaba.com Fri Jul 22 18:16:26 2022 From: jefflexu at linux.alibaba.com (JeffleXu) Date: Fri, 22 Jul 2022 16:16:26 +0800 Subject: [PATCH v3] erofs: update ctx->pos for every emitted dirent In-Reply-To: <20220629081550.23501-1-hongnan.li@linux.alibaba.com> References: <20220527072536.68516-1-hongnan.li@linux.alibaba.com> <20220629081550.23501-1-hongnan.li@linux.alibaba.com> Message-ID: The patch itself looks good to me. On 6/29/22 4:15 PM, Hongnan Li wrote: > erofs_readdir update ctx->pos after filling a batch of dentries > and it may cause dir/files duplication for NFS readdirplus which > depends on ctx->pos to fill dir correctly. So update ctx->pos for > every emitted dirent in erofs_fill_dentries to fix it. 
> > Fixes: 3e917cc305c6 ("erofs: make filesystem exportable") > Signed-off-by: Hongnan Li > --- > fs/erofs/dir.c | 16 +++++++--------- > 1 file changed, 7 insertions(+), 9 deletions(-) > > diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c > index 18e59821c597..6fc325052853 100644 > --- a/fs/erofs/dir.c > +++ b/fs/erofs/dir.c > @@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, > } > > static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, > - void *dentry_blk, unsigned int *ofs, > + void *dentry_blk, struct erofs_dirent *de, > unsigned int nameoff, unsigned int maxsize) > { > - struct erofs_dirent *de = dentry_blk + *ofs; > const struct erofs_dirent *end = dentry_blk + nameoff; > > while (de < end) { > @@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, > /* stopped by some reason */ > return 1; > ++de; > - *ofs += sizeof(struct erofs_dirent); > + ctx->pos += sizeof(struct erofs_dirent); > } > - *ofs = maxsize; > return 0; > } > > @@ -95,7 +93,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) > "invalid de[0].nameoff %u @ nid %llu", > nameoff, EROFS_I(dir)->nid); > err = -EFSCORRUPTED; > - goto skip_this; > + break; > } > > maxsize = min_t(unsigned int, > @@ -106,17 +104,17 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) > initial = false; > > ofs = roundup(ofs, sizeof(struct erofs_dirent)); > + ctx->pos = blknr_to_addr(i) + ofs; > if (ofs >= nameoff) > goto skip_this; Besides, I thinks there's another issue with erofs_readdir() here (though unrelated to the issue this patch wants to fix). We need to update ctx->pos correctly if the initial file position has exceeded nameoff. ctx->pos needs to be updated to the end of EROFS_BLKSIZ or directory's i_size, surpassing the remaining name string in the current EROFS block. 
> } > > - err = erofs_fill_dentries(dir, ctx, de, &ofs, > + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, > nameoff, maxsize); > -skip_this: > - ctx->pos = blknr_to_addr(i) + ofs; > - > if (err) > break; > + ctx->pos = blknr_to_addr(i) + maxsize; It's quite easy to fix the above issue. We only need to move this line beneath skip_this label. > +skip_this:> ++i; > ofs = 0; > } like: skip_this: ctx->pos = blknr_to_addr(i) + maxsize; ++i; ofs = 0; Thus we'd better fold this simple fix into this patch. -- Thanks, Jeffle From jefflexu at linux.alibaba.com Fri Jul 22 18:27:32 2022 From: jefflexu at linux.alibaba.com (Jeffle Xu) Date: Fri, 22 Jul 2022 16:27:32 +0800 Subject: [PATCH v4] erofs: update ctx->pos for every emitted dirent Message-ID: <20220722082732.30935-1-jefflexu@linux.alibaba.com> From: Hongnan Li erofs_readdir update ctx->pos after filling a batch of dentries and it may cause dir/files duplication for NFS readdirplus which depends on ctx->pos to fill dir correctly. So update ctx->pos for every emitted dirent in erofs_fill_dentries to fix it. Also fix the update of ctx->pos when the initial file position has exceeded nameoff. 
Fixes: 3e917cc305c6 ("erofs: make filesystem exportable") Signed-off-by: Hongnan Li Signed-off-by: Jeffle Xu --- fs/erofs/dir.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 18e59821c597..47c85f1b80d8 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, } static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, - void *dentry_blk, unsigned int *ofs, + void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) { - struct erofs_dirent *de = dentry_blk + *ofs; const struct erofs_dirent *end = dentry_blk + nameoff; while (de < end) { @@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, /* stopped by some reason */ return 1; ++de; - *ofs += sizeof(struct erofs_dirent); + ctx->pos += sizeof(struct erofs_dirent); } - *ofs = maxsize; return 0; } @@ -95,7 +93,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; - goto skip_this; + break; } maxsize = min_t(unsigned int, @@ -106,17 +104,17 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); + ctx->pos = blknr_to_addr(i) + ofs; if (ofs >= nameoff) goto skip_this; } - err = erofs_fill_dentries(dir, ctx, de, &ofs, + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); -skip_this: - ctx->pos = blknr_to_addr(i) + ofs; - if (err) break; +skip_this: + ctx->pos = blknr_to_addr(i) + maxsize; ++i; ofs = 0; } -- 2.27.0 From jefflexu at linux.alibaba.com Fri Jul 22 18:50:45 2022 From: jefflexu at linux.alibaba.com (JeffleXu) Date: Fri, 22 Jul 2022 16:50:45 +0800 Subject: [PATCH v2] erofs: get rid of erofs_prepare_dio() helper In-Reply-To: 
<20220720082229.12172-1-hsiangkao@linux.alibaba.com> References: <20220720082229.12172-1-hsiangkao@linux.alibaba.com> Message-ID: <2dd86247-2fd4-8785-8545-009081291cab@linux.alibaba.com> On 7/20/22 4:22 PM, Gao Xiang wrote: > Fold in erofs_prepare_dio() in order to simplify the code. > > Signed-off-by: Gao Xiang > --- > v1: https://lore.kernel.org/r/20220506194612.117120-1-hsiangkao at linux.alibaba.com > > fs/erofs/data.c | 39 +++++++++++++++------------------------ > 1 file changed, 15 insertions(+), 24 deletions(-) > > diff --git a/fs/erofs/data.c b/fs/erofs/data.c > index fbb037ba326e..fe8ac0e163f7 100644 > --- a/fs/erofs/data.c > +++ b/fs/erofs/data.c > @@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) > return iomap_bmap(mapping, block, &erofs_iomap_ops); > } > > -static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) > +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > { > struct inode *inode = file_inode(iocb->ki_filp); > - loff_t align = iocb->ki_pos | iov_iter_count(to) | > - iov_iter_alignment(to); > - struct block_device *bdev = inode->i_sb->s_bdev; > - unsigned int blksize_mask; > - > - if (bdev) > - blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; > - else > - blksize_mask = (1 << inode->i_blkbits) - 1; > > - if (align & blksize_mask) > - return -EINVAL; > - return 0; > -} > - > -static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > -{ > /* no need taking (shared) inode lock since it's a ro filesystem */ > if (!iov_iter_count(to)) > return 0; > > #ifdef CONFIG_FS_DAX > - if (IS_DAX(iocb->ki_filp->f_mapping->host)) > + if (IS_DAX(inode)) > return dax_iomap_rw(iocb, to, &erofs_iomap_ops); > #endif > if (iocb->ki_flags & IOCB_DIRECT) { > - int err = erofs_prepare_dio(iocb, to); > + struct block_device *bdev = inode->i_sb->s_bdev; > + unsigned int blksize_mask; > + > + if (bdev) > + blksize_mask = bdev_logical_block_size(bdev) 
- 1; > + else > + blksize_mask = (1 << inode->i_blkbits) - 1; > + > + if ((iocb->ki_pos | iov_iter_count(to) | > + iov_iter_alignment(to)) & blksize_mask) > + return -EINVAL; > > - if (!err) > - return iomap_dio_rw(iocb, to, &erofs_iomap_ops, > - NULL, 0, NULL, 0); > - if (err < 0) > - return err; > + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, > + NULL, 0, NULL, 0); > } > return filemap_read(iocb, to, 0); > } LGTM. Reviewed-by: Jeffle Xu -- Thanks, Jeffle From outreach at globalkingprojectmanagement.com Sat Jul 23 04:47:06 2022 From: outreach at globalkingprojectmanagement.com (Global King Project Management) Date: Fri, 22 Jul 2022 18:47:06 +0000 Subject: invitation for Project Monitoring and Evaluation with Data Management and Analysis Course Message-ID: <4uhf96nx1rdm.Fik9OYrPP3UZu62Lkprfbw2@tracking.globalkingprojectmanagement.com> ? Project Monitoring and Evaluation with Data Management and Analysis Course August15 2022 to August 26 2022 Mombasa Kenya?Venue:? ?Onsite Mombasa or Nairobi Kenya or online platform.?Registration:? Use this link to register? Download Admission Form:? Download Our Calendar:? View all our courses? Reach us through WhatsApp?Reach us: training at globalkingprojectmanagement.org / +254 114 830 889 ?About the course This is a comprehensive 10 days M&E course that covers the principles and practices for results-based monitoring and evaluation for the entire project life cycle. This course equips participants with skills in setting up and implementing results-based monitoring and evaluation systems including M&E data management, analysis, and reporting. The participants will benefit from the latest M&E thinking and practices including the results and participatory approaches. This course is designed to enable the participants become experts in monitoring and evaluating their development projects. The course covers all the key elements of a robust M&E system coupled with a practical project to illustrate the M&E concepts.? 
Target Participants This course is designed for researchers, project staff, development practitioners, managers and decision makers who are responsible for project, program or organization-level M&E. The course aims to enhance the skills of professionals who need to research, supervise, manage, plan, implement, monitor and evaluate development projects. Course duration 10 days Course objectives Develop project results levels Design a project using logical framework Develop indicators and targets for each result level Track performance indicators over the life of the project Evaluation a project against the set results Develop and implement M&E systems Develop a comprehensive monitoring and evaluation plan Use data analysis software (Stata/SPSS/R) Collect data using mobile data collection tools Carryout impact evaluation Use GIS to analyze and share project data Course Outline Introduction to Results Based Project Management Fundamentals of Results Based Management Why is RBM important? Results based management vs traditional projects management RBM Lifecycle (seven phases) Areas of focus of RBM Fundamentals of Monitoring and Evaluation Definition of Monitoring and Evaluation Why Monitoring and Evaluation is important Key principles and concepts in M&E M&E in project lifecycle Participatory M&E Project Analysis Situation Analysis Needs Assessment Strategy Analysis Design of Results in Monitoring and Evaluation Results chain approaches: Impact, outcomes, outputs and activities Results framework M&E causal pathway Principles of planning, monitoring and evaluating for results M&E Indicators Indicators definition Indicator metrics Linking indicators to results Indicator matrix Tracking of indicators Logical Framework Approach LFA ? 
Analysis and Planning phase Design of logframe Risk rating in logframe Horizontal and vertical logic in logframe Using logframe to create schedules: Activity and Budget schedules Using logframe as a project management tool Theory of Change Overview of theory of change Developing theory of change Theory of Change vs Log Frame Case study: Theory of change M&E Systems What is an M&E System? Elements of M&E System Steps for developing Results based M&E System M&E Planning Importance of an M&E Plan Documenting M&E System in the M&E Plan Components of an M&E Plan-Monitoring, Evaluation, Data management, Reporting Using M&E Plan to implement M&E in a Project M&E plan vs Performance Management Plan (PMP) Base Survey in Results based M&E Importance of baseline studies Process of conducting baseline studies Baseline study vs evaluation Project Performance Evaluation Process and progress evaluations Evaluation research design Evaluation questions Evaluation report Dissemination M&E Data Management Different sources of M&E data Qualitative data collection methods Quantitative data collection methods Participatory methods of data collection Data Quality Assessment M&E Results Use and Dissemination Stakeholder?s information needs Use of M&E results to improve and strengthen projects Use of M&E Lessons learnt and Best Practices Organization knowledge champions M&E reporting format M&E results communication strategies Gender Perspective in M&E Importance of gender in M&E Integrating gender into program logic Setting gender sensitive indicators Collecting gender disaggregated data Analyzing M&E data from a gender perspective Appraisal of projects from a gender perspective Data Collection Tools and Techniques Sources of M&E data ?primary and secondary Sampling during data collection Qualitative data collection methods Quantitative data collection methods Participatory data collection methods Introduction to data triangulation Data Quality What is data quality? Why data quality? 
Data quality standards Data flow and data quality Data Quality Assessments M&E system design for data quality ICT in Monitoring and Evaluation Mobile based data collection using ODK Data visualization - info graphics and dashboards Use of ICT tools for Real-time monitoring and evaluation Qualitative Data Analysis Principles of qualitative data analysis Data preparation for qualitative analysis Linking and integrating multiple data sets in different forms Thematic analysis for qualitative data Content analysis for qualitative data Manipulation and analysis of data using NVivo Quantitative Data Analysis ? (Using SPSS/Stata) Introduction to statistical concepts Creating variables and data entry Data reconstruction Variables manipulation Descriptive statistics Understanding data weighting Inferential statistics: hypothesis testing, T-test, ANOVA, regression analysis Impact Assessment Introduction to impact evaluation Attribution in impact evaluation Estimation of counterfactual Impact evaluation methods: Double difference, Propensity score matching GIS in M&E Introduction to GIS in M&E GIS analysis and mapping techniques Data preparation for geospatial analysis Geospatial analysis using GIS software (QGIS) ? General Notes ????????? All our courses can be Tailor-made to participants needs The participant must be conversant with English ?Presentations are well guided, practical exercises, web-based tutorials, and group work. Our facilitators are experts?with more than 10years of experience ?Upon completion of training, the participant will be issued with a Global King Project Management certificate Training will be done at the Global King Project Management Centers (Nairobi Kenya, Mombasa Kenya, Kigali Rwanda, Dubai, Lagos Nigeria and More others) Course duration is flexible and the contents can be modified to fit any number of days.?????? 
Payment should be done two weeks before commencement of the training, to the Global King Project Management account, so as to enable us to prepare better for you. ????????? The course fee for onsite training includes facilitation training materials, 2 coffee breaks, a buffet lunch, and a Certificate of successful completion of Training. Participants will be responsible for their own travel expenses and arrangements, airport transfers, visa application dinners, health/accident insurance, and other personal expenses. ????????? ???????Accommodation, pickup, freight booking, and Visa processing arrangement, are done on request, at discounted prices.????????????????????? ????????? ??Tablet and Laptops are provided to participants on request as an add-on cost to the training fee. ????????? ??One-year?free Consultation and Coaching provided after the course. ????????? ?Register as a group of more than two and enjoy a discount of (10% to 50%) ????????? ???? Payment should be done before commence of the training or as agreed by the parties, to the GLOBAL KING PROJECT MANAGEMENT account, so as to enable us to prepare better for you. ????????? ?For any inquiry to:??training at globalkingprojectmanagement.org ?or ?+254 114 830 889? ? ????????? ?Website:? www. globalkingprojectmanagement.org? ? Indicative Budget The registration cost is indicated in every course for online and onsite trainings find the group rate discount below: Group Size Discount Rate for Group 2-7 ??? 20% ?8-21 ??? 25% 22-31 ??? 30% 32-41 ??? 35% 42-51 ??? 40% 52>Above ??? 50% ? ? ? Important? GKPM Links:?? All Courses Project Management Trainings Research training Courses Leadership Training Courses ? Download our Calendar Yours Sincerely,? ??? ? Virginia Wamaitha Project Management Expert and PMP certified Global King Project Management Tele: +254114830889 Email Address: training at globalkingprojectmanagement.org? Transnational Plaza 3rd Floor Nairobi, Kenya ??? ? 
UNSUBSCRIBE -------------- next part -------------- An HTML attachment was scrubbed... URL: From lkp at intel.com Sat Jul 23 12:25:40 2022 From: lkp at intel.com (kernel test robot) Date: Sat, 23 Jul 2022 10:25:40 +0800 Subject: [xiang-erofs:dev] BUILD SUCCESS cc2a171372c68ee64916eb65a962b3aba9ea56ad Message-ID: <62db5c24.Pg30R36qSsF5RtMC%lkp@intel.com> tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev branch HEAD: cc2a171372c68ee64916eb65a962b3aba9ea56ad erofs: get rid of the leftover PAGE_SIZE in dir.c elapsed time: 725m configs tested: 86 configs skipped: 3 The following configs have been built successfully. More configs may be tested in the coming days. gcc tested configs: arm defconfig arm allyesconfig arm64 allyesconfig i386 randconfig-c001 arc nsim_700_defconfig sparc sparc64_defconfig s390 allmodconfig m68k bvme6000_defconfig m68k atari_defconfig sh r7785rp_defconfig m68k defconfig powerpc mpc834x_itx_defconfig m68k m5407c3_defconfig mips jazz_defconfig powerpc mpc8540_ads_defconfig openrisc simple_smp_defconfig nios2 3c120_defconfig sh sh03_defconfig xtensa alldefconfig powerpc sequoia_defconfig powerpc ppc64_defconfig csky defconfig arm gemini_defconfig powerpc mgcoge_defconfig sh se7751_defconfig arc vdk_hs38_smp_defconfig alpha defconfig parisc64 alldefconfig loongarch defconfig loongarch allnoconfig ia64 allmodconfig csky allnoconfig alpha allnoconfig arc allnoconfig riscv allnoconfig m68k allmodconfig arc allyesconfig alpha allyesconfig m68k allyesconfig powerpc allnoconfig mips allyesconfig powerpc allmodconfig sh allmodconfig i386 defconfig i386 allyesconfig x86_64 randconfig-a004 x86_64 randconfig-a002 x86_64 randconfig-a006 i386 randconfig-a001 i386 randconfig-a003 i386 randconfig-a005 x86_64 randconfig-a015 x86_64 randconfig-a013 x86_64 randconfig-a011 i386 randconfig-a014 i386 randconfig-a016 i386 randconfig-a012 arc randconfig-r043-20220721 x86_64 rhel-8.3-kselftests um i386_defconfig um x86_64_defconfig x86_64 
defconfig x86_64 allyesconfig x86_64 rhel-8.3 x86_64 rhel-8.3-func x86_64 rhel-8.3-syz x86_64 rhel-8.3-kunit clang tested configs: arm hackkit_defconfig arm colibri_pxa300_defconfig x86_64 randconfig-k001 x86_64 randconfig-a005 x86_64 randconfig-a003 x86_64 randconfig-a001 i386 randconfig-a006 i386 randconfig-a002 i386 randconfig-a004 x86_64 randconfig-a012 x86_64 randconfig-a014 x86_64 randconfig-a016 i386 randconfig-a015 i386 randconfig-a013 i386 randconfig-a011 hexagon randconfig-r041-20220721 hexagon randconfig-r045-20220721 riscv randconfig-r042-20220721 s390 randconfig-r044-20220721 -- 0-DAY CI Kernel Test Service https://01.org/lkp From zbestahu at gmail.com Sat Jul 23 17:17:12 2022 From: zbestahu at gmail.com (Yue Hu) Date: Sat, 23 Jul 2022 15:17:12 +0800 Subject: [RFC PATCH v2 0/3] erofs-utils: compressed fragments feature Message-ID: In order to achieve greater compression ratio, let's introduce compressed fragments feature which can merge tail of per-file or the whole files into one special inode to reach the target. And we can also set pcluster size to fragments inode for different compression requirments. In this patchset, we also improve the uncompressed data layout of compressed files. Just write it from 'clusterofs' instead of 0 since it can benefit from in-place I/O. For now, it only goes with fragments. The main idea above is from Xiang. 
Here is some test data of Linux 5.10.87 source code under Ubuntu 18.04: linux-5.10.87 (erofs, uncompressed) 1.1G linux-5.10.87 (erofs, lz4hc,12 4k fragments,4k) 301M linux-5.10.87 (erofs, lz4hc,12 8k fragments,8k) 268M linux-5.10.87 (erofs, lz4hc,12 16k fragments,16k) 242M linux-5.10.87 (erofs, lz4hc,12 32k fragments,32k) 225M linux-5.10.87 (erofs, lz4hc,12 64k fragments,64k) 217M linux-5.10.87 (erofs, lz4hc,12 4k vanilla) 396M linux-5.10.87 (erofs, lz4hc,12 8k vanilla) 376M linux-5.10.87 (erofs, lz4hc,12 16k vanilla) 364M linux-5.10.87 (erofs, lz4hc,12 32k vanilla) 359M linux-5.10.87 (erofs, lz4hc,12 64k vanilla) 358M Usage: mkfs.erofs -zlz4hc,12 -C65536 -Efragments,65536 foo.erofs.img foo/ Changes since v1: - mainly optimize index space for fragment inode; - add merging tail with len <= pclustersize into fragments directly; - use a inode instead of nid to avoid multiple load fragments; - fix memory leak of building fragments; - minor change to diff special fragments with normal inode. - rebase to commit cb058526 with patch [1]; - code cleanup. Note that inode will be extended version (64 bytes) due to mtime, may use 'force-inode-compact' option to reduce the size if mtime careless. 
[1] https://lore.kernel.org/linux-erofs/20220722053610.23912-1-huyue2 at coolpad.com/ Yue Hu (3): erofs-utils: lib: add support for fragments data decompression erofs-utils: lib: support on-disk offset for shifted decompression erofs-utils: introduce compressed fragments support include/erofs/compress.h | 3 +- include/erofs/config.h | 3 +- include/erofs/decompress.h | 3 ++ include/erofs/fragments.h | 25 +++++++++ include/erofs/inode.h | 2 + include/erofs/internal.h | 14 +++++ include/erofs_fs.h | 24 ++++++--- lib/Makefile.am | 4 +- lib/compress.c | 108 +++++++++++++++++++++++++++---------- lib/data.c | 58 ++++++++++++++++++-- lib/decompress.c | 10 +++- lib/fragments.c | 76 ++++++++++++++++++++++++++ lib/inode.c | 43 ++++++++++----- lib/super.c | 24 ++++++++- lib/zmap.c | 18 +++++-- mkfs/main.c | 64 +++++++++++++++++++--- 16 files changed, 412 insertions(+), 67 deletions(-) create mode 100644 include/erofs/fragments.h create mode 100644 lib/fragments.c -- 2.17.1 From zbestahu at gmail.com Sat Jul 23 17:17:14 2022 From: zbestahu at gmail.com (Yue Hu) Date: Sat, 23 Jul 2022 15:17:14 +0800 Subject: [RFC PATCH v2 2/3] erofs-utils: lib: support on-disk offset for shifted decompression In-Reply-To: References: Message-ID: Add support to uncompressed data layout with on-disk offset for compressed files. 
Signed-off-by: Yue Hu --- include/erofs/decompress.h | 3 +++ lib/data.c | 8 +++++++- lib/decompress.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/erofs/decompress.h b/include/erofs/decompress.h index 82bf7b8..b622df5 100644 --- a/include/erofs/decompress.h +++ b/include/erofs/decompress.h @@ -23,6 +23,9 @@ struct z_erofs_decompress_req { unsigned int decodedskip; unsigned int inputsize, decodedlength; + /* head offset of uncompressed data */ + unsigned int shiftedhead; + /* indicate the algorithm will be used for decompression */ unsigned int alg; bool partial_decoding; diff --git a/lib/data.c b/lib/data.c index 2f3ebb8..16e2ffd 100644 --- a/lib/data.c +++ b/lib/data.c @@ -226,7 +226,7 @@ static int z_erofs_do_read_data(struct erofs_inode *inode, char *buffer, }; struct erofs_map_dev mdev; bool partial; - unsigned int bufsize = 0; + unsigned int bufsize = 0, head; char *raw = NULL; int ret = 0; @@ -308,10 +308,16 @@ static int z_erofs_do_read_data(struct erofs_inode *inode, char *buffer, if (ret < 0) break; + head = 0; + if (erofs_sb_has_fragments() && + map.m_algorithmformat == Z_EROFS_COMPRESSION_SHIFTED) + head = erofs_blkoff(end); + ret = z_erofs_decompress(&(struct z_erofs_decompress_req) { .in = raw, .out = buffer + end - offset, .decodedskip = skip, + .shiftedhead = head, .inputsize = map.m_plen, .decodedlength = length, .alg = map.m_algorithmformat, diff --git a/lib/decompress.c b/lib/decompress.c index 1661f91..08a0861 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -132,14 +132,20 @@ out: int z_erofs_decompress(struct z_erofs_decompress_req *rq) { if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { + unsigned int count, rightpart; + if (rq->inputsize > EROFS_BLKSIZ) return -EFSCORRUPTED; DBG_BUGON(rq->decodedlength > EROFS_BLKSIZ); DBG_BUGON(rq->decodedlength < rq->decodedskip); - memcpy(rq->out, rq->in + rq->decodedskip, - rq->decodedlength - rq->decodedskip); + count = rq->decodedlength - rq->decodedskip; + 
rightpart = min(EROFS_BLKSIZ - rq->shiftedhead, count); + + memcpy(rq->out, rq->in + (erofs_sb_has_fragments() ? + rq->shiftedhead : rq->decodedskip), rightpart); + memcpy(rq->out + rightpart, rq->in, count - rightpart); return 0; } -- 2.17.1 From zbestahu at gmail.com Sat Jul 23 17:17:13 2022 From: zbestahu at gmail.com (Yue Hu) Date: Sat, 23 Jul 2022 15:17:13 +0800 Subject: [RFC PATCH v2 1/3] erofs-utils: lib: add support for fragments data decompression In-Reply-To: References: Message-ID: <1385bcc612436d7fa149950515b816347528e5d5.1658556336.git.huyue2@coolpad.com> Add compressed fragments support for erofsfuse. Signed-off-by: Yue Hu --- include/erofs/internal.h | 13 +++++++++++ include/erofs_fs.h | 19 +++++++++++---- lib/data.c | 50 ++++++++++++++++++++++++++++++++++++++-- lib/zmap.c | 18 ++++++++++++--- 4 files changed, 90 insertions(+), 10 deletions(-) diff --git a/include/erofs/internal.h b/include/erofs/internal.h index 48498fe..ce235de 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -102,6 +102,7 @@ struct erofs_sb_info { u16 devt_slotoff; /* used for mkfs */ u16 device_id_mask; /* used for others */ }; + struct erofs_inode *fragments_inode; }; /* global sbi */ @@ -132,6 +133,7 @@ EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) +EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) #define EROFS_I_EA_INITED (1 << 0) @@ -190,6 +192,8 @@ struct erofs_inode { void *eof_tailraw; unsigned int eof_tailrawsize; + erofs_off_t fragmentoff; + union { void *compressmeta; void *chunkindexes; @@ -201,6 +205,7 @@ struct erofs_inode { uint64_t z_tailextent_headlcn; unsigned int z_idataoff; #define z_idata_size idata_size +#define z_fragmentoff fragmentoff }; }; #ifdef 
WITH_ANDROID @@ -208,6 +213,11 @@ struct erofs_inode { #endif }; +static inline erofs_off_t z_erofs_map_header_pos(struct erofs_inode *vi) +{ + return round_up(iloc(vi->nid) + vi->inode_isize + vi->xattr_isize, 8); +} + static inline bool is_inode_layout_compression(struct erofs_inode *inode) { return erofs_inode_is_data_compressed(inode->datalayout); @@ -276,6 +286,7 @@ enum { BH_Mapped, BH_Encoded, BH_FullMapped, + BH_Fragments, }; /* Has a disk mapping */ @@ -286,6 +297,8 @@ enum { #define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +/* Located in fragments */ +#define EROFS_MAP_FRAGMENTS (1 << BH_Fragments) struct erofs_map_blocks { char mpage[EROFS_BLKSIZ]; diff --git a/include/erofs_fs.h b/include/erofs_fs.h index 08f9761..d957ebb 100644 --- a/include/erofs_fs.h +++ b/include/erofs_fs.h @@ -25,13 +25,15 @@ #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 +#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING) + EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ + EROFS_FEATURE_INCOMPAT_FRAGMENTS) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -294,16 +296,21 @@ struct z_erofs_lzma_cfgs { * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) + * bit 4 : fragment pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 +#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0010 struct 
z_erofs_map_header { - __le16 h_reserved1; - /* record the size of tailpacking data */ - __le16 h_idata_size; + union { + /* direct addressing for fragment offset */ + __le32 h_fragmentoff; + /* record the size of tailpacking data */ + __le16 h_idata_size; + }; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); @@ -312,12 +319,14 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-7 : reserved. + * bit 3-6 : reserved; + * bit 7 : merge the whole file into fragments or not. */ __u8 h_clusterbits; }; #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 +#define Z_EROFS_FRAGMENT_INODE_BIT 7 /* * Fixed-sized output compression ondisk Logical Extent cluster type: diff --git a/lib/data.c b/lib/data.c index 6bc554d..2f3ebb8 100644 --- a/lib/data.c +++ b/lib/data.c @@ -217,8 +217,8 @@ static int erofs_read_raw_data(struct erofs_inode *inode, char *buffer, return 0; } -static int z_erofs_read_data(struct erofs_inode *inode, char *buffer, - erofs_off_t size, erofs_off_t offset) +static int z_erofs_do_read_data(struct erofs_inode *inode, char *buffer, + erofs_off_t size, erofs_off_t offset) { erofs_off_t end, length, skip; struct erofs_map_blocks map = { @@ -275,6 +275,27 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer, continue; } + if (map.m_flags & EROFS_MAP_FRAGMENTS) { + char *out; + + out = malloc(length); + if (!out) { + ret = -ENOMEM; + break; + } + ret = z_erofs_do_read_data(sbi.fragments_inode, out, + length, + inode->z_fragmentoff); + if (ret < 0) { + free(out); + break; + } + memcpy(buffer + end - offset, out + skip, length - + skip); + free(out); + continue; + } + if (map.m_plen > bufsize) { bufsize = map.m_plen; raw = realloc(raw, bufsize); @@ -304,6 +325,31 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer, return ret < 0 ? 
ret : 0; } +static int z_erofs_read_data(struct erofs_inode *inode, char *buffer, + erofs_off_t size, erofs_off_t offset) +{ + struct erofs_inode *vi = inode; + + if (erofs_sb_has_fragments()) { + int ret; + struct z_erofs_map_header *h; + char buf[sizeof(struct z_erofs_map_header)]; + + ret = dev_read(0, buf, z_erofs_map_header_pos(inode), + sizeof(buf)); + if (ret < 0) + return -EIO; + + h = (struct z_erofs_map_header *)buf; + + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi = sbi.fragments_inode; + offset += le32_to_cpu(h->h_fragmentoff); + } + } + return z_erofs_do_read_data(vi, buffer, size, offset); +} + int erofs_pread(struct erofs_inode *inode, char *buf, erofs_off_t count, erofs_off_t offset) { diff --git a/lib/zmap.c b/lib/zmap.c index 95745c5..812bf32 100644 --- a/lib/zmap.c +++ b/lib/zmap.c @@ -32,7 +32,6 @@ int z_erofs_fill_inode(struct erofs_inode *vi) static int z_erofs_fill_inode_lazy(struct erofs_inode *vi) { int ret; - erofs_off_t pos; struct z_erofs_map_header *h; char buf[sizeof(struct z_erofs_map_header)]; @@ -42,9 +41,8 @@ static int z_erofs_fill_inode_lazy(struct erofs_inode *vi) DBG_BUGON(!erofs_sb_has_big_pcluster() && !erofs_sb_has_ztailpacking() && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); - pos = round_up(iloc(vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - ret = dev_read(0, buf, pos, sizeof(buf)); + ret = dev_read(0, buf, z_erofs_map_header_pos(vi), sizeof(buf)); if (ret < 0) return -EIO; @@ -83,6 +81,17 @@ static int z_erofs_fill_inode_lazy(struct erofs_inode *vi) if (ret < 0) return ret; } + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) { + struct erofs_map_blocks map = { .index = UINT_MAX }; + + ret = z_erofs_do_map_blocks(vi, &map, + EROFS_GET_BLOCKS_FINDTAIL); + if (ret < 0) { + erofs_err("failed to find tail for fragment pcluster"); + return ret; + } + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + } vi->flags |= EROFS_I_Z_INITED; return 0; } @@ -546,6 +555,7 @@ static int 
z_erofs_do_map_blocks(struct erofs_inode *vi, int flags) { bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + bool in_fragments = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = vi, .map = map, @@ -609,6 +619,8 @@ static int z_erofs_do_map_blocks(struct erofs_inode *vi, map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; + } else if (in_fragments && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_FRAGMENTS; } else { map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); -- 2.17.1 From zbestahu at gmail.com Sat Jul 23 17:17:15 2022 From: zbestahu at gmail.com (Yue Hu) Date: Sat, 23 Jul 2022 15:17:15 +0800 Subject: [RFC PATCH v2 3/3] erofs-utils: introduce compressed fragments support In-Reply-To: References: Message-ID: This approach can merge tail pclusters or the whole files into a special inode in order to achieve greater compression ratio. And an option of pcluster size is provided for different compression requirments. Meanwhile, we change to write the uncompressed data from 'clusterofs' when compressing files since it can benefit from in-place I/O. For now, this change goes with the fragments. 
Signed-off-by: Yue Hu --- include/erofs/compress.h | 3 +- include/erofs/config.h | 3 +- include/erofs/fragments.h | 25 +++++++++ include/erofs/inode.h | 2 + include/erofs/internal.h | 1 + include/erofs_fs.h | 5 +- lib/Makefile.am | 4 +- lib/compress.c | 108 ++++++++++++++++++++++++++++---------- lib/fragments.c | 76 +++++++++++++++++++++++++++ lib/inode.c | 43 ++++++++++----- lib/super.c | 24 ++++++++- mkfs/main.c | 64 +++++++++++++++++++--- 12 files changed, 304 insertions(+), 54 deletions(-) create mode 100644 include/erofs/fragments.h create mode 100644 lib/fragments.c diff --git a/include/erofs/compress.h b/include/erofs/compress.h index 24f6204..d17aadb 100644 --- a/include/erofs/compress.h +++ b/include/erofs/compress.h @@ -18,7 +18,8 @@ extern "C" #define EROFS_CONFIG_COMPR_MIN_SZ (32 * 1024) void z_erofs_drop_inline_pcluster(struct erofs_inode *inode); -int erofs_write_compressed_file(struct erofs_inode *inode); +int erofs_write_compressed_file_from_fd(struct erofs_inode *inode, int fd, + bool is_src); int z_erofs_compress_init(struct erofs_buffer_head *bh); int z_erofs_compress_exit(void); diff --git a/include/erofs/config.h b/include/erofs/config.h index 6c6d71f..b677c54 100644 --- a/include/erofs/config.h +++ b/include/erofs/config.h @@ -44,6 +44,7 @@ struct erofs_configure { char c_chunkbits; bool c_noinline_data; bool c_ztailpacking; + bool c_fragments; bool c_ignore_mtime; bool c_showprogress; @@ -62,7 +63,7 @@ struct erofs_configure { /* < 0, xattr disabled and INT_MAX, always use inline xattrs */ int c_inline_xattr_tolerance; - u32 c_pclusterblks_max, c_pclusterblks_def; + u32 c_pclusterblks_max, c_pclusterblks_def, c_pclusterblks_frags; u32 c_max_decompressed_extent_bytes; u32 c_dict_size; u64 c_unix_timestamp; diff --git a/include/erofs/fragments.h b/include/erofs/fragments.h new file mode 100644 index 0000000..89f0f18 --- /dev/null +++ b/include/erofs/fragments.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */ +/* + * 
Copyright (C), 2022, Coolpad Group Limited. + */ +#ifndef __EROFS_FRAGMENTS_H +#define __EROFS_FRAGMENTS_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "erofs/internal.h" + +int z_erofs_fill_fragments(struct erofs_inode *inode, void *data, + unsigned int len); +struct erofs_inode *erofs_mkfs_build_fragments(void); +int erofs_fragments_init(void); +void erofs_fragments_exit(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/erofs/inode.h b/include/erofs/inode.h index 79b39b0..0a87c58 100644 --- a/include/erofs/inode.h +++ b/include/erofs/inode.h @@ -21,6 +21,8 @@ unsigned int erofs_iput(struct erofs_inode *inode); erofs_nid_t erofs_lookupnid(struct erofs_inode *inode); struct erofs_inode *erofs_mkfs_build_tree_from_path(struct erofs_inode *parent, const char *path); +int erofs_prepare_inode_buffer(struct erofs_inode *inode); +struct erofs_inode *erofs_generate_inode(struct stat64 *st, const char *path); #ifdef __cplusplus } diff --git a/include/erofs/internal.h b/include/erofs/internal.h index ce235de..0f2a217 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -193,6 +193,7 @@ struct erofs_inode { unsigned int eof_tailrawsize; erofs_off_t fragmentoff; + unsigned int fragment_size; union { void *compressmeta; diff --git a/include/erofs_fs.h b/include/erofs_fs.h index d957ebb..1eaad68 100644 --- a/include/erofs_fs.h +++ b/include/erofs_fs.h @@ -75,7 +75,9 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved2[38]; + __u8 reserved[6]; + __le64 fragments_nid; + __u8 reserved2[24]; }; /* @@ -265,6 +267,7 @@ struct erofs_inode_chunk_index { /* maximum supported size of a physical compression cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) +#define Z_EROFS_PCLUSTER_MAX_BLKS (Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) /* available compression algorithm types (for 
h_algorithmtype) */ enum { diff --git a/lib/Makefile.am b/lib/Makefile.am index 3fad357..95f1d55 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -22,12 +22,14 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \ $(top_srcdir)/include/erofs/trace.h \ $(top_srcdir)/include/erofs/xattr.h \ $(top_srcdir)/include/erofs/compress_hints.h \ + $(top_srcdir)/include/erofs/fragments.h \ $(top_srcdir)/lib/liberofs_private.h noinst_HEADERS += compressor.h liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \ namei.c data.c compress.c compressor.c zmap.c decompress.c \ - compress_hints.c hashmap.c sha256.c blobchunk.c dir.c + compress_hints.c hashmap.c sha256.c blobchunk.c dir.c \ + fragments.c liberofs_la_CFLAGS = -Wall -I$(top_srcdir)/include if ENABLE_LZ4 liberofs_la_CFLAGS += ${LZ4_CFLAGS} diff --git a/lib/compress.c b/lib/compress.c index ee3b856..713c105 100644 --- a/lib/compress.c +++ b/lib/compress.c @@ -18,6 +18,7 @@ #include "compressor.h" #include "erofs/block_list.h" #include "erofs/compress_hints.h" +#include "erofs/fragments.h" static struct erofs_compress compresshandle; static unsigned int algorithmtype[2]; @@ -74,9 +75,9 @@ static void vle_write_indexes(struct z_erofs_vle_compress_ctx *ctx, if (!d1) { /* * A lcluster cannot have three parts with the middle one which - * is well-compressed for !ztailpacking cases. + * is well-compressed for !ztailpacking and !fragments cases. */ - DBG_BUGON(!raw && !cfg.c_ztailpacking); + DBG_BUGON(!raw && !cfg.c_ztailpacking && !cfg.c_fragments); type = raw ? 
Z_EROFS_VLE_CLUSTER_TYPE_PLAIN : Z_EROFS_VLE_CLUSTER_TYPE_HEAD; advise = cpu_to_le16(type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT); @@ -143,7 +144,7 @@ static int write_uncompressed_extent(struct z_erofs_vle_compress_ctx *ctx, unsigned int *len, char *dst) { int ret; - unsigned int count; + unsigned int count, offset, rcopied, rzeroed; /* reset clusterofs to 0 if permitted */ if (!erofs_sb_has_lz4_0padding() && ctx->clusterofs && @@ -153,11 +154,21 @@ static int write_uncompressed_extent(struct z_erofs_vle_compress_ctx *ctx, ctx->clusterofs = 0; } - /* write uncompressed data */ + /* + * write uncompressed data from clusterofs which can benefit from + * in-place I/O, loop shift right when to exceed EROFS_BLKSIZ. + */ count = min(EROFS_BLKSIZ, *len); - memcpy(dst, ctx->queue + ctx->head, count); - memset(dst + count, 0, EROFS_BLKSIZ - count); + offset = cfg.c_fragments ? ctx->clusterofs : 0; + rcopied = min(EROFS_BLKSIZ - offset, count); + rzeroed = EROFS_BLKSIZ - offset - rcopied; + + memcpy(dst + offset, ctx->queue + ctx->head, rcopied); + memcpy(dst, ctx->queue + ctx->head + rcopied, count - rcopied); + + memset(dst + offset + rcopied, 0, rzeroed); + memset(dst + count - rcopied, 0, EROFS_BLKSIZ - count - rzeroed); erofs_dbg("Writing %u uncompressed data to block %u", count, ctx->blkaddr); @@ -167,8 +178,11 @@ static int write_uncompressed_extent(struct z_erofs_vle_compress_ctx *ctx, return count; } -static unsigned int z_erofs_get_max_pclusterblks(struct erofs_inode *inode) +static unsigned int z_erofs_get_max_pclusterblks(struct erofs_inode *inode, + bool is_src) { + if (cfg.c_fragments && !is_src) + return cfg.c_pclusterblks_frags; #ifndef NDEBUG if (cfg.c_random_pclusterblks) return 1 + rand() % cfg.c_pclusterblks_max; @@ -224,7 +238,7 @@ static void tryrecompress_trailing(void *in, unsigned int *insize, static int vle_compress_one(struct erofs_inode *inode, struct z_erofs_vle_compress_ctx *ctx, - bool final) + bool final, bool is_src) { struct erofs_compress 
*const h = &compresshandle; unsigned int len = ctx->tail - ctx->head; @@ -234,14 +248,19 @@ static int vle_compress_one(struct erofs_inode *inode, char *const dst = dstbuf + EROFS_BLKSIZ; while (len) { - unsigned int pclustersize = - z_erofs_get_max_pclusterblks(inode) * EROFS_BLKSIZ; + unsigned int pclustersize = EROFS_BLKSIZ * + z_erofs_get_max_pclusterblks(inode, is_src); bool may_inline = (cfg.c_ztailpacking && final); + bool may_merge = (cfg.c_fragments && final && is_src); bool raw; if (len <= pclustersize) { if (!final) break; + if (may_merge) { + count = len; + goto fragments; + } if (!may_inline && len <= EROFS_BLKSIZ) goto nocompression; } @@ -294,6 +313,19 @@ nocompression: return ret; ctx->compressedblks = 1; raw = false; + } else if (may_merge && len == count && ret < pclustersize) { +fragments: + ret = z_erofs_fill_fragments(inode, + ctx->queue + ctx->head, + len); + if (ret < 0) + return ret; + if (inode->i_size == inode->fragment_size) { + ctx->head += len; + return 0; + } + ctx->compressedblks = 0; + raw = false; } else { unsigned int tailused, padding; @@ -546,13 +578,20 @@ static void z_erofs_write_mapheader(struct erofs_inode *inode, { struct z_erofs_map_header h = { .h_advise = cpu_to_le16(inode->z_advise), - .h_idata_size = cpu_to_le16(inode->idata_size), .h_algorithmtype = inode->z_algorithmtype[1] << 4 | inode->z_algorithmtype[0], /* lclustersize */ .h_clusterbits = inode->z_logical_clusterbits - 12, }; + if (cfg.c_fragments) + h.h_fragmentoff = cpu_to_le32(inode->fragmentoff); + else + h.h_idata_size = cpu_to_le16(inode->idata_size); + + if (inode->fragment_size && inode->i_size == inode->fragment_size) + h.h_clusterbits |= 1 << Z_EROFS_FRAGMENT_INODE_BIT; + memset(compressmeta, 0, Z_EROFS_LEGACY_MAP_HEADER_SIZE); /* write out map header */ memcpy(compressmeta, &h, sizeof(struct z_erofs_map_header)); @@ -604,30 +643,25 @@ void z_erofs_drop_inline_pcluster(struct erofs_inode *inode) inode->eof_tailraw = NULL; } -int 
erofs_write_compressed_file(struct erofs_inode *inode) +int erofs_write_compressed_file_from_fd(struct erofs_inode *inode, int fd, + bool is_src) { struct erofs_buffer_head *bh; static struct z_erofs_vle_compress_ctx ctx; erofs_off_t remaining; erofs_blk_t blkaddr, compressed_blocks; unsigned int legacymetasize; - int ret, fd; + int ret; u8 *compressmeta = malloc(vle_compressmeta_capacity(inode->i_size)); if (!compressmeta) return -ENOMEM; - fd = open(inode->i_srcpath, O_RDONLY | O_BINARY); - if (fd < 0) { - ret = -errno; - goto err_free_meta; - } - /* allocate main data buffer */ bh = erofs_balloc(DATA, 0, 0, 0); if (IS_ERR(bh)) { ret = PTR_ERR(bh); - goto err_close; + goto err_free_meta; } /* initialize per-file compression setting */ @@ -648,6 +682,9 @@ int erofs_write_compressed_file(struct erofs_inode *inode) inode->z_algorithmtype[1] = algorithmtype[1]; inode->z_logical_clusterbits = LOG_BLOCK_SIZE; + inode->idata_size = 0; + inode->fragment_size = 0; + blkaddr = erofs_mapbh(bh->block); /* start_blkaddr */ ctx.blkaddr = blkaddr; ctx.metacur = compressmeta + Z_EROFS_LEGACY_MAP_HEADER_SIZE; @@ -667,7 +704,7 @@ int erofs_write_compressed_file(struct erofs_inode *inode) remaining -= readcount; ctx.tail += readcount; - ret = vle_compress_one(inode, &ctx, !remaining); + ret = vle_compress_one(inode, &ctx, !remaining, is_src); if (ret) goto err_free_idata; } @@ -681,19 +718,20 @@ int erofs_write_compressed_file(struct erofs_inode *inode) vle_write_indexes_final(&ctx); legacymetasize = ctx.metacur - compressmeta; /* estimate if data compression saves space or not */ - if (compressed_blocks * EROFS_BLKSIZ + inode->idata_size + + if (!inode->fragment_size && + compressed_blocks * EROFS_BLKSIZ + inode->idata_size + legacymetasize >= inode->i_size) { ret = -ENOSPC; goto err_free_idata; } z_erofs_write_mapheader(inode, compressmeta); - close(fd); if (compressed_blocks) { ret = erofs_bh_balloon(bh, blknr_to_addr(compressed_blocks)); DBG_BUGON(ret != EROFS_BLKSIZ); } else { 
- DBG_BUGON(!inode->idata_size); + if (!cfg.c_fragments) + DBG_BUGON(!inode->idata_size); } erofs_info("compressed %s (%llu bytes) into %u blocks", @@ -716,7 +754,8 @@ int erofs_write_compressed_file(struct erofs_inode *inode) DBG_BUGON(ret); } inode->compressmeta = compressmeta; - erofs_droid_blocklist_write(inode, blkaddr, compressed_blocks); + if (is_src) + erofs_droid_blocklist_write(inode, blkaddr, compressed_blocks); return 0; err_free_idata: @@ -726,8 +765,6 @@ err_free_idata: } err_bdrop: erofs_bdrop(bh, true); /* revoke buffer */ -err_close: - close(fd); err_free_meta: free(compressmeta); return ret; @@ -833,14 +870,27 @@ int z_erofs_compress_init(struct erofs_buffer_head *sb_bh) * to be loaded in order to get those compressed block counts. */ if (cfg.c_pclusterblks_max > 1) { - if (cfg.c_pclusterblks_max > - Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + if (cfg.c_pclusterblks_max > Z_EROFS_PCLUSTER_MAX_BLKS) { erofs_err("unsupported clusterblks %u (too large)", cfg.c_pclusterblks_max); return -EINVAL; } + if (cfg.c_pclusterblks_frags > Z_EROFS_PCLUSTER_MAX_BLKS) { + erofs_err("unsupported clusterblks %u (too large for fragments)", + cfg.c_pclusterblks_frags); + return -EINVAL; + } + if (cfg.c_pclusterblks_frags == 1) { + erofs_err("physical cluster size of fragments should > 4096 bytes"); + return -EINVAL; + } erofs_sb_set_big_pcluster(); } + if (!erofs_sb_has_big_pcluster() && cfg.c_pclusterblks_frags > 1) { + erofs_err("invalid clusterblks %u (for fragments)", + cfg.c_pclusterblks_frags); + return -EINVAL; + } if (ret != Z_EROFS_COMPRESSION_LZ4) erofs_sb_set_compr_cfgs(); diff --git a/lib/fragments.c b/lib/fragments.c new file mode 100644 index 0000000..67e79b8 --- /dev/null +++ b/lib/fragments.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 +/* + * Copyright (C), 2022, Coolpad Group Limited. 
+ * Created by Yue Hu + */ +#define _GNU_SOURCE +#include +#include +#include +#include "erofs/err.h" +#include "erofs/inode.h" +#include "erofs/compress.h" +#include "erofs/print.h" +#include "erofs/fragments.h" + +static FILE *fragmentsfp; + +int z_erofs_fill_fragments(struct erofs_inode *inode, void *data, + unsigned int len) +{ + inode->z_advise |= Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + inode->fragmentoff = ftell(fragmentsfp); + inode->fragment_size = len; + + if (write(fileno(fragmentsfp), data, len) < 0) + return -EIO; + + erofs_sb_set_fragments(); + + erofs_dbg("Recording %u fragment data at %lu", inode->fragment_size, + inode->fragmentoff); + return len; +} + +struct erofs_inode *erofs_mkfs_build_fragments(void) +{ + struct stat64 st; + struct erofs_inode *inode; + int ret, fd = fileno(fragmentsfp); + + ret = fstat64(fd, &st); + if (ret) + return ERR_PTR(-errno); + + inode = erofs_generate_inode(&st, NULL); + if (IS_ERR(inode)) + return inode; + + fseek(fragmentsfp, 0, SEEK_SET); + ret = erofs_write_compressed_file_from_fd(inode, fd, false); + if (ret) { + erofs_err("write fragments file error"); + return ERR_PTR(ret); + } + + erofs_prepare_inode_buffer(inode); + return inode; +} + +void erofs_fragments_exit(void) +{ + if (fragmentsfp) + fclose(fragmentsfp); +} + +int erofs_fragments_init(void) +{ +#ifdef HAVE_TMPFILE64 + fragmentsfp = tmpfile64(); +#else + fragmentsfp = tmpfile(); +#endif + if (!fragmentsfp) + return -ENOMEM; + return 0; +} diff --git a/lib/inode.c b/lib/inode.c index f192510..a49c7a7 100644 --- a/lib/inode.c +++ b/lib/inode.c @@ -405,7 +405,11 @@ int erofs_write_file(struct erofs_inode *inode) } if (cfg.c_compr_alg_master && erofs_file_is_compressible(inode)) { - ret = erofs_write_compressed_file(inode); + fd = open(inode->i_srcpath, O_RDONLY | O_BINARY); + if (fd < 0) + return -errno; + ret = erofs_write_compressed_file_from_fd(inode, fd, true); + close(fd); if (!ret || ret != -ENOSPC) return ret; @@ -583,7 +587,7 @@ static int 
erofs_prepare_tail_block(struct erofs_inode *inode) return 0; } -static int erofs_prepare_inode_buffer(struct erofs_inode *inode) +int erofs_prepare_inode_buffer(struct erofs_inode *inode) { unsigned int inodesize; struct erofs_buffer_head *bh, *ibh; @@ -782,6 +786,9 @@ int erofs_droid_inode_fsconfig(struct erofs_inode *inode, const char *fspath; char *decorated = NULL; + if (!path) + return 0; + inode->capabilities = 0; if (!cfg.fs_config_file && !cfg.mount_point) return 0; @@ -868,7 +875,8 @@ static int erofs_fill_inode(struct erofs_inode *inode, return -EINVAL; } - strncpy(inode->i_srcpath, path, sizeof(inode->i_srcpath) - 1); + strncpy(inode->i_srcpath, path ? path : "tmp", + sizeof(inode->i_srcpath) - 1); inode->i_srcpath[sizeof(inode->i_srcpath) - 1] = '\0'; inode->dev = st->st_dev; @@ -907,6 +915,23 @@ static struct erofs_inode *erofs_new_inode(void) return inode; } +struct erofs_inode *erofs_generate_inode(struct stat64 *st, const char *path) +{ + struct erofs_inode *inode; + int ret; + + inode = erofs_new_inode(); + if (IS_ERR(inode)) + return inode; + + ret = erofs_fill_inode(inode, st, path); + if (ret) { + free(inode); + return ERR_PTR(ret); + } + return inode; +} + /* get the inode from the (source) path */ static struct erofs_inode *erofs_iget_from_path(const char *path, bool is_src) { @@ -934,17 +959,7 @@ static struct erofs_inode *erofs_iget_from_path(const char *path, bool is_src) } /* cannot find in the inode cache */ - inode = erofs_new_inode(); - if (IS_ERR(inode)) - return inode; - - ret = erofs_fill_inode(inode, &st, path); - if (ret) { - free(inode); - return ERR_PTR(ret); - } - - return inode; + return erofs_generate_inode(&st, path); } static void erofs_fixup_meta_blkaddr(struct erofs_inode *rootdir) diff --git a/lib/super.c b/lib/super.c index b267412..20db217 100644 --- a/lib/super.c +++ b/lib/super.c @@ -104,6 +104,21 @@ int erofs_read_superblock(void) sbi.xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi.islotbits = EROFS_ISLOTBITS; 
sbi.root_nid = le16_to_cpu(dsb->root_nid); + sbi.fragments_inode = NULL; + if (erofs_sb_has_fragments()) { + struct erofs_inode *inode; + + inode = calloc(1, sizeof(struct erofs_inode)); + if (!inode) + return -ENOMEM; + inode->nid = le64_to_cpu(dsb->fragments_nid); + ret = erofs_read_inode_from_disk(inode); + if (ret) { + free(inode); + return ret; + } + sbi.fragments_inode = inode; + } sbi.inos = le64_to_cpu(dsb->inos); sbi.checksum = le32_to_cpu(dsb->checksum); @@ -111,11 +126,18 @@ int erofs_read_superblock(void) sbi.build_time_nsec = le32_to_cpu(dsb->build_time_nsec); memcpy(&sbi.uuid, dsb->uuid, sizeof(dsb->uuid)); - return erofs_init_devices(&sbi, dsb); + + ret = erofs_init_devices(&sbi, dsb); + if (ret && sbi.fragments_inode) + free(sbi.fragments_inode); + return ret; } void erofs_put_super(void) { if (sbi.devs) free(sbi.devs); + + if (sbi.fragments_inode) + free(sbi.fragments_inode); } diff --git a/mkfs/main.c b/mkfs/main.c index deb8e1f..b629196 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -23,6 +23,7 @@ #include "erofs/block_list.h" #include "erofs/compress_hints.h" #include "erofs/blobchunk.h" +#include "erofs/fragments.h" #include "../lib/liberofs_private.h" #ifdef HAVE_LIBUUID @@ -129,9 +130,9 @@ static int parse_extended_opts(const char *opts) const char *p = strchr(token, ','); next = NULL; - if (p) + if (p) { next = p + 1; - else { + } else { p = token + strlen(token); next = p; } @@ -198,7 +199,34 @@ static int parse_extended_opts(const char *opts) return -EINVAL; cfg.c_ztailpacking = true; } + + if (MATCH_EXTENTED_OPT("fragments", token, keylen)) { + char *endptr; + u64 i; + + if (vallen || cfg.c_ztailpacking) + return -EINVAL; + cfg.c_fragments = true; + + i = strtoull(next, &endptr, 0); + if (i == 0 || (*endptr != ',' && *endptr != '\0')) { + cfg.c_pclusterblks_frags = 1; + continue; + } + if (i % EROFS_BLKSIZ) { + erofs_err("invalid physical clustersize %llu", + i); + return -EINVAL; + } + cfg.c_pclusterblks_frags = i / EROFS_BLKSIZ; + + if 
(*endptr == ',') + next = strchr(next, ',') + 1; + else + goto out; + } } +out: return 0; } @@ -438,7 +466,8 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) int erofs_mkfs_update_super_block(struct erofs_buffer_head *bh, erofs_nid_t root_nid, - erofs_blk_t *blocks) + erofs_blk_t *blocks, + erofs_nid_t fragments_nid) { struct erofs_super_block sb = { .magic = cpu_to_le32(EROFS_SUPER_MAGIC_V1), @@ -462,6 +491,7 @@ int erofs_mkfs_update_super_block(struct erofs_buffer_head *bh, *blocks = erofs_mapbh(NULL); sb.blocks = cpu_to_le32(*blocks); sb.root_nid = cpu_to_le16(root_nid); + sb.fragments_nid = cpu_to_le64(fragments_nid); memcpy(sb.uuid, sbi.uuid, sizeof(sb.uuid)); if (erofs_sb_has_compr_cfgs()) @@ -579,8 +609,8 @@ int main(int argc, char **argv) { int err = 0; struct erofs_buffer_head *sb_bh; - struct erofs_inode *root_inode; - erofs_nid_t root_nid; + struct erofs_inode *root_inode, *fragments_inode; + erofs_nid_t root_nid, fragments_nid; struct stat64 st; erofs_blk_t nblocks; struct timeval t; @@ -650,6 +680,14 @@ int main(int argc, char **argv) erofs_warn("EXPERIMENTAL chunked file feature in use. Use at your own risk!"); if (cfg.c_ztailpacking) erofs_warn("EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); + if (cfg.c_fragments) { + err = erofs_fragments_init(); + if (err) { + erofs_err("failed to initialize fragments"); + return 1; + } + erofs_warn("EXPERIMENTAL compressed fragments feature in use. 
Use at your own risk!"); + } erofs_set_fs_root(cfg.c_src_path); #ifndef NDEBUG if (cfg.c_random_pclusterblks) @@ -719,7 +757,19 @@ int main(int argc, char **argv) goto exit; } - err = erofs_mkfs_update_super_block(sb_bh, root_nid, &nblocks); + fragments_nid = 0; + if (cfg.c_fragments) { + fragments_inode = erofs_mkfs_build_fragments(); + if (IS_ERR(fragments_inode)) { + err = PTR_ERR(fragments_inode); + goto exit; + } + fragments_nid = erofs_lookupnid(fragments_inode); + erofs_iput(fragments_inode); + } + + err = erofs_mkfs_update_super_block(sb_bh, root_nid, &nblocks, + fragments_nid); if (err) goto exit; @@ -741,6 +791,8 @@ exit: erofs_cleanup_exclude_rules(); if (cfg.c_chunkbits) erofs_blob_exit(); + if (cfg.c_fragments) + erofs_fragments_exit(); erofs_exit_configure(); if (err) { -- 2.17.1 From zbestahu at gmail.com Mon Jul 25 14:30:10 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 25 Jul 2022 12:30:10 +0800 Subject: [PATCH] erofs-utils: mkfs: fix a memory leak of compression type configration Message-ID: <20220725043010.23134-1-huyue2@coolpad.com> Release the memory allocated for compression type configuration. And no need to consider !optarg case since getopt_long() will do that. 
Signed-off-by: Yue Hu --- lib/config.c | 3 +++ mkfs/main.c | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/config.c b/lib/config.c index 3963df2..c316a54 100644 --- a/lib/config.c +++ b/lib/config.c @@ -55,6 +55,9 @@ void erofs_exit_configure(void) #endif if (cfg.c_img_path) free(cfg.c_img_path); + + if (cfg.c_compr_alg_master) + free(cfg.c_compr_alg_master); } static unsigned int fullpath_prefix; /* root directory prefix length */ diff --git a/mkfs/main.c b/mkfs/main.c index deb8e1f..9f5f1dc 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -212,10 +212,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) long_options, NULL)) != -1) { switch (opt) { case 'z': - if (!optarg) { - cfg.c_compr_alg_master = "(default)"; - break; - } /* get specified compression level */ for (i = 0; optarg[i] != '\0'; ++i) { if (optarg[i] == ',') { -- 2.17.1 From zbestahu at gmail.com Mon Jul 25 15:45:49 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 25 Jul 2022 13:45:49 +0800 Subject: [PATCH v2] erofs-utils: mkfs: fix a memory leak of compression type configuration Message-ID: <20220725054549.23562-1-huyue2@coolpad.com> Release the memory allocated for compression type configuration. And no need to consider !optarg case since getopt_long() will do that. 
Signed-off-by: Yue Hu --- lib/config.c | 3 +++ mkfs/main.c | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/config.c b/lib/config.c index 3963df2..c316a54 100644 --- a/lib/config.c +++ b/lib/config.c @@ -55,6 +55,9 @@ void erofs_exit_configure(void) #endif if (cfg.c_img_path) free(cfg.c_img_path); + + if (cfg.c_compr_alg_master) + free(cfg.c_compr_alg_master); } static unsigned int fullpath_prefix; /* root directory prefix length */ diff --git a/mkfs/main.c b/mkfs/main.c index deb8e1f..9f5f1dc 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -212,10 +212,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) long_options, NULL)) != -1) { switch (opt) { case 'z': - if (!optarg) { - cfg.c_compr_alg_master = "(default)"; - break; - } /* get specified compression level */ for (i = 0; optarg[i] != '\0'; ++i) { if (optarg[i] == ',') { -- 2.17.1 From hsiangkao at linux.alibaba.com Mon Jul 25 15:56:38 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Mon, 25 Jul 2022 13:56:38 +0800 Subject: [PATCH v2] erofs-utils: mkfs: fix a memory leak of compression type configuration In-Reply-To: <20220725054549.23562-1-huyue2@coolpad.com> References: <20220725054549.23562-1-huyue2@coolpad.com> Message-ID: Hi Yue, On Mon, Jul 25, 2022 at 01:45:49PM +0800, Yue Hu wrote: > Release the memory allocated for compression type configuration. And no > need to consider !optarg case since getopt_long() will do that. > > Signed-off-by: Yue Hu > --- What's the difference between v1? The patch itself looks good to me, but I need to try later. 
Thanks, Gao Xiang > lib/config.c | 3 +++ > mkfs/main.c | 4 ---- > 2 files changed, 3 insertions(+), 4 deletions(-) > > diff --git a/lib/config.c b/lib/config.c > index 3963df2..c316a54 100644 > --- a/lib/config.c > +++ b/lib/config.c > @@ -55,6 +55,9 @@ void erofs_exit_configure(void) > #endif > if (cfg.c_img_path) > free(cfg.c_img_path); > + > + if (cfg.c_compr_alg_master) > + free(cfg.c_compr_alg_master); > } > > static unsigned int fullpath_prefix; /* root directory prefix length */ > diff --git a/mkfs/main.c b/mkfs/main.c > index deb8e1f..9f5f1dc 100644 > --- a/mkfs/main.c > +++ b/mkfs/main.c > @@ -212,10 +212,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > long_options, NULL)) != -1) { > switch (opt) { > case 'z': > - if (!optarg) { > - cfg.c_compr_alg_master = "(default)"; > - break; > - } > /* get specified compression level */ > for (i = 0; optarg[i] != '\0'; ++i) { > if (optarg[i] == ',') { > -- > 2.17.1 From zbestahu at gmail.com Mon Jul 25 16:03:34 2022 From: zbestahu at gmail.com (Yue Hu) Date: Mon, 25 Jul 2022 14:03:34 +0800 Subject: [PATCH v2] erofs-utils: mkfs: fix a memory leak of compression type configuration In-Reply-To: References: <20220725054549.23562-1-huyue2@coolpad.com> Message-ID: <20220725140334.000006c0.zbestahu@gmail.com> Hi Xiang, On Mon, 25 Jul 2022 13:56:38 +0800 Gao Xiang wrote: > Hi Yue, > > On Mon, Jul 25, 2022 at 01:45:49PM +0800, Yue Hu wrote: > > Release the memory allocated for compression type configuration. And no > > need to consider !optarg case since getopt_long() will do that. > > > > Signed-off-by: Yue Hu > > --- > > What's the difference between v1? just fix 'configration' -> 'configuration' in summary line. > The patch itself looks good to me, but I need to try later. 
> > Thanks, > Gao Xiang > > > lib/config.c | 3 +++ > > mkfs/main.c | 4 ---- > > 2 files changed, 3 insertions(+), 4 deletions(-) > > > > diff --git a/lib/config.c b/lib/config.c > > index 3963df2..c316a54 100644 > > --- a/lib/config.c > > +++ b/lib/config.c > > @@ -55,6 +55,9 @@ void erofs_exit_configure(void) > > #endif > > if (cfg.c_img_path) > > free(cfg.c_img_path); > > + > > + if (cfg.c_compr_alg_master) > > + free(cfg.c_compr_alg_master); > > } > > > > static unsigned int fullpath_prefix; /* root directory prefix length */ > > diff --git a/mkfs/main.c b/mkfs/main.c > > index deb8e1f..9f5f1dc 100644 > > --- a/mkfs/main.c > > +++ b/mkfs/main.c > > @@ -212,10 +212,6 @@ static int mkfs_parse_options_cfg(int argc, char *argv[]) > > long_options, NULL)) != -1) { > > switch (opt) { > > case 'z': > > - if (!optarg) { > > - cfg.c_compr_alg_master = "(default)"; > > - break; > > - } > > /* get specified compression level */ > > for (i = 0; optarg[i] != '\0'; ++i) { > > if (optarg[i] == ',') { > > -- > > 2.17.1 From trini at konsulko.com Tue Jul 26 08:28:50 2022 From: trini at konsulko.com (Tom Rini) Date: Mon, 25 Jul 2022 18:28:50 -0400 Subject: [PATCH 1/8] fs: fat: unexport file_fat_read_at() In-Reply-To: References: Message-ID: <20220725222850.GA3420905@bill-the-cat> On Wed, Jun 29, 2022 at 07:38:22PM +0800, Qu Wenruo wrote: > That function is only utilized inside fat driver, unexport it. > > Signed-off-by: Qu Wenruo The series has a fails to build on nokia_rx51: https://source.denx.de/u-boot/u-boot/-/jobs/471877#L483 which to me says doing 64bit division (likely related to block size, etc) without using the appropriate helper macros to turn them in to bit shifts instead. -- Tom -------------- next part -------------- A non-text attachment was scrubbed... 
Name: signature.asc Type: application/pgp-signature Size: 659 bytes Desc: not available URL: From quwenruo.btrfs at gmx.com Tue Jul 26 11:35:51 2022 From: quwenruo.btrfs at gmx.com (Qu Wenruo) Date: Tue, 26 Jul 2022 09:35:51 +0800 Subject: [PATCH 1/8] fs: fat: unexport file_fat_read_at() In-Reply-To: <20220725222850.GA3420905@bill-the-cat> References: <20220725222850.GA3420905@bill-the-cat> Message-ID: <6271e1a2-db85-fb20-6ea8-d23afcb6bc69@gmx.com> On 2022/7/26 06:28, Tom Rini wrote: > On Wed, Jun 29, 2022 at 07:38:22PM +0800, Qu Wenruo wrote: > >> That function is only utilized inside fat driver, unexport it. >> >> Signed-off-by: Qu Wenruo > > The series has a fails to build on nokia_rx51: > https://source.denx.de/u-boot/u-boot/-/jobs/471877#L483 > which to me says doing 64bit division (likely related to block size, > etc) without using the appropriate helper macros to turn them in to bit > shifts instead. > Should I update and resend the series or just send the incremental update to fix the U64/U32 division? Thanks, Qu From trini at konsulko.com Tue Jul 26 12:12:35 2022 From: trini at konsulko.com (Tom Rini) Date: Mon, 25 Jul 2022 22:12:35 -0400 Subject: [PATCH 1/8] fs: fat: unexport file_fat_read_at() In-Reply-To: <6271e1a2-db85-fb20-6ea8-d23afcb6bc69@gmx.com> References: <20220725222850.GA3420905@bill-the-cat> <6271e1a2-db85-fb20-6ea8-d23afcb6bc69@gmx.com> Message-ID: <20220726021235.GI1146598@bill-the-cat> On Tue, Jul 26, 2022 at 09:35:51AM +0800, Qu Wenruo wrote: > > > On 2022/7/26 06:28, Tom Rini wrote: > > On Wed, Jun 29, 2022 at 07:38:22PM +0800, Qu Wenruo wrote: > > > > > That function is only utilized inside fat driver, unexport it. > > > > > > Signed-off-by: Qu Wenruo > > > > The series has a fails to build on nokia_rx51: > > https://source.denx.de/u-boot/u-boot/-/jobs/471877#L483 > > which to me says doing 64bit division (likely related to block size, > > etc) without using the appropriate helper macros to turn them in to bit > > shifts instead. 
> > > Should I update and resend the series or just send the incremental > update to fix the U64/U32 division? Please rebase and resend the whole series, thanks. -- Tom -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 659 bytes Desc: not available URL: From wqu at suse.com Tue Jul 26 15:22:10 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:10 +0800 Subject: [PATCH v2 2/8] fs: btrfs: fix a bug which no data get read if the length is not 0 In-Reply-To: References: Message-ID: <8caea01ab60ad356e06558cdf18dfba0db622daf.1658812744.git.wqu@suse.com> [BUG] When testing with unaligned read, if a specific length is passed in, btrfs driver will read out nothing: => load host 0 $kernel_addr_r 5k_file 0x1000 0 0 bytes read in 0 ms But if no length is passed in, it works fine, even if we pass a non-zero length: => load host 0 $kernel_addr_r 5k_file 0 0x1000 1024 bytes read in 0 ms [CAUSE] In btrfs_read() if we have a larger size than our file, we will try to truncate it using the file size. However the real file size is not initialized if @len is not zero, thus we always truncate our length to 0, and cause the problem. [FIX] Fix it by just always do the file size check. In fact btrfs_size() always follow soft link, thus it will return the real file size correctly. 
Signed-off-by: Qu Wenruo --- fs/btrfs/btrfs.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/btrfs.c b/fs/btrfs/btrfs.c index 741c6e20f533..9145727058d4 100644 --- a/fs/btrfs/btrfs.c +++ b/fs/btrfs/btrfs.c @@ -246,16 +246,17 @@ int btrfs_read(const char *file, void *buf, loff_t offset, loff_t len, return -EINVAL; } - if (!len) { - ret = btrfs_size(file, &real_size); - if (ret < 0) { - error("Failed to get inode size: %s", file); - return ret; - } - len = real_size; + ret = btrfs_size(file, &real_size); + if (ret < 0) { + error("Failed to get inode size: %s", file); + return ret; } - if (len > real_size - offset) + /* + * If the length is 0 (meaning read the whole file) or the range is + * beyond file size, truncate it to the end of the file. + */ + if (!len || len > real_size - offset) len = real_size - offset; ret = btrfs_file_read(root, ino, offset, len, buf); -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:08 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:08 +0800 Subject: [PATCH v2 0/8] U-boot: fs: add generic unaligned read offset handling Message-ID: [CHANGELOG] v2->v1: - Fix a linkage error where (U64 % U32) is called without proper helper Fix it with U64 & (U32 - 1), as the U32 value (@blocksize) should always be power of 2, thus (@blocksize - 1) is the mask we want to calculate the offset inside the block. Above change only affects the 4th patch, everything else is not touched. RFC->v1: - More (manual) testing Unfortunately, in the latest master (75967970850a), the fs-tests.sh always seems to hang at preparing the fs image. Thus still has to do manual testing, tested btrfs, ext4 and fat, with aligned and unaligned read, also added soft link read, all looks fine here. Extra testing is still appreciated. - Two more btrfs specific bug fixes All exposed during manual tests - Remove the tailing unaligned block handling In fact, all fses can easily handle such case, just a min() call is enough. 
- Remove the support for sandboxfs Since it's using read() calls, really no need to do block alignment check. - Enhanced blocksize check Ensure the returned blocksize is not only non-error, but also non-zero. This patchset can be fetched from github: https://github.com/adam900710/u-boot/tree/fs_unaligned_read [BACKGROUND] Unlike FUSE/Kernel which always pass aligned read range, U-boot fs code just pass the request range to underlying fses. Under most case, this works fine, as U-boot only really needs to read the whole file (aka, 0 for both offset and len, len will be later determined using file size). But if some advanced user/script wants to extract kernel/initramfs from combined image, we may need to do unaligned read in that case. [ADVANTAGE] This patchset will handle unaligned read range in _fs_read(): - Get blocksize of the underlying fs - Read the leading block contianing the unaligned range The full block will be stored in a local buffer, then only copy the bytes in the unaligned range into the destination buffer. If the first block covers the whole range, we just call it aday. - Read the remaining range which starts at block aligned offset For most common case, which is 0 offset and 0 len, the code will not be changed at all. Just one extra get_blocksize() call, and for FAT/Btrfs/EROFS they all have cached blocksize, thus it takes almost no extra cost. Although for EXT4, it doesn't seem to cache the blocksize globally, thus has to do a path resolve and grab the blocksize. [DISADVANTAGE] The involved problem is: - Extra path resolving All those supported fs may have to do one extra path resolve if the read offset is not aligned. For EXT4, it will do one extra path resolve just to grab the blocksize. For data read which starts at offset 0 (the most common case), it should cause *NO* difference in performance. As the extra handling is only for unaligned offset. The common path is not really modified. 
[SUPPORTED FSES] - Btrfs (manually tested*) - Ext4 (manually tested) - FAT (manually tested) - Erofs - ubifs (unable to test, due to compile failure) *: Failed to get the test cases run, thus have to go sandbox mode, and attach an image with target fs, load the target file (with unaligned range) and compare the result using md5sum. For EXT4/FAT, they may need extra cleanup, as their existing unaligned range handling is no longer needed anymore, cleaning them up should free more code lines than the added one. Just not confident enough to modify them all by myself. [UNSUPPORTED FSES] - Squashfs They don't support non-zero offset, thus it can not handle the block aligned range. Need extra help to add block aligned offset support. - Semihostfs - Sandboxfs They all use read() directly, no need to do alignment check at all. Extra testing/feedback is always appreciated. Qu Wenruo (8): fs: fat: unexport file_fat_read_at() fs: btrfs: fix a bug which no data get read if the length is not 0 fs: btrfs: fix a crash if specified range is beyond file size fs: btrfs: move the unaligned read code to _fs_read() for btrfs fs: ext4: rely on _fs_read() to handle leading unaligned block read fs: fat: rely on higher layer to get block aligned read range fs: ubifs: rely on higher layer to do unaligned read fs: erofs: add unaligned read range handling fs/btrfs/btrfs.c | 33 ++++++++--- fs/btrfs/inode.c | 89 +++-------------------------- fs/erofs/internal.h | 1 + fs/erofs/super.c | 6 ++ fs/ext4/ext4fs.c | 22 +++++++ fs/fat/fat.c | 17 +++++- fs/fs.c | 130 +++++++++++++++++++++++++++++++++++++++--- fs/ubifs/ubifs.c | 13 +++-- include/btrfs.h | 1 + include/erofs.h | 1 + include/ext4fs.h | 1 + include/fat.h | 3 +- include/ubifs_uboot.h | 1 + 13 files changed, 212 insertions(+), 106 deletions(-) -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:09 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:09 +0800 Subject: [PATCH v2 1/8] fs: fat: unexport file_fat_read_at() In-Reply-To: 
References: Message-ID: That function is only utilized inside fat driver, unexport it. Signed-off-by: Qu Wenruo --- fs/fat/fat.c | 4 ++-- include/fat.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fat/fat.c b/fs/fat/fat.c index df9ea2c028fc..dcceccbcee0a 100644 --- a/fs/fat/fat.c +++ b/fs/fat/fat.c @@ -1243,8 +1243,8 @@ out_free_itr: return ret; } -int file_fat_read_at(const char *filename, loff_t pos, void *buffer, - loff_t maxsize, loff_t *actread) +static int file_fat_read_at(const char *filename, loff_t pos, void *buffer, + loff_t maxsize, loff_t *actread) { fsdata fsdata; fat_itr *itr; diff --git a/include/fat.h b/include/fat.h index bd8e450b33a3..a9756fb4cd1b 100644 --- a/include/fat.h +++ b/include/fat.h @@ -200,8 +200,6 @@ static inline u32 sect_to_clust(fsdata *fsdata, int sect) int file_fat_detectfs(void); int fat_exists(const char *filename); int fat_size(const char *filename, loff_t *size); -int file_fat_read_at(const char *filename, loff_t pos, void *buffer, - loff_t maxsize, loff_t *actread); int file_fat_read(const char *filename, void *buffer, int maxsize); int fat_set_blk_dev(struct blk_desc *rbdd, struct disk_partition *info); int fat_register_device(struct blk_desc *dev_desc, int part_no); -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:12 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:12 +0800 Subject: [PATCH v2 4/8] fs: btrfs: move the unaligned read code to _fs_read() for btrfs In-Reply-To: References: Message-ID: <80c0b3a112c6fef392e7c7893477a194b547023d.1658812744.git.wqu@suse.com> Unlike FUSE or kernel, U-boot filesystem code makes the underly fs code to handle the unaligned read (aka, read range is not aligned to fs block size). This makes underlying fs code harder to implement, as they have to handle unaligned read all by themselves. This patch will change the behavior, starting from btrfs, by moving the unaligned read code into _fs_read(). 
The idea is pretty simple, if we have an unaligned read request, we handle it in the following steps: 1. Grab the blocksize of the fs 2. Read the leading unaligned range We will read the block that @offset is in, and copy the requested part into buf. The the block we read covers the whole range, we just call it a day. 3. Read the remaining part The tailing part may be unaligned, but all fses handles the tailing part much easier than the leading unaligned part. As they just need to do a min(extent_size, start + len - cur) to calculate the real read size. In fact, for most file reading, the file size is not aligned and we need to handle the tailing part anyway. There is a btrfs specific cleanup involved: - In btrfs_file_read(), merge the tailing unaligned read into the main loop. Just reuse the existing read length calculation is enough. - Remove read_and_truncate_page() call Since there is no explicit leading/tailing unaligned read anymore. This has been tested with a proper randomly populated btrfs file, then tried in sandbox mode with different aligned and unaligned range and compare the output with md5sum. Cc: Marek Behun Cc: linux-btrfs at vger.kernel.org Signed-off-by: Qu Wenruo --- fs/btrfs/btrfs.c | 10 ++++ fs/btrfs/inode.c | 89 +++----------------------------- fs/fs.c | 130 ++++++++++++++++++++++++++++++++++++++++++++--- include/btrfs.h | 1 + 4 files changed, 141 insertions(+), 89 deletions(-) diff --git a/fs/btrfs/btrfs.c b/fs/btrfs/btrfs.c index bf9e1f2f17cf..7c8f4a3dfb87 100644 --- a/fs/btrfs/btrfs.c +++ b/fs/btrfs/btrfs.c @@ -234,6 +234,10 @@ int btrfs_read(const char *file, void *buf, loff_t offset, loff_t len, int ret; ASSERT(fs_info); + + /* Higher layer has ensures it never pass unaligned offset in. 
*/ + ASSERT(IS_ALIGNED(offset, fs_info->sectorsize)); + ret = btrfs_lookup_path(fs_info->fs_root, BTRFS_FIRST_FREE_OBJECTID, file, &root, &ino, &type, 40); if (ret < 0) { @@ -275,6 +279,12 @@ int btrfs_read(const char *file, void *buf, loff_t offset, loff_t len, return 0; } +int btrfs_get_blocksize(const char *filename) +{ + ASSERT(current_fs_info); + return current_fs_info->sectorsize; +} + void btrfs_close(void) { if (current_fs_info) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0173d30cd8ab..aa198c5aaf1f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -617,44 +617,6 @@ check_next: return 1; } -static int read_and_truncate_page(struct btrfs_path *path, - struct btrfs_file_extent_item *fi, - int start, int len, char *dest) -{ - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_fs_info *fs_info = leaf->fs_info; - u64 aligned_start = round_down(start, fs_info->sectorsize); - u8 extent_type; - char *buf; - int page_off = start - aligned_start; - int page_len = fs_info->sectorsize - page_off; - int ret; - - ASSERT(start + len <= aligned_start + fs_info->sectorsize); - buf = malloc_cache_aligned(fs_info->sectorsize); - if (!buf) - return -ENOMEM; - - extent_type = btrfs_file_extent_type(leaf, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = btrfs_read_extent_inline(path, fi, buf); - memcpy(dest, buf + page_off, min(page_len, ret)); - free(buf); - return len; - } - - ret = btrfs_read_extent_reg(path, fi, - round_down(start, fs_info->sectorsize), - fs_info->sectorsize, buf); - if (ret < 0) { - free(buf); - return ret; - } - memcpy(dest, buf + page_off, page_len); - free(buf); - return len; -} - int btrfs_file_read(struct btrfs_root *root, u64 ino, u64 file_offset, u64 len, char *dest) { @@ -663,7 +625,6 @@ int btrfs_file_read(struct btrfs_root *root, u64 ino, u64 file_offset, u64 len, struct btrfs_path path; struct btrfs_key key; u64 aligned_start = round_down(file_offset, fs_info->sectorsize); - u64 aligned_end = 
round_down(file_offset + len, fs_info->sectorsize); u64 next_offset; u64 cur = aligned_start; int ret = 0; @@ -673,34 +634,14 @@ int btrfs_file_read(struct btrfs_root *root, u64 ino, u64 file_offset, u64 len, /* Set the whole dest all zero, so we won't need to bother holes */ memset(dest, 0, len); - /* Read out the leading unaligned part */ - if (aligned_start != file_offset) { - ret = lookup_data_extent(root, &path, ino, aligned_start, - &next_offset); - if (ret < 0) - goto out; - if (ret == 0) { - /* Read the unaligned part out*/ - fi = btrfs_item_ptr(path.nodes[0], path.slots[0], - struct btrfs_file_extent_item); - ret = read_and_truncate_page(&path, fi, file_offset, - round_up(file_offset, fs_info->sectorsize) - - file_offset, dest); - if (ret < 0) - goto out; - cur += fs_info->sectorsize; - } else { - /* The whole file is a hole */ - if (!next_offset) { - memset(dest, 0, len); - return len; - } - cur = next_offset; - } - } + /* + * Ensured by higher layer, which should have already handled the + * first unaligned sector. 
+ */ + ASSERT(aligned_start == file_offset); /* Read the aligned part */ - while (cur < aligned_end) { + while (cur < file_offset + len) { u64 extent_num_bytes; u8 type; @@ -743,27 +684,13 @@ int btrfs_file_read(struct btrfs_root *root, u64 ino, u64 file_offset, u64 len, extent_num_bytes = btrfs_file_extent_num_bytes(path.nodes[0], fi); ret = btrfs_read_extent_reg(&path, fi, cur, - min(extent_num_bytes, aligned_end - cur), + min(extent_num_bytes, file_offset + len - cur), dest + cur - file_offset); if (ret < 0) goto out; - cur += min(extent_num_bytes, aligned_end - cur); + cur += min(extent_num_bytes, file_offset + len - cur); } - /* Read the tailing unaligned part*/ - if (file_offset + len != aligned_end) { - btrfs_release_path(&path); - ret = lookup_data_extent(root, &path, ino, aligned_end, - &next_offset); - /* <0 is error, >0 means no extent */ - if (ret) - goto out; - fi = btrfs_item_ptr(path.nodes[0], path.slots[0], - struct btrfs_file_extent_item); - ret = read_and_truncate_page(&path, fi, aligned_end, - file_offset + len - aligned_end, - dest + aligned_end - file_offset); - } out: btrfs_release_path(&path); if (ret < 0) diff --git a/fs/fs.c b/fs/fs.c index 6de1a3eb6d5d..1e9f778e1f11 100644 --- a/fs/fs.c +++ b/fs/fs.c @@ -28,6 +28,7 @@ #include #include #include +#include DECLARE_GLOBAL_DATA_PTR; @@ -139,6 +140,11 @@ static inline int fs_mkdir_unsupported(const char *dirname) return -1; } +static inline int fs_get_blocksize_unsupported(const char *filename) +{ + return -1; +} + struct fstype_info { int fstype; char *name; @@ -158,6 +164,14 @@ struct fstype_info { int (*size)(const char *filename, loff_t *size); int (*read)(const char *filename, void *buf, loff_t offset, loff_t len, loff_t *actread); + /* + * Report the minimal data blocksize the fs supprts. + * + * This is used to handle unaligned read offset. + * If not supported, read() will handle the unaligned offset all by + * itself. 
+ */ + int (*get_blocksize)(const char *filename); int (*write)(const char *filename, void *buf, loff_t offset, loff_t len, loff_t *actwrite); void (*close)(void); @@ -193,6 +207,7 @@ static struct fstype_info fstypes[] = { .exists = fat_exists, .size = fat_size, .read = fat_read_file, + .get_blocksize = fs_get_blocksize_unsupported, #if CONFIG_IS_ENABLED(FAT_WRITE) .write = file_fat_write, .unlink = fat_unlink, @@ -221,6 +236,7 @@ static struct fstype_info fstypes[] = { .exists = ext4fs_exists, .size = ext4fs_size, .read = ext4_read_file, + .get_blocksize = fs_get_blocksize_unsupported, #ifdef CONFIG_CMD_EXT4_WRITE .write = ext4_write_file, .ln = ext4fs_create_link, @@ -245,6 +261,11 @@ static struct fstype_info fstypes[] = { .exists = sandbox_fs_exists, .size = sandbox_fs_size, .read = fs_read_sandbox, + /* + * Sandbox doesn't need to bother blocksize, as its + * os_read() can handle unaligned range without any problem. + */ + .get_blocksize = fs_get_blocksize_unsupported, .write = fs_write_sandbox, .uuid = fs_uuid_unsupported, .opendir = fs_opendir_unsupported, @@ -264,6 +285,12 @@ static struct fstype_info fstypes[] = { .exists = fs_exists_unsupported, .size = smh_fs_size, .read = smh_fs_read, + /* + * Semihost doesn't need to bother blocksize, as it is using + * read() system calls, and can handle unaligned range without + * any problem. 
+ */ + .get_blocksize = fs_get_blocksize_unsupported, .write = smh_fs_write, .uuid = fs_uuid_unsupported, .opendir = fs_opendir_unsupported, @@ -284,6 +311,7 @@ static struct fstype_info fstypes[] = { .exists = ubifs_exists, .size = ubifs_size, .read = ubifs_read, + .get_blocksize = fs_get_blocksize_unsupported, .write = fs_write_unsupported, .uuid = fs_uuid_unsupported, .opendir = fs_opendir_unsupported, @@ -305,6 +333,7 @@ static struct fstype_info fstypes[] = { .exists = btrfs_exists, .size = btrfs_size, .read = btrfs_read, + .get_blocksize = btrfs_get_blocksize, .write = fs_write_unsupported, .uuid = btrfs_uuid, .opendir = fs_opendir_unsupported, @@ -324,6 +353,7 @@ static struct fstype_info fstypes[] = { .readdir = sqfs_readdir, .ls = fs_ls_generic, .read = sqfs_read, + .get_blocksize = fs_get_blocksize_unsupported, .size = sqfs_size, .close = sqfs_close, .closedir = sqfs_closedir, @@ -345,6 +375,7 @@ static struct fstype_info fstypes[] = { .readdir = erofs_readdir, .ls = fs_ls_generic, .read = erofs_read, + .get_blocksize = fs_get_blocksize_unsupported, .size = erofs_size, .close = erofs_close, .closedir = erofs_closedir, @@ -366,6 +397,7 @@ static struct fstype_info fstypes[] = { .exists = fs_exists_unsupported, .size = fs_size_unsupported, .read = fs_read_unsupported, + .get_blocksize = fs_get_blocksize_unsupported, .write = fs_write_unsupported, .uuid = fs_uuid_unsupported, .opendir = fs_opendir_unsupported, @@ -579,7 +611,11 @@ static int _fs_read(const char *filename, ulong addr, loff_t offset, loff_t len, { struct fstype_info *info = fs_get_info(fs_type); void *buf; + int blocksize; int ret; + loff_t cur = offset; + loff_t bytes_read = 0; + loff_t total_read = 0; #ifdef CONFIG_LMB if (do_lmb_check) { @@ -589,19 +625,97 @@ static int _fs_read(const char *filename, ulong addr, loff_t offset, loff_t len, } #endif + blocksize = info->get_blocksize(filename); /* - * We don't actually know how many bytes are being read, since len==0 - * means read the whole 
file. + * The fs doesn't report its blocksize, let its read() to handle + * the unaligned read. + */ + if (blocksize < 0) { + buf = map_sysmem(addr, len); + ret = info->read(filename, buf, offset, len, actread); + + /* If we requested a specific number of bytes, check we got it */ + if (ret == 0 && len && *actread != len) + log_debug("** %s shorter than offset + len **\n", filename); + goto out; + } + + if (unlikely(blocksize == 0)) { + log_err("invalid blocksize 0 found\n"); + return -EINVAL; + } + + /* + * @len can be 0, meaning read the whole file. + * And we can not rely on info->size(), as some fses doesn't resolve + * softlinks to their final destinations. */ buf = map_sysmem(addr, len); - ret = info->read(filename, buf, offset, len, actread); - unmap_sysmem(buf); - /* If we requested a specific number of bytes, check we got it */ - if (ret == 0 && len && *actread != len) - log_debug("** %s shorter than offset + len **\n", filename); - fs_close(); + /* Unaligned read offset, handle the unaligned read here. 
*/ + if (!IS_ALIGNED(offset, blocksize)) { + void *block_buf; + const int offset_in_block = offset & (blocksize - 1); + int copy_len; + + block_buf = malloc_cache_aligned(blocksize); + if (!block_buf) { + log_err("** Unable to alloc memory for one block **\n"); + return -ENOMEM; + } + memset(block_buf, 0, blocksize); + + cur = round_down(offset, blocksize); + ret = info->read(filename, block_buf, cur, blocksize, + &bytes_read); + if (ret < 0) { + log_err("** Failed to read %s at offset %llu, %d **\n", + filename, cur, ret); + free(block_buf); + goto out; + } + if (bytes_read <= offset_in_block) { + log_err("** Offset %llu is beyond file size of %s **\n", + offset, filename); + free(block_buf); + ret = -EIO; + goto out; + } + + copy_len = min_t(int, blocksize, bytes_read) - offset_in_block; + memcpy(buf, block_buf + offset_in_block, copy_len); + free(block_buf); + total_read += copy_len; + + /* + * A short read on the block, or we have already covered the + * whole read range, just call it a day. 
+ */ + if (bytes_read < blocksize || + (len && offset + len <= cur + blocksize)) + goto out; + + cur += blocksize; + if (len) + len -= copy_len; + } + + ret = info->read(filename, buf + total_read, cur, len, &bytes_read); + if (ret < 0) { + log_err("** failed to read %s off %llu len %llu, %d **\n", + filename, cur, len, ret); + goto out; + } + if (len && bytes_read < len) + log_debug("** %s short read, off %llu len %llu actual read %llu **\n", + filename, cur, len, bytes_read); + total_read += bytes_read; +out: + unmap_sysmem(buf); + fs_close(); + if (!ret) + *actread = total_read; return ret; } diff --git a/include/btrfs.h b/include/btrfs.h index a7605e158970..bba71ec02893 100644 --- a/include/btrfs.h +++ b/include/btrfs.h @@ -17,6 +17,7 @@ int btrfs_ls(const char *); int btrfs_exists(const char *); int btrfs_size(const char *, loff_t *); int btrfs_read(const char *, void *, loff_t, loff_t, loff_t *); +int btrfs_get_blocksize(const char *); void btrfs_close(void); int btrfs_uuid(char *); void btrfs_list_subvols(void); -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:11 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:11 +0800 Subject: [PATCH v2 3/8] fs: btrfs: fix a crash if specified range is beyond file size In-Reply-To: References: Message-ID: [BUG] When try to read a range beyond file size, btrfs driver will cause crash/segfault: => load host 0 $kernel_addr_r 5k_file 0 0x2000 SEGFAULT [CAUSE] In btrfs_read(), if @len is 0, we will truncated it to file end, but if file end is beyond our file size, this truncation will underflow @len, making it -3K in this case. And later that @len is used to memzero the output buffer, resulting above crash. [FIX] Just error out if @offset is already beyond our file size. 
Now it will fail properly with correct error message: => load host 0 $kernel_addr_r 5m_origin 0 0x2000 BTRFS: Read range beyond file size, offset 8192 file size 5120 Failed to load '5m_origin' Signed-off-by: Qu Wenruo --- fs/btrfs/btrfs.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/btrfs.c b/fs/btrfs/btrfs.c index 9145727058d4..bf9e1f2f17cf 100644 --- a/fs/btrfs/btrfs.c +++ b/fs/btrfs/btrfs.c @@ -252,6 +252,12 @@ int btrfs_read(const char *file, void *buf, loff_t offset, loff_t len, return ret; } + if (offset >= real_size) { + error("Read range beyond file size, offset %llu file size %llu", + offset, real_size); + return -EINVAL; + } + /* * If the length is 0 (meaning read the whole file) or the range is * beyond file size, truncate it to the end of the file. -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:13 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:13 +0800 Subject: [PATCH v2 5/8] fs: ext4: rely on _fs_read() to handle leading unaligned block read In-Reply-To: References: Message-ID: Just add ext4_get_blocksize() and a new assert() in ext4fs_read_file(). Signed-off-by: Qu Wenruo --- fs/ext4/ext4fs.c | 22 ++++++++++++++++++++++ fs/fs.c | 2 +- include/ext4fs.h | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4fs.c b/fs/ext4/ext4fs.c index 4c89152ce4ad..b0568e77a895 100644 --- a/fs/ext4/ext4fs.c +++ b/fs/ext4/ext4fs.c @@ -69,6 +69,8 @@ int ext4fs_read_file(struct ext2fs_node *node, loff_t pos, short status; struct ext_block_cache cache; + assert(IS_ALIGNED(pos, blocksize)); + ext_cache_init(&cache); /* Adjust len so it we can't read past the end of the file. 
*/ @@ -259,6 +261,26 @@ int ext4_read_file(const char *filename, void *buf, loff_t offset, loff_t len, return ext4fs_read(buf, offset, len, len_read); } +int ext4_get_blocksize(const char *filename) +{ + struct ext_filesystem *fs; + int log2blksz; + int log2_fs_blocksize; + loff_t file_len; + int ret; + + ret = ext4fs_open(filename, &file_len); + if (ret < 0) { + printf("** File not found %s **\n", filename); + return -1; + } + fs = get_fs(); + log2blksz = fs->dev_desc->log2blksz; + log2_fs_blocksize = LOG2_BLOCK_SIZE(ext4fs_file->data) - log2blksz; + + return (1 << (log2_fs_blocksize + log2blksz)); +} + int ext4fs_uuid(char *uuid_str) { if (ext4fs_root == NULL) diff --git a/fs/fs.c b/fs/fs.c index 1e9f778e1f11..3d6cc6b38b26 100644 --- a/fs/fs.c +++ b/fs/fs.c @@ -236,7 +236,7 @@ static struct fstype_info fstypes[] = { .exists = ext4fs_exists, .size = ext4fs_size, .read = ext4_read_file, - .get_blocksize = fs_get_blocksize_unsupported, + .get_blocksize = ext4_get_blocksize, #ifdef CONFIG_CMD_EXT4_WRITE .write = ext4_write_file, .ln = ext4fs_create_link, diff --git a/include/ext4fs.h b/include/ext4fs.h index cb5d9cc0a5c0..0f4cf32dcc2a 100644 --- a/include/ext4fs.h +++ b/include/ext4fs.h @@ -161,6 +161,7 @@ int ext4fs_probe(struct blk_desc *fs_dev_desc, struct disk_partition *fs_partition); int ext4_read_file(const char *filename, void *buf, loff_t offset, loff_t len, loff_t *actread); +int ext4_get_blocksize(const char *filename); int ext4_read_superblock(char *buffer); int ext4fs_uuid(char *uuid_str); void ext_cache_init(struct ext_block_cache *cache); -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:14 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:14 +0800 Subject: [PATCH v2 6/8] fs: fat: rely on higher layer to get block aligned read range In-Reply-To: References: Message-ID: <70c34636e605f0cbcdaa2183bcc7fb1d5de2cb6d.1658812744.git.wqu@suse.com> Just implement fat_get_blocksize() for fat, so that fat_read_file() always get a block aligned read 
range. Unfortunately I'm not experienced enough to cleanup the fat code, thus further cleanup is appreciated. Cc: Tom Rini Signed-off-by: Qu Wenruo --- fs/fat/fat.c | 13 +++++++++++++ fs/fs.c | 2 +- include/fat.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/fat/fat.c b/fs/fat/fat.c index dcceccbcee0a..e13035e8e6d1 100644 --- a/fs/fat/fat.c +++ b/fs/fat/fat.c @@ -1299,6 +1299,19 @@ int fat_read_file(const char *filename, void *buf, loff_t offset, loff_t len, return ret; } +int fat_get_blocksize(const char *filename) +{ + fsdata fsdata = {0}; + int ret; + + ret = get_fs_info(&fsdata); + if (ret) + return ret; + + free(fsdata.fatbuf); + return fsdata.sect_size; +} + typedef struct { struct fs_dir_stream parent; struct fs_dirent dirent; diff --git a/fs/fs.c b/fs/fs.c index 3d6cc6b38b26..3eb540c5fe30 100644 --- a/fs/fs.c +++ b/fs/fs.c @@ -207,7 +207,7 @@ static struct fstype_info fstypes[] = { .exists = fat_exists, .size = fat_size, .read = fat_read_file, - .get_blocksize = fs_get_blocksize_unsupported, + .get_blocksize = fat_get_blocksize, #if CONFIG_IS_ENABLED(FAT_WRITE) .write = file_fat_write, .unlink = fat_unlink, diff --git a/include/fat.h b/include/fat.h index a9756fb4cd1b..c03a2bebecef 100644 --- a/include/fat.h +++ b/include/fat.h @@ -201,6 +201,7 @@ int file_fat_detectfs(void); int fat_exists(const char *filename); int fat_size(const char *filename, loff_t *size); int file_fat_read(const char *filename, void *buffer, int maxsize); +int fat_get_blocksize(const char *filename); int fat_set_blk_dev(struct blk_desc *rbdd, struct disk_partition *info); int fat_register_device(struct blk_desc *dev_desc, int part_no); -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:15 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:15 +0800 Subject: [PATCH v2 7/8] fs: ubifs: rely on higher layer to do unaligned read In-Reply-To: References: Message-ID: Currently ubifs doesn't support unaligned read offset, thanks to the recent _fs_read() 
work to handle unaligned read, we only need to implement ubifs_get_blocksize() to take advantage of it. Now ubifs can do unaligned read without any problem. Signed-off-by: Qu Wenruo --- fs/fs.c | 2 +- fs/ubifs/ubifs.c | 13 ++++++++----- include/ubifs_uboot.h | 1 + 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/fs.c b/fs/fs.c index 3eb540c5fe30..2b847d83597b 100644 --- a/fs/fs.c +++ b/fs/fs.c @@ -311,7 +311,7 @@ static struct fstype_info fstypes[] = { .exists = ubifs_exists, .size = ubifs_size, .read = ubifs_read, - .get_blocksize = fs_get_blocksize_unsupported, + .get_blocksize = ubifs_get_blocksize, .write = fs_write_unsupported, .uuid = fs_uuid_unsupported, .opendir = fs_opendir_unsupported, diff --git a/fs/ubifs/ubifs.c b/fs/ubifs/ubifs.c index d3026e310168..a8ab556dd376 100644 --- a/fs/ubifs/ubifs.c +++ b/fs/ubifs/ubifs.c @@ -846,11 +846,9 @@ int ubifs_read(const char *filename, void *buf, loff_t offset, *actread = 0; - if (offset & (PAGE_SIZE - 1)) { - printf("ubifs: Error offset must be a multiple of %d\n", - PAGE_SIZE); - return -1; - } + /* Higher layer should ensure it always pass page aligned range. 
*/ + assert(IS_ALIGNED(offset, PAGE_SIZE)); + assert(IS_ALIGNED(size, PAGE_SIZE)); c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READONLY); /* ubifs_findfile will resolve symlinks, so we know that we get @@ -920,6 +918,11 @@ out: return err; } +int ubifs_get_blocksize(const char *filename) +{ + return PAGE_SIZE; +} + void ubifs_close(void) { } diff --git a/include/ubifs_uboot.h b/include/ubifs_uboot.h index b025779d59ff..bcd21715314a 100644 --- a/include/ubifs_uboot.h +++ b/include/ubifs_uboot.h @@ -29,6 +29,7 @@ int ubifs_exists(const char *filename); int ubifs_size(const char *filename, loff_t *size); int ubifs_read(const char *filename, void *buf, loff_t offset, loff_t size, loff_t *actread); +int ubifs_get_blocksize(const char *filename); void ubifs_close(void); #endif /* __UBIFS_UBOOT_H__ */ -- 2.37.0 From wqu at suse.com Tue Jul 26 15:22:16 2022 From: wqu at suse.com (Qu Wenruo) Date: Tue, 26 Jul 2022 13:22:16 +0800 Subject: [PATCH v2 8/8] fs: erofs: add unaligned read range handling In-Reply-To: References: Message-ID: I'm not an expert on erofs, but my quick glance didn't expose any special handling on unaligned range, thus I think the U-boot erofs driver doesn't really support unaligned read range. This patch will add erofs_get_blocksize() so erofs can benefit from the generic unaligned read support. 
Cc: Huang Jianan Cc: linux-erofs at lists.ozlabs.org Signed-off-by: Qu Wenruo Reviewed-by: Huang Jianan --- fs/erofs/internal.h | 1 + fs/erofs/super.c | 6 ++++++ fs/fs.c | 2 +- include/erofs.h | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4af7c91560cc..d368a6481bf1 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -83,6 +83,7 @@ struct erofs_sb_info { u16 available_compr_algs; u16 lz4_max_distance; u32 checksum; + u32 blocksize; u16 extra_devices; union { u16 devt_slotoff; /* used for mkfs */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 4cca322b9ead..df01d2e719a7 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -99,7 +99,13 @@ int erofs_read_superblock(void) sbi.build_time = le64_to_cpu(dsb->build_time); sbi.build_time_nsec = le32_to_cpu(dsb->build_time_nsec); + sbi.blocksize = 1 << blkszbits; memcpy(&sbi.uuid, dsb->uuid, sizeof(dsb->uuid)); return erofs_init_devices(&sbi, dsb); } + +int erofs_get_blocksize(const char *filename) +{ + return sbi.blocksize; +} diff --git a/fs/fs.c b/fs/fs.c index 2b847d83597b..23cfb1f5025b 100644 --- a/fs/fs.c +++ b/fs/fs.c @@ -375,7 +375,7 @@ static struct fstype_info fstypes[] = { .readdir = erofs_readdir, .ls = fs_ls_generic, .read = erofs_read, - .get_blocksize = fs_get_blocksize_unsupported, + .get_blocksize = erofs_get_blocksize, .size = erofs_size, .close = erofs_close, .closedir = erofs_closedir, diff --git a/include/erofs.h b/include/erofs.h index 1fbe82bf72cb..18bd6807c538 100644 --- a/include/erofs.h +++ b/include/erofs.h @@ -10,6 +10,7 @@ int erofs_probe(struct blk_desc *fs_dev_desc, struct disk_partition *fs_partition); int erofs_read(const char *filename, void *buf, loff_t offset, loff_t len, loff_t *actread); +int erofs_get_blocksize(const char *filename); int erofs_size(const char *filename, loff_t *size); int erofs_exists(const char *filename); void erofs_close(void); -- 2.37.0 From hsiangkao at linux.alibaba.com Wed Jul 
27 20:04:44 2022 From: hsiangkao at linux.alibaba.com (Gao Xiang) Date: Wed, 27 Jul 2022 18:04:44 +0800 Subject: [PATCH v4] erofs: update ctx->pos for every emitted dirent In-Reply-To: <20220722082732.30935-1-jefflexu@linux.alibaba.com> References: <20220722082732.30935-1-jefflexu@linux.alibaba.com> Message-ID: On Fri, Jul 22, 2022 at 04:27:32PM +0800, Jeffle Xu wrote: > From: Hongnan Li > > erofs_readdir update ctx->pos after filling a batch of dentries > and it may cause dir/files duplication for NFS readdirplus which > depends on ctx->pos to fill dir correctly. So update ctx->pos for > every emitted dirent in erofs_fill_dentries to fix it. > > Also fix the update of ctx->pos when the initial file position has > exceeded nameoff. > > Fixes: 3e917cc305c6 ("erofs: make filesystem exportable") > Signed-off-by: Hongnan Li > Signed-off-by: Jeffle Xu Reviewed-by: Gao Xiang Thanks, Gao Xiang From jnhuang at linux.alibaba.com Thu Jul 28 22:09:09 2022 From: jnhuang at linux.alibaba.com (Huang Jianan) Date: Thu, 28 Jul 2022 20:09:09 +0800 Subject: [PATCH v2 1/2] erofs-utils: fuse: introduce xattr support Message-ID: <20220728120910.61636-1-jnhuang@linux.alibaba.com> This implements xattr functionalities for erofsfuse. A large amount of code was adapted from Linux kernel. 
Signed-off-by: Huang Jianan --- fuse/main.c | 31 +++ include/erofs/internal.h | 8 + include/erofs/xattr.h | 21 ++ lib/xattr.c | 508 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 568 insertions(+) diff --git a/fuse/main.c b/fuse/main.c index 95f939e..dd1508e 100644 --- a/fuse/main.c +++ b/fuse/main.c @@ -139,7 +139,38 @@ static int erofsfuse_readlink(const char *path, char *buffer, size_t size) return 0; } +static int erofsfuse_getxattr(const char *path, const char *name, char *value, + size_t size) +{ + int ret; + struct erofs_inode vi; + + erofs_dbg("getxattr(%s): name=%s size=%llu", path, name, size); + + ret = erofs_ilookup(path, &vi); + if (ret) + return ret; + + return erofs_getxattr(&vi, name, value, size); +} + +static int erofsfuse_listxattr(const char *path, char *list, size_t size) +{ + int ret; + struct erofs_inode vi; + + erofs_dbg("listxattr(%s): size=%llu", path, size); + + ret = erofs_ilookup(path, &vi); + if (ret) + return ret; + + return erofs_listxattr(&vi, list, size); +} + static struct fuse_operations erofs_ops = { + .getxattr = erofsfuse_getxattr, + .listxattr = erofsfuse_listxattr, .readlink = erofsfuse_readlink, .getattr = erofsfuse_getattr, .readdir = erofsfuse_readdir, diff --git a/include/erofs/internal.h b/include/erofs/internal.h index 6a70f11..991635f 100644 --- a/include/erofs/internal.h +++ b/include/erofs/internal.h @@ -180,6 +180,9 @@ struct erofs_inode { unsigned int xattr_isize; unsigned int extent_isize; + unsigned int xattr_shared_count; + unsigned int *xattr_shared_xattrs; + erofs_nid_t nid; struct erofs_buffer_head *bh; struct erofs_buffer_head *bh_inline, *bh_data; @@ -351,6 +354,11 @@ static inline int erofs_get_occupied_size(const struct erofs_inode *inode, return 0; } +/* data.c */ +int erofs_getxattr(struct erofs_inode *vi, const char *name, char *buffer, + size_t buffer_size); +int erofs_listxattr(struct erofs_inode *vi, char *buffer, size_t buffer_size); + /* zmap.c */ int z_erofs_fill_inode(struct 
erofs_inode *vi); int z_erofs_map_blocks_iter(struct erofs_inode *vi, diff --git a/include/erofs/xattr.h b/include/erofs/xattr.h index 226e984..a0528c0 100644 --- a/include/erofs/xattr.h +++ b/include/erofs/xattr.h @@ -14,6 +14,27 @@ extern "C" #include "internal.h" +#ifndef ENOATTR +#define ENOATTR ENODATA +#endif + +static inline unsigned int inlinexattr_header_size(struct erofs_inode *vi) +{ + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; +} + +static inline erofs_blk_t xattrblock_addr(unsigned int xattr_id) +{ + return sbi.xattr_blkaddr + + xattr_id * sizeof(__u32) / EROFS_BLKSIZ; +} + +static inline unsigned int xattrblock_offset(unsigned int xattr_id) +{ + return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; +} + #define EROFS_INODE_XATTR_ICOUNT(_size) ({\ u32 __size = le16_to_cpu(_size); \ ((__size) == 0) ? 0 : \ diff --git a/lib/xattr.c b/lib/xattr.c index c8ce278..8979dcc 100644 --- a/lib/xattr.c +++ b/lib/xattr.c @@ -714,3 +714,511 @@ char *erofs_export_xattr_ibody(struct list_head *ixattrs, unsigned int size) DBG_BUGON(p > size); return buf; } + +struct xattr_iter { + char page[EROFS_BLKSIZ]; + + void *kaddr; + + erofs_blk_t blkaddr; + unsigned int ofs; +}; + +static int init_inode_xattrs(struct erofs_inode *vi) +{ + struct xattr_iter it; + unsigned int i; + struct erofs_xattr_ibody_header *ih; + int ret = 0; + + /* the most case is that xattrs of this inode are initialized. */ + if (vi->flags & EROFS_I_EA_INITED) + return ret; + + /* + * bypass all xattr operations if ->xattr_isize is not greater than + * sizeof(struct erofs_xattr_ibody_header), in detail: + * 1) it is not enough to contain erofs_xattr_ibody_header then + * ->xattr_isize should be 0 (it means no xattr); + * 2) it is just to contain erofs_xattr_ibody_header, which is on-disk + * undefined right now (maybe use later with some new sb feature). 
+ */ + if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { + erofs_err("xattr_isize %d of nid %llu is not supported yet", + vi->xattr_isize, vi->nid); + return -EOPNOTSUPP; + } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { + if (vi->xattr_isize) { + erofs_err("bogus xattr ibody @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; /* xattr ondisk layout error */ + } + return -ENOATTR; + } + + it.blkaddr = erofs_blknr(iloc(vi->nid) + vi->inode_isize); + it.ofs = erofs_blkoff(iloc(vi->nid) + vi->inode_isize); + + ret = blk_read(0, it.page, it.blkaddr, 1); + if (ret < 0) + return -EIO; + + it.kaddr = it.page; + ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = malloc(vi->xattr_shared_count * sizeof(uint)); + if (!vi->xattr_shared_xattrs) + return -ENOMEM; + + /* let's skip ibody header */ + it.ofs += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + if (it.ofs >= EROFS_BLKSIZ) { + /* cannot be unaligned */ + DBG_BUGON(it.ofs != EROFS_BLKSIZ); + + ret = blk_read(0, it.page, ++it.blkaddr, 1); + if (ret < 0) { + free(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + return -EIO; + } + + it.kaddr = it.page; + it.ofs = 0; + } + vi->xattr_shared_xattrs[i] = + le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); + it.ofs += sizeof(__le32); + } + + vi->flags |= EROFS_I_EA_INITED; + + return ret; +} + +/* + * the general idea for these return values is + * if 0 is returned, go on processing the current xattr; + * 1 (> 0) is returned, skip this round to process the next xattr; + * -err (< 0) is returned, an error (maybe ENOXATTR) occurred + * and need to be handled + */ +struct xattr_iter_handlers { + int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); + int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); + int (*alloc_buffer)(struct 
xattr_iter *_it, unsigned int value_sz); + void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); +}; + +static inline int xattr_iter_fixup(struct xattr_iter *it) +{ + int ret; + + if (it->ofs < EROFS_BLKSIZ) + return 0; + + it->blkaddr += erofs_blknr(it->ofs); + + ret = blk_read(0, it->page, it->blkaddr, 1); + if (ret < 0) + return -EIO; + + it->kaddr = it->page; + it->ofs = erofs_blkoff(it->ofs); + return 0; +} + +static int inline_xattr_iter_pre(struct xattr_iter *it, + struct erofs_inode *vi) +{ + unsigned int xattr_header_sz, inline_xattr_ofs; + int ret; + + xattr_header_sz = inlinexattr_header_size(vi); + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + inline_xattr_ofs = vi->inode_isize + xattr_header_sz; + + it->blkaddr = erofs_blknr(iloc(vi->nid) + inline_xattr_ofs); + it->ofs = erofs_blkoff(iloc(vi->nid) + inline_xattr_ofs); + + ret = blk_read(0, it->page, it->blkaddr, 1); + if (ret < 0) + return -EIO; + + it->kaddr = it->page; + return vi->xattr_isize - xattr_header_sz; +} + +/* + * Regardless of success or failure, `xattr_foreach' will end up with + * `ofs' pointing to the next xattr item rather than an arbitrary position. + */ +static int xattr_foreach(struct xattr_iter *it, + const struct xattr_iter_handlers *op, + unsigned int *tlimit) +{ + struct erofs_xattr_entry entry; + unsigned int value_sz, processed, slice; + int err; + + /* 0. fixup blkaddr, ofs, ipage */ + err = xattr_iter_fixup(it); + if (err) + return err; + + /* + * 1. 
read xattr entry to the memory, + * since we do EROFS_XATTR_ALIGN + * therefore entry should be in the page + */ + entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); + if (tlimit) { + unsigned int entry_sz = erofs_xattr_entry_size(&entry); + + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (*tlimit < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + *tlimit -= entry_sz; + } + + it->ofs += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* handle entry */ + err = op->entry(it, &entry); + if (err) { + it->ofs += entry.e_name_len + value_sz; + goto out; + } + + /* 2. handle xattr name (ofs will finally be at the end of name) */ + processed = 0; + + while (processed < entry.e_name_len) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); + + /* handle name */ + err = op->name(it, processed, it->kaddr + it->ofs, slice); + if (err) { + it->ofs += entry.e_name_len - processed + value_sz; + goto out; + } + + it->ofs += slice; + processed += slice; + } + + /* 3. handle xattr value */ + processed = 0; + + if (op->alloc_buffer) { + err = op->alloc_buffer(it, value_sz); + if (err) { + it->ofs += value_sz; + goto out; + } + } + + while (processed < value_sz) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + value_sz - processed); + op->value(it, processed, it->kaddr + it->ofs, slice); + it->ofs += slice; + processed += slice; + } + +out: + /* xattrs should be 4-byte aligned (on-disk constraint) */ + it->ofs = EROFS_XATTR_ALIGN(it->ofs); + return err < 0 ? 
err : 0; +} + +struct getxattr_iter { + struct xattr_iter it; + + int buffer_size, index; + char *buffer; + const char *name; + size_t len; +}; + +static int xattr_entrymatch(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return (it->index != entry->e_name_index || + it->len != entry->e_name_len) ? -ENOATTR : 0; +} + +static int xattr_namematch(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + + return memcmp(buf, it->name + processed, len) ? -ENOATTR : 0; +} + +static int xattr_checkbuffer(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + int err = it->buffer_size < value_sz ? -ERANGE : 0; + + it->buffer_size = value_sz; + return !it->buffer ? 1 : err; +} + +static void xattr_copyvalue(struct xattr_iter *_it, + unsigned int processed, + char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + memcpy(it->buffer + processed, buf, len); +} + +static const struct xattr_iter_handlers find_xattr_handlers = { + .entry = xattr_entrymatch, + .name = xattr_namematch, + .alloc_buffer = xattr_checkbuffer, + .value = xattr_copyvalue +}; + +static int inline_getxattr(struct erofs_inode *vi, struct getxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_pre(&it->it, vi); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); + if (ret != -ENOATTR) + break; + } + + return ret ? 
ret : it->buffer_size; +} + +static int shared_getxattr(struct erofs_inode *vi, struct getxattr_iter *it) +{ + unsigned int i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(vi->xattr_shared_xattrs[i]); + + if (!i || blkaddr != it->it.blkaddr) { + ret = blk_read(0, it->it.page, blkaddr, 1); + if (ret < 0) + return -EIO; + + it->it.kaddr = it->it.page; + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); + if (ret != -ENOATTR) + break; + } + + return ret ? ret : it->buffer_size; +} + +int erofs_getxattr(struct erofs_inode *vi, const char *name, char *buffer, + size_t buffer_size) +{ + int ret; + u8 prefix; + u16 prefixlen; + struct getxattr_iter it; + + if (!name) + return -EINVAL; + + ret = init_inode_xattrs(vi); + if (ret) + return ret; + + if (!match_prefix(name, &prefix, &prefixlen)) + return -ENODATA; + + it.index = prefix; + it.name = name + prefixlen; + it.len = strlen(it.name); + if (it.len > EROFS_NAME_LEN) + return -ERANGE; + + it.buffer = buffer; + it.buffer_size = buffer_size; + + ret = inline_getxattr(vi, &it); + if (ret == -ENOATTR) + ret = shared_getxattr(vi, &it); + return ret; +} + +struct listxattr_iter { + struct xattr_iter it; + + char *buffer; + int buffer_size, buffer_ofs; +}; + +static int xattr_entrylist(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + unsigned int prefix_len; + const char *prefix; + + prefix = xattr_types[entry->e_name_index].prefix; + prefix_len = xattr_types[entry->e_name_index].prefix_len; + + if (!it->buffer) { + it->buffer_ofs += prefix_len + entry->e_name_len + 1; + return 1; + } + + if (it->buffer_ofs + prefix_len + + entry->e_name_len + 1 > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + it->buffer_ofs += 
prefix_len; + return 0; +} + +static int xattr_namelist(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + memcpy(it->buffer + it->buffer_ofs, buf, len); + it->buffer_ofs += len; + return 0; +} + +static int xattr_skipvalue(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + it->buffer[it->buffer_ofs++] = '\0'; + return 1; +} + +static const struct xattr_iter_handlers list_xattr_handlers = { + .entry = xattr_entrylist, + .name = xattr_namelist, + .alloc_buffer = xattr_skipvalue, + .value = NULL +}; + +static int inline_listxattr(struct erofs_inode *vi, struct listxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_pre(&it->it, vi); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); + if (ret) + break; + } + + return ret ? ret : it->buffer_ofs; +} + +static int shared_listxattr(struct erofs_inode *vi, struct listxattr_iter *it) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(vi->xattr_shared_xattrs[i]); + if (!i || blkaddr != it->it.blkaddr) { + ret = blk_read(0, it->it.page, blkaddr, 1); + if (ret < 0) + return -EIO; + + it->it.kaddr = it->it.page; + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); + if (ret) + break; + } + + return ret ? 
ret : it->buffer_ofs; +} + +int erofs_listxattr(struct erofs_inode *vi, char *buffer, size_t buffer_size) +{ + int ret; + struct listxattr_iter it; + + ret = init_inode_xattrs(vi); + if (ret == -ENOATTR) + return 0; + if (ret) + return ret; + + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = inline_listxattr(vi, &it); + if (ret < 0 && ret != -ENOATTR) + return ret; + return shared_listxattr(vi, &it); +} -- 2.34.1 From jnhuang at linux.alibaba.com Thu Jul 28 22:09:10 2022 From: jnhuang at linux.alibaba.com (Huang Jianan) Date: Thu, 28 Jul 2022 20:09:10 +0800 Subject: [PATCH v2 2/2] erofs-utils: tests: add random test for xattrs In-Reply-To: <20220728120910.61636-1-jnhuang@linux.alibaba.com> References: <20220728120910.61636-1-jnhuang@linux.alibaba.com> Message-ID: <20220728120910.61636-2-jnhuang@linux.alibaba.com> Add random functional check for xattrs. Signed-off-by: Huang Jianan --- tests/Makefile.am | 3 ++ tests/erofs/020 | 72 +++++++++++++++++++++++++++++++++++++++++++++ tests/erofs/020.out | 2 ++ 3 files changed, 77 insertions(+) create mode 100755 tests/erofs/020 create mode 100644 tests/erofs/020.out diff --git a/tests/Makefile.am b/tests/Makefile.am index b85ae89..d486ce3 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -91,6 +91,9 @@ TESTS += erofs/018 # 019 - check extended attribute functionality TESTS += erofs/019 +# 020 - test random extended attribute functionality for mkfs and fuse +TESTS += erofs/020 + EXTRA_DIST = common/rc erofs clean-local: clean-local-check diff --git a/tests/erofs/020 b/tests/erofs/020 new file mode 100755 index 0000000..77409dd --- /dev/null +++ b/tests/erofs/020 @@ -0,0 +1,72 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# 020 - check extended attribute functionality +# +seq=`basename $0` +seqres=$RESULT_DIR/$(echo $0 | awk '{print $((NF-1))"/"$NF}' FS="/") + +# get standard environment, filters and checks +. 
"${srcdir}/common/rc" +cleanup() +{ + cd / + rm -rf $tmp.* +} + +check_xattrs() +{ + _scratch_mount 2>>$seqres.full + + # check xattrs + for d in $dirs; do + xattr1=`getfattr --absolute-names -d $localdir/$d | tail -n+2` + xattr2=`getfattr --absolute-names -d $SCRATCH_MNT/$d | tail -n+2` + [ "x$xattr1" = "x$xattr2" ] || _fail "-->check xattrs FAILED" + done + + _scratch_unmount +} + +_require_erofs + +# remove previous $seqres.full before test +rm -f $seqres.full + +# real QA test starts here +echo "QA output created by $seq" + +have_attr=`which setfattr` +[ -z "$have_attr" ] && \ + _notrun "attr isn't installed, skipped." + +if [ -z $SCRATCH_DEV ]; then + SCRATCH_DEV=$tmp/erofs_$seq.img + rm -f SCRATCH_DEV +fi + +localdir="$tmp/$seq" +rm -rf $localdir +mkdir -p $localdir + +# set random xattrs +cp -nR ../ $localdir +dirs=`ls $localdir` +for d in $dirs; do + for i in `seq $((RANDOM % 20))`; do + key=`head -20 /dev/urandom | cksum | cut -f1 -d " "` + val="0s"`head -3 /dev/urandom | base64 -w0` + setfattr -n user.$key -v $val $localdir/$d + done +done + +_scratch_mkfs $localdir >> $seqres.full 2>&1 || _fail "failed to mkfs" +check_xattrs + +FSTYP="erofsfuse" +check_xattrs + +echo Silence is golden +status=0 +exit 0 diff --git a/tests/erofs/020.out b/tests/erofs/020.out new file mode 100644 index 0000000..20d7944 --- /dev/null +++ b/tests/erofs/020.out @@ -0,0 +1,2 @@ +QA output created by 020 +Silence is golden -- 2.34.1 From jnhuang at linux.alibaba.com Thu Jul 28 22:11:36 2022 From: jnhuang at linux.alibaba.com (Huang Jianan) Date: Thu, 28 Jul 2022 20:11:36 +0800 Subject: [PATCH] erofs-utils: fuse: introduce xattr support In-Reply-To: References: <20220715095359.37534-1-jnhuang@linux.alibaba.com> Message-ID: On 2022/7/19 18:21, Gao Xiang wrote: > Hi Jianan, > > On Fri, Jul 15, 2022 at 05:53:59PM +0800, Huang Jianan wrote: >> This implements xattr functionalities for erofsfuse. A large amount >> of code was adapted from Linux kernel. 
>> >> Signed-off-by: Huang Jianan >> --- >> fuse/main.c | 32 +++ >> include/erofs/internal.h | 8 + >> include/erofs/xattr.h | 21 ++ >> lib/xattr.c | 508 +++++++++++++++++++++++++++++++++++++++ >> 4 files changed, 569 insertions(+) >> >> diff --git a/fuse/main.c b/fuse/main.c >> index f4c2476..30a0bed 100644 >> --- a/fuse/main.c >> +++ b/fuse/main.c >> @@ -139,7 +139,39 @@ static int erofsfuse_readlink(const char *path, char *buffer, size_t size) >> return 0; >> } >> >> +static int erofsfuse_getxattr(const char *path, const char *name, char *value, >> + size_t size) >> +{ >> + int ret; >> + struct erofs_inode vi; >> + >> + erofs_dbg("getxattr(%s): name=%s size=%llu", path, name, size); >> + >> + ret = erofs_ilookup(path, &vi); >> + if (ret) >> + return ret; >> + >> + return erofs_getxattr(&vi, name, value, size); >> +} >> + >> +static int erofsfuse_listxattr(const char *path, char *list, size_t size) >> +{ >> + int ret; >> + struct erofs_inode vi; >> + int i; > > As we discussed offline, this line should be unneeded. A new patch has been sent, in addition a test case has been added. Thanks, Jianan > > Thanks, > Gao Xiang From lkp at intel.com Fri Jul 29 01:08:06 2022 From: lkp at intel.com (kernel test robot) Date: Thu, 28 Jul 2022 23:08:06 +0800 Subject: [xiang-erofs:dev] BUILD SUCCESS 4cdfa6ef7aa015cb922af4608e53de0c62638010 Message-ID: <62e2a656.IRFsen2qCJ/JwRij%lkp@intel.com> tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev branch HEAD: 4cdfa6ef7aa015cb922af4608e53de0c62638010 erofs: update ctx->pos for every emitted dirent elapsed time: 1100m configs tested: 135 configs skipped: 6 The following configs have been built successfully. More configs may be tested in the coming days. 
gcc tested configs: i386 defconfig i386 allyesconfig arc randconfig-r043-20220727 x86_64 randconfig-a002 x86_64 randconfig-a004 x86_64 randconfig-a006 um x86_64_defconfig um i386_defconfig csky allnoconfig alpha allnoconfig arc allnoconfig riscv allnoconfig arm64 allyesconfig arm defconfig arm allyesconfig x86_64 rhel-8.3-kvm x86_64 rhel-8.3-func x86_64 rhel-8.3-syz x86_64 rhel-8.3-kselftests x86_64 rhel-8.3-kunit arc vdk_hs38_defconfig mips ar7_defconfig m68k stmark2_defconfig sh lboxre2_defconfig powerpc allnoconfig mips allyesconfig powerpc allmodconfig sh allmodconfig i386 randconfig-a012 i386 randconfig-a014 i386 randconfig-a016 x86_64 defconfig x86_64 allyesconfig x86_64 rhel-8.3 m68k allyesconfig m68k allmodconfig arc allyesconfig alpha allyesconfig m68k amcore_defconfig arm lart_defconfig sh se7712_defconfig mips decstation_defconfig xtensa cadence_csp_defconfig arc defconfig powerpc pasemi_defconfig powerpc mpc837x_rdb_defconfig x86_64 randconfig-a011 x86_64 randconfig-a013 x86_64 randconfig-a015 powerpc ps3_defconfig arm mvebu_v7_defconfig arm vf610m4_defconfig xtensa common_defconfig i386 randconfig-c001 sparc sparc64_defconfig arm sama5_defconfig arm qcom_defconfig i386 debian-10.3-kvm i386 debian-10.3-kunit i386 debian-10.3-func arm hisi_defconfig m68k m5272c3_defconfig sh rsk7264_defconfig mips decstation_64_defconfig arm assabet_defconfig sh ecovec24_defconfig powerpc mpc8540_ads_defconfig sh sh7770_generic_defconfig m68k amiga_defconfig arm cm_x300_defconfig ia64 defconfig xtensa smp_lx200_defconfig parisc64 defconfig csky alldefconfig arm viper_defconfig mips randconfig-c004-20220728 powerpc randconfig-c003-20220728 loongarch defconfig loongarch allnoconfig riscv nommu_virt_defconfig riscv rv32_defconfig riscv nommu_k210_defconfig i386 debian-10.3-kselftests i386 debian-10.3 xtensa allyesconfig arm u8500_defconfig powerpc linkstation_defconfig s390 randconfig-r044-20220728 riscv randconfig-r042-20220728 arc randconfig-r043-20220728 nios2 
allyesconfig nios2 defconfig parisc defconfig parisc allyesconfig powerpc amigaone_defconfig sh landisk_defconfig ia64 generic_defconfig arc hsdk_defconfig s390 defconfig s390 allmodconfig alpha defconfig s390 allyesconfig clang tested configs: hexagon randconfig-r041-20220727 hexagon randconfig-r045-20220727 x86_64 randconfig-a001 x86_64 randconfig-a003 riscv randconfig-r042-20220727 s390 randconfig-r044-20220727 x86_64 randconfig-a005 x86_64 randconfig-a014 x86_64 randconfig-a016 hexagon randconfig-r041-20220728 hexagon randconfig-r045-20220728 i386 randconfig-a002 i386 randconfig-a006 i386 randconfig-a004 x86_64 randconfig-a012 powerpc mvme5100_defconfig powerpc pmac32_defconfig arm spear3xx_defconfig x86_64 randconfig-k001 i386 randconfig-a011 i386 randconfig-a013 i386 randconfig-a015 powerpc mpc8315_rdb_defconfig mips pic32mzda_defconfig hexagon defconfig arm ixp4xx_defconfig mips randconfig-c004-20220728 x86_64 randconfig-c007 s390 randconfig-c005-20220728 powerpc randconfig-c003-20220728 i386 randconfig-c001 riscv randconfig-c006-20220728 arm randconfig-c002-20220728 mips malta_qemu_32r6_defconfig powerpc gamecube_defconfig arm collie_defconfig -- 0-DAY CI Kernel Test Service https://01.org/lkp From desarrollo.micromedia at micromedia.es Sat Jul 30 11:42:28 2022 From: desarrollo.micromedia at micromedia.es (MICROMEDIA) Date: Sat, 30 Jul 2022 03:42:28 +0200 Subject: Alta NewsLetter Micromedia Message-ID: An HTML attachment was scrubbed... URL: From heinrich.schuchardt at canonical.com Sun Jul 31 19:10:06 2022 From: heinrich.schuchardt at canonical.com (Heinrich Schuchardt) Date: Sun, 31 Jul 2022 11:10:06 +0200 Subject: [PATCH 1/1] fs/erofs: silence erofs_probe() Message-ID: <20220731091006.50073-1-heinrich.schuchardt@canonical.com> fs_set_blk_dev() probes all file-systems until it finds one that matches the volume. We do not expect any console output for non-matching file-systems. Convert error messages in erofs_read_superblock() to debug output. 
Fixes: 830613f8f5bb ("fs/erofs: add erofs filesystem support") Signed-off-by: Heinrich Schuchardt --- fs/erofs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 4cca322b9e..095754dc28 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -65,14 +65,14 @@ int erofs_read_superblock(void) ret = erofs_blk_read(data, 0, 1); if (ret < 0) { - erofs_err("cannot read erofs superblock: %d", ret); + erofs_dbg("cannot read erofs superblock: %d", ret); return -EIO; } dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { - erofs_err("cannot find valid erofs superblock"); + erofs_dbg("cannot find valid erofs superblock"); return ret; } @@ -81,7 +81,7 @@ int erofs_read_superblock(void) blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { - erofs_err("blksize %u isn't supported on this platform", + erofs_dbg("blksize %u isn't supported on this platform", 1 << blkszbits); return ret; } -- 2.36.1 From chao at kernel.org Sun Jul 31 23:18:05 2022 From: chao at kernel.org (Chao Yu) Date: Sun, 31 Jul 2022 21:18:05 +0800 Subject: [PATCH v4] erofs: update ctx->pos for every emitted dirent In-Reply-To: <20220722082732.30935-1-jefflexu@linux.alibaba.com> References: <20220722082732.30935-1-jefflexu@linux.alibaba.com> Message-ID: <368e99d1-bc41-976a-094b-816e80ef6367@kernel.org> On 2022/7/22 16:27, Jeffle Xu wrote: > From: Hongnan Li > > erofs_readdir update ctx->pos after filling a batch of dentries > and it may cause dir/files duplication for NFS readdirplus which > depends on ctx->pos to fill dir correctly. So update ctx->pos for > every emitted dirent in erofs_fill_dentries to fix it. > > Also fix the update of ctx->pos when the initial file position has > exceeded nameoff. 
> > Fixes: 3e917cc305c6 ("erofs: make filesystem exportable") > Signed-off-by: Hongnan Li > Signed-off-by: Jeffle Xu Reviewed-by: Chao Yu Thanks, From jnhuang95 at gmail.com Sun Jul 31 23:49:24 2022 From: jnhuang95 at gmail.com (Huang Jianan) Date: Sun, 31 Jul 2022 21:49:24 +0800 Subject: [PATCH 1/1] fs/erofs: silence erofs_probe() In-Reply-To: <20220731091006.50073-1-heinrich.schuchardt@canonical.com> References: <20220731091006.50073-1-heinrich.schuchardt@canonical.com> Message-ID: <32a4a652-47ec-cee9-a7ed-f0cc15ab5872@gmail.com> On 2022/7/31 17:10, Heinrich Schuchardt wrote: > fs_set_blk_dev() probes all file-systems until it finds one that matches > the volume. We do not expect any console output for non-matching > file-systems. > > Convert error messages in erofs_read_superblock() to debug output. > > Fixes: 830613f8f5bb ("fs/erofs: add erofs filesystem support") > Signed-off-by: Heinrich Schuchardt > --- > fs/erofs/super.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/fs/erofs/super.c b/fs/erofs/super.c > index 4cca322b9e..095754dc28 100644 > --- a/fs/erofs/super.c > +++ b/fs/erofs/super.c > @@ -65,14 +65,14 @@ int erofs_read_superblock(void) > > ret = erofs_blk_read(data, 0, 1); > if (ret < 0) { > - erofs_err("cannot read erofs superblock: %d", ret); > + erofs_dbg("cannot read erofs superblock: %d", ret); > return -EIO; > } > dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); > > ret = -EINVAL; > if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { > - erofs_err("cannot find valid erofs superblock"); > + erofs_dbg("cannot find valid erofs superblock"); > return ret; > } > > @@ -81,7 +81,7 @@ int erofs_read_superblock(void) > blkszbits = dsb->blkszbits; > /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ > if (blkszbits != LOG_BLOCK_SIZE) { > - erofs_err("blksize %u isn't supported on this platform", > + erofs_dbg("blksize %u isn't supported on this platform", > 1 << blkszbits); > return ret; > } 
Reviewed-by: Huang Jianan Thanks, Jianan