[PATCH v4 2/2] erofs-utils: mkfs: allow to specify dictionary size for compression algorithms

Yifan Zhao zhaoyifan at sjtu.edu.cn
Sat Jan 20 22:53:19 AEDT 2024


Currently, the dictionary size for compression algorithms is fixed. This
patch allows to specify different ones with new -zX,dictsize=<dictsize>
options.

This patch also changes the way to specify compression levels. Now, the
compression level is specified with -zX,level=<level> options and could
be specified together with dictsize. The old -zX,<level> form is still
supported for compatibility.

Suggested-by: Gao Xiang <hsiangkao at linux.alibaba.com>
Signed-off-by: Yifan Zhao <zhaoyifan at sjtu.edu.cn>
---
 include/erofs/config.h   |  10 ++--
 lib/compress.c           |  33 ++++++++-----
 lib/compress_hints.c     |   2 +-
 lib/compressor.c         |  16 +++++-
 lib/compressor.h         |   6 ++-
 lib/compressor_deflate.c |  26 ++++++++--
 lib/compressor_liblzma.c |  34 ++++++++-----
 lib/config.c             |   4 +-
 lib/inode.c              |   2 +-
 mkfs/main.c              | 104 +++++++++++++++++++++++++++++++--------
 10 files changed, 180 insertions(+), 57 deletions(-)

diff --git a/include/erofs/config.h b/include/erofs/config.h
index 89fe522..eecf575 100644
--- a/include/erofs/config.h
+++ b/include/erofs/config.h
@@ -34,6 +34,12 @@ enum {
 
 #define EROFS_MAX_COMPR_CFGS		64
 
+struct erofs_compr_opts {
+	char *alg;
+	int level;
+	u32 dict_size;
+};
+
 struct erofs_configure {
 	const char *c_version;
 	int c_dbg_lvl;
@@ -64,8 +70,7 @@ struct erofs_configure {
 	char *c_src_path;
 	char *c_blobdev_path;
 	char *c_compress_hints_file;
-	char *c_compr_alg[EROFS_MAX_COMPR_CFGS];
-	int c_compr_level[EROFS_MAX_COMPR_CFGS];
+	struct erofs_compr_opts c_compr_opts[EROFS_MAX_COMPR_CFGS];
 	char c_force_inodeversion;
 	char c_force_chunkformat;
 	/* < 0, xattr disabled and INT_MAX, always use inline xattrs */
@@ -73,7 +78,6 @@ struct erofs_configure {
 
 	u32 c_pclusterblks_max, c_pclusterblks_def, c_pclusterblks_packed;
 	u32 c_max_decompressed_extent_bytes;
-	u32 c_dict_size;
 	u64 c_unix_timestamp;
 	u32 c_uid, c_gid;
 	const char *mount_point;
diff --git a/lib/compress.c b/lib/compress.c
index 3ea735c..ea9d00d 100644
--- a/lib/compress.c
+++ b/lib/compress.c
@@ -1123,7 +1123,8 @@ err_free_meta:
 }
 
 static int z_erofs_build_compr_cfgs(struct erofs_sb_info *sbi,
-				    struct erofs_buffer_head *sb_bh)
+				    struct erofs_buffer_head *sb_bh,
+				    u32 *max_dict_size)
 {
 	struct erofs_buffer_head *bh = sb_bh;
 	int ret = 0;
@@ -1159,7 +1160,9 @@ static int z_erofs_build_compr_cfgs(struct erofs_sb_info *sbi,
 		} __packed lzmaalg = {
 			.size = cpu_to_le16(sizeof(struct z_erofs_lzma_cfgs)),
 			.lzma = {
-				.dict_size = cpu_to_le32(cfg.c_dict_size),
+				.dict_size = cpu_to_le32(
+					max_dict_size
+						[Z_EROFS_COMPRESSION_LZMA]),
 			}
 		};
 
@@ -1181,8 +1184,9 @@ static int z_erofs_build_compr_cfgs(struct erofs_sb_info *sbi,
 		} __packed zalg = {
 			.size = cpu_to_le16(sizeof(struct z_erofs_deflate_cfgs)),
 			.z = {
-				.windowbits =
-					cpu_to_le32(ilog2(cfg.c_dict_size)),
+				.windowbits = cpu_to_le32(ilog2(
+					max_dict_size
+						[Z_EROFS_COMPRESSION_DEFLATE])),
 			}
 		};
 
@@ -1202,11 +1206,14 @@ static int z_erofs_build_compr_cfgs(struct erofs_sb_info *sbi,
 int z_erofs_compress_init(struct erofs_sb_info *sbi, struct erofs_buffer_head *sb_bh)
 {
 	int i, ret;
+	u32 max_dict_size[Z_EROFS_COMPRESSION_MAX];
 
-	for (i = 0; cfg.c_compr_alg[i]; ++i) {
+	for (i = 0; cfg.c_compr_opts[i].alg; ++i) {
 		struct erofs_compress *c = &erofs_ccfg[i].handle;
 
-		ret = erofs_compressor_init(sbi, c, cfg.c_compr_alg[i], cfg.c_compr_level[i]);
+		ret = erofs_compressor_init(sbi, c, cfg.c_compr_opts[i].alg,
+					    cfg.c_compr_opts[i].level,
+					    cfg.c_compr_opts[i].dict_size);
 		if (ret)
 			return ret;
 
@@ -1216,17 +1223,21 @@ int z_erofs_compress_init(struct erofs_sb_info *sbi, struct erofs_buffer_head *s
 		sbi->available_compr_algs |= 1 << erofs_ccfg[i].algorithmtype;
 		if (erofs_ccfg[i].algorithmtype != Z_EROFS_COMPRESSION_LZ4)
 			erofs_sb_set_compr_cfgs(sbi);
+		max_dict_size[erofs_ccfg[i].algorithmtype] =
+			max(max_dict_size[erofs_ccfg[i].algorithmtype],
+			    c->dict_size);
 	}
 
 	/*
 	 * if primary algorithm is empty (e.g. compression off),
 	 * clear 0PADDING feature for old kernel compatibility.
 	 */
-	if (!cfg.c_compr_alg[0] ||
-	    (cfg.c_legacy_compress && !strncmp(cfg.c_compr_alg[0], "lz4", 3)))
+	if (!cfg.c_compr_opts[0].alg ||
+	    (cfg.c_legacy_compress &&
+	     !strncmp(cfg.c_compr_opts[0].alg, "lz4", 3)))
 		erofs_sb_clear_lz4_0padding(sbi);
 
-	if (!cfg.c_compr_alg[0])
+	if (!cfg.c_compr_opts[0].alg)
 		return 0;
 
 	/*
@@ -1248,7 +1259,7 @@ int z_erofs_compress_init(struct erofs_sb_info *sbi, struct erofs_buffer_head *s
 	}
 
 	if (erofs_sb_has_compr_cfgs(sbi))
-		return z_erofs_build_compr_cfgs(sbi, sb_bh);
+		return z_erofs_build_compr_cfgs(sbi, sb_bh, max_dict_size);
 	return 0;
 }
 
@@ -1256,7 +1267,7 @@ int z_erofs_compress_exit(void)
 {
 	int i, ret;
 
-	for (i = 0; cfg.c_compr_alg[i]; ++i) {
+	for (i = 0; cfg.c_compr_opts[i].alg; ++i) {
 		ret = erofs_compressor_exit(&erofs_ccfg[i].handle);
 		if (ret)
 			return ret;
diff --git a/lib/compress_hints.c b/lib/compress_hints.c
index afc9f8f..8b78f80 100644
--- a/lib/compress_hints.c
+++ b/lib/compress_hints.c
@@ -125,7 +125,7 @@ int erofs_load_compress_hints(struct erofs_sb_info *sbi)
 		} else {
 			ccfg = atoi(alg);
 			if (ccfg >= EROFS_MAX_COMPR_CFGS ||
-			    !cfg.c_compr_alg[ccfg]) {
+			    !cfg.c_compr_opts[ccfg].alg) {
 				erofs_err("invalid compressing configuration \"%s\" at line %u",
 					  alg, line);
 				ret = -EINVAL;
diff --git a/lib/compressor.c b/lib/compressor.c
index 295aa47..9f8d220 100644
--- a/lib/compressor.c
+++ b/lib/compressor.c
@@ -78,7 +78,7 @@ int erofs_compress_destsize(const struct erofs_compress *c,
 }
 
 int erofs_compressor_init(struct erofs_sb_info *sbi, struct erofs_compress *c,
-			  char *alg_name, int compression_level)
+			  char *alg_name, int compression_level, u32 dict_size)
 {
 	int ret, i;
 
@@ -116,6 +116,20 @@ int erofs_compressor_init(struct erofs_sb_info *sbi, struct erofs_compress *c,
 				  alg_name);
 			return -EINVAL;
 		}
+
+		if (erofs_algs[i].c->setdictsize) {
+			ret = erofs_algs[i].c->setdictsize(c, dict_size);
+			if (ret) {
+				erofs_err("failed to set dict size %u for %s",
+					  dict_size, alg_name);
+				return ret;
+			}
+		} else if (dict_size) {
+			erofs_err("dict size is not supported for %s",
+				  alg_name);
+			return -EINVAL;
+		}
+
 		if (!ret) {
 			c->alg = &erofs_algs[i];
 			return 0;
diff --git a/lib/compressor.h b/lib/compressor.h
index ec5485d..d8ccf2e 100644
--- a/lib/compressor.h
+++ b/lib/compressor.h
@@ -14,10 +14,13 @@ struct erofs_compress;
 struct erofs_compressor {
 	int default_level;
 	int best_level;
+	u32 default_dictsize;
+	u32 max_dictsize;
 
 	int (*init)(struct erofs_compress *c);
 	int (*exit)(struct erofs_compress *c);
 	int (*setlevel)(struct erofs_compress *c, int compression_level);
+	int (*setdictsize)(struct erofs_compress *c, u32 dict_size);
 
 	int (*compress_destsize)(const struct erofs_compress *c,
 				 const void *src, unsigned int *srcsize,
@@ -39,6 +42,7 @@ struct erofs_compress {
 
 	unsigned int compress_threshold;
 	unsigned int compression_level;
+	unsigned int dict_size;
 
 	void *private_data;
 };
@@ -56,7 +60,7 @@ int erofs_compress_destsize(const struct erofs_compress *c,
 			    void *dst, unsigned int dstsize);
 
 int erofs_compressor_init(struct erofs_sb_info *sbi, struct erofs_compress *c,
-			  char *alg_name, int compression_level);
+			  char *alg_name, int compression_level, u32 dict_size);
 int erofs_compressor_exit(struct erofs_compress *c);
 
 #endif
diff --git a/lib/compressor_deflate.c b/lib/compressor_deflate.c
index 4e5902e..aa2ff24 100644
--- a/lib/compressor_deflate.c
+++ b/lib/compressor_deflate.c
@@ -46,6 +46,16 @@ static int compressor_deflate_init(struct erofs_compress *c)
 
 static int erofs_compressor_deflate_setlevel(struct erofs_compress *c,
 					     int compression_level)
+{
+	if (compression_level < 0)
+		compression_level = erofs_compressor_deflate.default_level;
+
+	c->compression_level = compression_level;
+	return 0;
+}
+
+static int erofs_compressor_deflate_setdictsize(struct erofs_compress *c,
+						u32 dict_size)
 {
 	void *s;
 
@@ -54,23 +64,31 @@ static int erofs_compressor_deflate_setlevel(struct erofs_compress *c,
 		c->private_data = NULL;
 	}
 
-	if (compression_level < 0)
-		compression_level = erofs_compressor_deflate.default_level;
+	if (dict_size > erofs_compressor_deflate.max_dictsize) {
+		erofs_err("dict size %u is too large", dict_size);
+		return -EINVAL;
+	}
+
+	if (dict_size == 0)
+		dict_size = erofs_compressor_deflate.default_dictsize;
 
-	s = kite_deflate_init(compression_level, cfg.c_dict_size);
+	s = kite_deflate_init(c->compression_level, dict_size);
 	if (IS_ERR(s))
 		return PTR_ERR(s);
 
 	c->private_data = s;
-	c->compression_level = compression_level;
+	c->dict_size = dict_size;
 	return 0;
 }
 
 const struct erofs_compressor erofs_compressor_deflate = {
 	.default_level = 1,
 	.best_level = 9,
+	.default_dictsize = 1 << 15,
+	.max_dictsize = 1 << 15,
 	.init = compressor_deflate_init,
 	.exit = compressor_deflate_exit,
 	.setlevel = erofs_compressor_deflate_setlevel,
+	.setdictsize = erofs_compressor_deflate_setdictsize,
 	.compress_destsize = deflate_compress_destsize,
 };
diff --git a/lib/compressor_liblzma.c b/lib/compressor_liblzma.c
index 0ed6f23..a9551e2 100644
--- a/lib/compressor_liblzma.c
+++ b/lib/compressor_liblzma.c
@@ -68,22 +68,29 @@ static int erofs_compressor_liblzma_setlevel(struct erofs_compress *c,
 	if (lzma_lzma_preset(&ctx->opt, preset))
 		return -EINVAL;
 
-	/* XXX: temporary hack */
-	if (cfg.c_dict_size) {
-		if (cfg.c_dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE) {
-			erofs_err("dict size %u is too large", cfg.c_dict_size);
-			return -EINVAL;
-		}
-		ctx->opt.dict_size = cfg.c_dict_size;
-	} else {
-		if (ctx->opt.dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE)
-			ctx->opt.dict_size = Z_EROFS_LZMA_MAX_DICT_SIZE;
-		cfg.c_dict_size = ctx->opt.dict_size;
-	}
 	c->compression_level = compression_level;
 	return 0;
 }
 
+static int erofs_compressor_liblzma_setdictsize(struct erofs_compress *c,
+						u32 dict_size)
+{
+	struct erofs_liblzma_context *ctx = c->private_data;
+
+	if (dict_size > erofs_compressor_lzma.max_dictsize ||
+	    dict_size < 4096) {
+		erofs_err("invalid dict size %u", dict_size);
+		return -EINVAL;
+	}
+
+	if (dict_size == 0)
+		dict_size = erofs_compressor_lzma.default_dictsize;
+
+	ctx->opt.dict_size = dict_size;
+	c->dict_size = dict_size;
+	return 0;
+}
+
 static int erofs_compressor_liblzma_init(struct erofs_compress *c)
 {
 	struct erofs_liblzma_context *ctx;
@@ -101,9 +108,12 @@ static int erofs_compressor_liblzma_init(struct erofs_compress *c)
 const struct erofs_compressor erofs_compressor_lzma = {
 	.default_level = LZMA_PRESET_DEFAULT,
 	.best_level = 109,
+	.default_dictsize = 8 * Z_EROFS_PCLUSTER_MAX_SIZE,
+	.max_dictsize = Z_EROFS_LZMA_MAX_DICT_SIZE,
 	.init = erofs_compressor_liblzma_init,
 	.exit = erofs_compressor_liblzma_exit,
 	.setlevel = erofs_compressor_liblzma_setlevel,
+	.setdictsize = erofs_compressor_liblzma_setdictsize,
 	.compress_destsize = erofs_liblzma_compress_destsize,
 };
 #endif
diff --git a/lib/config.c b/lib/config.c
index aa3dd1f..1096cd1 100644
--- a/lib/config.c
+++ b/lib/config.c
@@ -62,8 +62,8 @@ void erofs_exit_configure(void)
 		free(cfg.c_img_path);
 	if (cfg.c_src_path)
 		free(cfg.c_src_path);
-	for (i = 0; i < EROFS_MAX_COMPR_CFGS && cfg.c_compr_alg[i]; i++)
-		free(cfg.c_compr_alg[i]);
+	for (i = 0; i < EROFS_MAX_COMPR_CFGS && cfg.c_compr_opts[i].alg; i++)
+		free(cfg.c_compr_opts[i].alg);
 }
 
 static unsigned int fullpath_prefix;	/* root directory prefix length */
diff --git a/lib/inode.c b/lib/inode.c
index bcdb4b8..c6424c0 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -492,7 +492,7 @@ int erofs_write_file(struct erofs_inode *inode, int fd, u64 fpos)
 		return erofs_blob_write_chunked_file(inode, fd, fpos);
 	}
 
-	if (cfg.c_compr_alg[0] && erofs_file_is_compressible(inode)) {
+	if (cfg.c_compr_opts[0].alg && erofs_file_is_compressible(inode)) {
 		ret = erofs_write_compressed_file(inode, fd);
 		if (!ret || ret != -ENOSPC)
 			return ret;
diff --git a/mkfs/main.c b/mkfs/main.c
index 13fea41..acb2108 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -5,6 +5,7 @@
  * Created by Li Guifu <bluce.liguifu at huawei.com>
  */
 #define _GNU_SOURCE
+#include <ctype.h>
 #include <time.h>
 #include <sys/time.h>
 #include <stdlib.h>
@@ -108,24 +109,29 @@ static void usage(int argc, char **argv)
 		" -b#                   set block size to # (# = page size by default)\n"
 		" -d<0-9>               set output verbosity; 0=quiet, 9=verbose (default=%i)\n"
 		" -x#                   set xattr tolerance to # (< 0, disable xattrs; default 2)\n"
-		" -zX[,Y][:...]         X=compressor (Y=compression level, optional)\n"
-		"                       alternative compressors can be separated by colons(:)\n"
-		"                       supported compressors and their level ranges are:\n",
+		" -zX[,level=Y]         X=compressor (Y=compression level, Z=dictionary size, optional)\n"
+		"    [,dictsize=Z]      alternative compressors can be separated by colons(:)\n"
+		"    [:...]             supported compressors and their option ranges are:\n",
 		argv[0], EROFS_WARN);
 	while ((s = z_erofs_list_available_compressors(&i)) != NULL) {
-		printf("                           %s", s->name);
+		char * const spaces = "                         ";
+
+		printf("%s%s\n", spaces, s->name);
 		if (s->c->setlevel) {
 			if (!strcmp(s->name, "lzma"))
 				/* A little kludge to show the range as disjointed
 				 * "0-9,100-109" instead of a continuous "0-109", and to
 				 * state what those two subranges respectively mean.  */
-				printf("[<0-9,100-109>]\t0-9=normal, 100-109=extreme (default=%i)",
-				       s->c->default_level);
+				printf("%s  [,level=<0-9,100-109>]\t0-9=normal, 100-109=extreme (default=%i)\n",
+				       spaces, s->c->default_level);
 			else
-				printf("[,<0-%i>]\t(default=%i)",
-				       s->c->best_level, s->c->default_level);
+				printf("%s  [,level=<0-%i>]\t\t(default=%i)\n",
+				       spaces, s->c->best_level, s->c->default_level);
+		}
+		if (s->c->setdictsize) {
+			printf("%s  [,dictsize=<dictsize>]\t(default=%u, max=%u)\n",
+			       spaces, s->c->default_dictsize, s->c->max_dictsize);
 		}
-		putchar('\n');
 	}
 	printf(
 		" -C#                   specify the size of compress physical cluster in bytes\n"
@@ -304,27 +310,83 @@ handle_fragment:
 	return 0;
 }
 
+static int mkfs_parse_one_compress_alg(char *alg,
+				       struct erofs_compr_opts *copts)
+{
+	char *p, *q, *opt, *endptr;
+
+	copts->level = -1;
+	copts->dict_size = 0;
+
+	p = strchr(alg, ',');
+	if (p) {
+		copts->alg = strndup(alg, p - alg);
+
+		/* support old '-zlzma,9' form */
+		if (isdigit(*(p + 1))) {
+			copts->level = strtol(p + 1, &endptr, 10);
+			if (*endptr && *endptr != ',') {
+				erofs_err("invalid compression level %s",
+					  p + 1);
+				return -EINVAL;
+			}
+			return 0;
+		}
+	} else {
+		copts->alg = strdup(alg);
+		return 0;
+	}
+
+	opt = p + 1;
+	while (opt) {
+		q = strchr(opt, ',');
+		if (q)
+			*q = '\0';
+
+		if ((p = strstr(opt, "level="))) {
+			p += strlen("level=");
+			copts->level = strtol(p, &endptr, 10);
+			if ((endptr == p) || (*endptr && *endptr != ',')) {
+				erofs_err("invalid compression level %s", p);
+				return -EINVAL;
+			}
+		} else if ((p = strstr(opt, "dictsize="))) {
+			p += strlen("dictsize=");
+			copts->dict_size = strtoul(p, &endptr, 10);
+			if (*endptr == 'k' || *endptr == 'K')
+				copts->dict_size <<= 10;
+			else if (*endptr == 'm' || *endptr == 'M')
+				copts->dict_size <<= 20;
+			else if ((endptr == p) || (*endptr && *endptr != ',')) {
+				erofs_err("invalid compression dictsize %s", p);
+				return -EINVAL;
+			}
+		} else {
+			erofs_err("invalid compression option %s", opt);
+			return -EINVAL;
+		}
+
+		opt = q ? q + 1 : NULL;
+	}
+
+	return 0;
+}
+
 static int mkfs_parse_compress_algs(char *algs)
 {
 	unsigned int i;
 	char *s;
+	int ret;
 
 	for (s = strtok(algs, ":"), i = 0; s; s = strtok(NULL, ":"), ++i) {
-		const char *lv;
-
 		if (i >= EROFS_MAX_COMPR_CFGS - 1) {
 			erofs_err("too many algorithm types");
 			return -EINVAL;
 		}
 
-		lv = strchr(s, ',');
-		if (lv) {
-			cfg.c_compr_level[i] = atoi(lv + 1);
-			cfg.c_compr_alg[i] = strndup(s, lv - s);
-		} else {
-			cfg.c_compr_level[i] = -1;
-			cfg.c_compr_alg[i] = strdup(s);
-		}
+		ret = mkfs_parse_one_compress_alg(s, &cfg.c_compr_opts[i]);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
@@ -692,7 +754,7 @@ static int mkfs_parse_options_cfg(int argc, char *argv[])
 		cfg.c_showprogress = false;
 	}
 
-	if (cfg.c_compr_alg[0] && erofs_blksiz(&sbi) != getpagesize())
+	if (cfg.c_compr_opts[0].alg && erofs_blksiz(&sbi) != getpagesize())
 		erofs_warn("Please note that subpage blocksize with compression isn't yet supported in kernel. "
 			   "This compressed image will only work with bs = ps = %u bytes",
 			   erofs_blksiz(&sbi));
@@ -1119,7 +1181,7 @@ int main(int argc, char **argv)
 	}
 
 	if (cfg.c_dedupe) {
-		if (!cfg.c_compr_alg[0]) {
+		if (!cfg.c_compr_opts[0].alg) {
 			erofs_err("Compression is not enabled.  Turn on chunk-based data deduplication instead.");
 			cfg.c_chunkbits = sbi.blkszbits;
 		} else {
-- 
2.43.0



More information about the Linux-erofs mailing list