[PATCH] erofs-utils: dump: add --ext # option to show top# file extensions

Qi Wang mpiglet at outlook.com
Mon Dec 26 14:32:38 AEDT 2022


The --ext=# option lets the user choose how many of the most frequently
occurring file extensions to show, instead of relying on a hard-coded list.

Signed-off-by: Qi Wang <mpiglet at outlook.com>
---
 dump/main.c | 131 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 101 insertions(+), 30 deletions(-)

diff --git a/dump/main.c b/dump/main.c
index 49ff2b7..188ab34 100644
--- a/dump/main.c
+++ b/dump/main.c
@@ -6,6 +6,7 @@
  *            Guo Xuenan <guoxuenan at huawei.com>
  */
 #define _GNU_SOURCE
+#include <string.h>
 #include <stdlib.h>
 #include <getopt.h>
 #include <time.h>
@@ -15,6 +16,7 @@
 #include "erofs/io.h"
 #include "erofs/dir.h"
 #include "../lib/liberofs_private.h"
+#include "erofs/hashmap.h"
 
 #ifdef HAVE_LIBUUID
 #include <uuid.h>
@@ -29,17 +31,34 @@ struct erofsdump_cfg {
 	bool show_subdirectories;
 	erofs_nid_t nid;
 	const char *inode_path;
+	unsigned int show_ext_count;
 };
 static struct erofsdump_cfg dumpcfg;
 
 static const char chart_format[] = "%-16s	%-11d %8.2f%% |%-50s|\n";
 static const char header_format[] = "%-16s %11s %16s |%-50s|\n";
-static char *file_types[] = {
-	".txt", ".so", ".xml", ".apk",
-	".odex", ".vdex", ".oat", ".rc",
-	".otf", ".txt", "others",
+
+struct postfix_statistics {
+	struct hashmap_entry ent;
+	char postfix[16];
+	unsigned int count;
+	unsigned long occupied_size;
+	unsigned long original_size;
 };
-#define OTHERFILETYPE	ARRAY_SIZE(file_types)
+
+static int erofs_postfix_hashmap_cmp(const void *a, const void *b,
+				  const void *key)
+{
+	const struct postfix_statistics *ps1 =
+			container_of((struct hashmap_entry *)a,
+				     struct postfix_statistics, ent);
+	const struct postfix_statistics *ps2 =
+			container_of((struct hashmap_entry *)b,
+				     struct postfix_statistics, ent);
+
+	return strncmp(ps1->postfix, key ? key : ps2->postfix, sizeof(ps1->postfix));
+}
+
 /* (1 << FILE_MAX_SIZE_BITS)KB */
 #define	FILE_MAX_SIZE_BITS	16
 
@@ -65,7 +84,7 @@ struct erofs_statistics {
 	/* [statistics] # of files based on inode_info->flags */
 	unsigned long file_category_stat[EROFS_FT_MAX];
 	/* [statistics] # of files based on file name extensions */
-	unsigned int file_type_stat[OTHERFILETYPE];
+	struct hashmap postfix_hashmap;
 	/* [statistics] # of files based on the original size of files */
 	unsigned int file_original_size[FILE_MAX_SIZE_BITS + 1];
 	/* [statistics] # of files based on the compressed size of files */
@@ -79,6 +98,7 @@ static struct option long_options[] = {
 	{"device", required_argument, NULL, 3},
 	{"path", required_argument, NULL, 4},
 	{"ls", no_argument, NULL, 5},
+	{"ext", required_argument, NULL, 6},
 	{0, 0, 0, 0},
 };
 
@@ -111,6 +131,7 @@ static void usage(void)
 	      " --ls            show directory contents (INODE required)\n"
 	      " --nid=#         show the target inode info of nid #\n"
 	      " --path=X        show the target inode info of path X\n"
+	      " --ext=#         show the top # extension file info\n"
 	      " --help          display this help and exit.\n",
 	      stderr);
 }
@@ -164,6 +185,9 @@ static int erofsdump_parse_options_cfg(int argc, char **argv)
 		case 5:
 			dumpcfg.show_subdirectories = true;
 			break;
+		case 6:
+			dumpcfg.show_ext_count = atoi(optarg);
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -208,20 +232,41 @@ static int erofsdump_get_occupied_size(struct erofs_inode *inode,
 	return 0;
 }
 
-static void inc_file_extension_count(const char *dname, unsigned int len)
+static void inc_file_extension_count(const char *dname, unsigned int len,
+		unsigned long occupied_size, unsigned long original_size)
 {
 	char *postfix = memrchr(dname, '.', len);
-	int type;
+	unsigned int hash, plen;
+	struct postfix_statistics *ps;
+	char pf[sizeof(ps->postfix)] = {0};
+
+	plen = len - (postfix - dname);
+	if (plen > sizeof(ps->postfix))
+		plen = sizeof(ps->postfix);
+	if (postfix) {
+		memcpy(pf, postfix, plen);
+		hash = strhash(pf);
+		ps = hashmap_get_from_hash(&stats.postfix_hashmap, hash, pf);
+		if (ps) {
+			ps->count++;
+			ps->occupied_size += occupied_size;
+			ps->original_size += original_size;
+			return;
+		}
+		ps = malloc(sizeof(struct postfix_statistics));
+		if (!ps) {
+			erofs_err("memory allocation failed!");
+			return;
+		}
 
-	if (!postfix) {
-		type = OTHERFILETYPE - 1;
-	} else {
-		for (type = 0; type < OTHERFILETYPE - 1; ++type)
-			if (!strncmp(postfix, file_types[type],
-				     len - (postfix - dname)))
-				break;
+		ps->count = 1;
+		ps->occupied_size = occupied_size;
+		ps->original_size = original_size;
+		memset(ps->postfix, 0, sizeof(ps->postfix));
+		strncpy(ps->postfix, pf, plen);
+		hashmap_entry_init(&ps->ent, hash);
+		hashmap_add(&stats.postfix_hashmap, ps);
 	}
-	++stats.file_type_stat[type];
 }
 
 static void update_file_size_statatics(erofs_off_t occupied_size,
@@ -298,7 +343,7 @@ static int erofsdump_readdir(struct erofs_dir_context *ctx)
 
 	if (S_ISREG(vi.i_mode)) {
 		stats.files_total_origin_size += vi.i_size;
-		inc_file_extension_count(ctx->dname, ctx->de_namelen);
+		inc_file_extension_count(ctx->dname, ctx->de_namelen, occupied_size, vi.i_size);
 		stats.files_total_size += occupied_size;
 		update_file_size_statatics(occupied_size, vi.i_size);
 	}
@@ -481,27 +526,50 @@ static void erofsdump_filesize_distribution(const char *title,
 	}
 }
 
-static void erofsdump_filetype_distribution(char **file_types, unsigned int len)
+static int comp_postfix_statistics(const void *a, const void *b)
+{
+	const struct postfix_statistics *psa, *psb;
+
+	psa = a;
+	psb = b;
+	return psa->count < psb->count ? 1 :
+		(psa->count > psb->count) ? -1 : 0;
+}
+
+static void erofsdump_filetype_distribution(int topk)
 {
 	char col1[30];
-	unsigned int col2, i;
-	double col3;
+	unsigned int col2, i, pos;
+	double col3, compression_rate;
 	char col4[401];
-
+	struct postfix_statistics *ps_array;
+	struct postfix_statistics *ps;
+	struct hashmap_iter iter;
+
+	pos = 0;
+	ps_array = malloc(sizeof(struct postfix_statistics) * stats.postfix_hashmap.size);
+	hashmap_iter_init(&stats.postfix_hashmap, &iter);
+	while ((ps = hashmap_iter_next(&iter)))
+		ps_array[pos++] = *ps;
+
+	DBG_BUGON(pos != stats.postfix_hashmap.size);
+	qsort(ps_array, pos, sizeof(struct postfix_statistics),
+			comp_postfix_statistics);
 	fprintf(stdout, "\nFile type distribution:\n");
 	fprintf(stdout, header_format, "type", "count", "ratio",
-			"distribution");
-	for (i = 0; i < len; i++) {
+			"compression rate");
+	for (i = 0; i < topk && i < pos; i++) {
 		memset(col1, 0, sizeof(col1));
-		memset(col4, 0, sizeof(col4));
-		sprintf(col1, "%-17s", file_types[i]);
-		col2 = stats.file_type_stat[i];
+		sprintf(col1, "%-17s", ps_array[i].postfix);
+		col2 = ps_array[i].count;
 		if (stats.file_category_stat[EROFS_FT_REG_FILE])
 			col3 = (double)(100 * col2) /
 				stats.file_category_stat[EROFS_FT_REG_FILE];
 		else
 			col3 = 0.0;
-		memset(col4, '#', col3 / 2);
+		compression_rate = 100.0 * (double)ps_array[i].occupied_size /
+				(double)ps_array[i].original_size;
+		sprintf(col4, "%.2f%%", compression_rate);
 		fprintf(stdout, chart_format, col1, col2, col3, col4);
 	}
 }
@@ -543,6 +611,7 @@ static void erofsdump_print_statistic(void)
 		.de_namelen = 0,
 	};
 
+	hashmap_init(&stats.postfix_hashmap, erofs_postfix_hashmap_cmp, 0);
 	err = erofsdump_readdir(&ctx);
 	if (err) {
 		erofs_err("read dir failed");
@@ -555,7 +624,7 @@ static void erofsdump_print_statistic(void)
 	erofsdump_filesize_distribution("On-disk",
 			stats.file_comp_size,
 			ARRAY_SIZE(stats.file_comp_size));
-	erofsdump_filetype_distribution(file_types, OTHERFILETYPE);
+	erofsdump_filetype_distribution(dumpcfg.show_ext_count);
 }
 
 static void erofsdump_show_superblock(void)
@@ -624,9 +693,11 @@ int main(int argc, char **argv)
 	if (dumpcfg.show_superblock)
 		erofsdump_show_superblock();
 
-	if (dumpcfg.show_statistics)
+	if (dumpcfg.show_statistics) {
+		if (dumpcfg.show_ext_count == 0)
+			dumpcfg.show_ext_count = 10;
 		erofsdump_print_statistic();
-
+	}
 	if (dumpcfg.show_extent && !dumpcfg.show_inode) {
 		usage();
 		goto exit_dev_close;
-- 
2.30.2



More information about the Linux-erofs mailing list