[PATCH v2] erofs-utils: dump: add --ext # option to show file statistics for the top # extensions
Qi Wang
mpiglet at outlook.com
Sat Jan 7 19:50:05 AEDT 2023
The --ext # option lets the user choose how many of the most frequently
occurring file extensions (the top #) to show statistics for, instead of
relying on a hard-coded list of file extensions.
Signed-off-by: Qi Wang <mpiglet at outlook.com>
---
dump/main.c | 144 ++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 111 insertions(+), 33 deletions(-)
diff --git a/dump/main.c b/dump/main.c
index 49ff2b7..24674d6 100644
--- a/dump/main.c
+++ b/dump/main.c
@@ -6,6 +6,7 @@
* Guo Xuenan <guoxuenan at huawei.com>
*/
#define _GNU_SOURCE
+#include <string.h>
#include <stdlib.h>
#include <getopt.h>
#include <time.h>
@@ -14,6 +15,7 @@
#include "erofs/inode.h"
#include "erofs/io.h"
#include "erofs/dir.h"
+#include "erofs/hashmap.h"
#include "../lib/liberofs_private.h"
#ifdef HAVE_LIBUUID
@@ -29,17 +31,35 @@ struct erofsdump_cfg {
bool show_subdirectories;
erofs_nid_t nid;
const char *inode_path;
+ unsigned int show_ext_count;
};
static struct erofsdump_cfg dumpcfg;
static const char chart_format[] = "%-16s %-11d %8.2f%% |%-50s|\n";
static const char header_format[] = "%-16s %11s %16s |%-50s|\n";
-static char *file_types[] = {
- ".txt", ".so", ".xml", ".apk",
- ".odex", ".vdex", ".oat", ".rc",
- ".otf", ".txt", "others",
+
+static const char no_ext[] = "no extensions";
+/* Per-extension file statistics, stored in stats.suffix_hashmap. */
+struct erofsdump_extension_statistics {
+ /* hashmap linkage; kept first so the entry and the struct share an address */
+ struct hashmap_entry ent;
+ /* file name extension starting at the last '.', or no_ext */
+ char suffix[16];
+ /* number of regular files seen with this extension */
+ unsigned int count;
+ /*
+ * Accumulated on-disk and original sizes.  These use erofs_off_t (as
+ * update_file_size_statatics() does) rather than unsigned long, which
+ * is only 32 bits wide on 32-bit hosts and would wrap once the files
+ * of one extension add up to more than 4GiB.
+ */
+ erofs_off_t occupied_size;
+ erofs_off_t original_size;
};
-#define OTHERFILETYPE ARRAY_SIZE(file_types)
+
+/*
+ * Comparison callback handed to hashmap_init() for stats.suffix_hashmap.
+ *
+ * @a and @b point at the hashmap_entry embedded in two extension statistics
+ * records.  When @key is non-NULL it is taken as the suffix string to compare
+ * @a against; otherwise the suffix stored in @b is used.  Returns the
+ * strncmp() result over the size of the suffix field (0 means equal).
+ */
+static int erofs_suffix_hashmap_cmp(const void *a, const void *b,
+ const void *key)
+{
+ const struct erofsdump_extension_statistics *lhs, *rhs;
+ const char *other;
+
+ lhs = container_of((struct hashmap_entry *)a,
+ struct erofsdump_extension_statistics, ent);
+ rhs = container_of((struct hashmap_entry *)b,
+ struct erofsdump_extension_statistics, ent);
+
+ /* an explicit lookup key takes precedence over the second entry */
+ other = key ? key : rhs->suffix;
+ return strncmp(lhs->suffix, other, sizeof(lhs->suffix));
+}
+
/* (1 << FILE_MAX_SIZE_BITS)KB */
#define FILE_MAX_SIZE_BITS 16
@@ -65,7 +85,7 @@ struct erofs_statistics {
/* [statistics] # of files based on inode_info->flags */
unsigned long file_category_stat[EROFS_FT_MAX];
/* [statistics] # of files based on file name extensions */
- unsigned int file_type_stat[OTHERFILETYPE];
+ struct hashmap suffix_hashmap;
/* [statistics] # of files based on the original size of files */
unsigned int file_original_size[FILE_MAX_SIZE_BITS + 1];
/* [statistics] # of files based on the compressed size of files */
@@ -79,6 +99,7 @@ static struct option long_options[] = {
{"device", required_argument, NULL, 3},
{"path", required_argument, NULL, 4},
{"ls", no_argument, NULL, 5},
+ /* spelled exactly as documented in usage(): "--ext=#" */
+ {"ext", required_argument, NULL, 6},
{0, 0, 0, 0},
};
@@ -111,6 +132,7 @@ static void usage(void)
" --ls show directory contents (INODE required)\n"
" --nid=# show the target inode info of nid #\n"
" --path=X show the target inode info of path X\n"
+ " --ext=# show the file statistics for the top # extensions\n"
" --help display this help and exit.\n",
stderr);
}
@@ -164,6 +186,9 @@ static int erofsdump_parse_options_cfg(int argc, char **argv)
case 5:
dumpcfg.show_subdirectories = true;
break;
+ case 6:
+ dumpcfg.show_ext_count = atoi(optarg);
+ break;
default:
return -EINVAL;
}
@@ -208,20 +233,45 @@ static int erofsdump_get_occupied_size(struct erofs_inode *inode,
return 0;
}
-static void inc_file_extension_count(const char *dname, unsigned int len)
+/*
+ * Account one regular file under its file name extension.
+ *
+ * The extension is the part of the first @len bytes of @dname that starts at
+ * the last '.'; names without a '.' are grouped under no_ext.  Extensions
+ * that do not fit the suffix field are truncated.  The first file of an
+ * extension allocates a new record in stats.suffix_hashmap; later files bump
+ * its count and add @occupied_size / @original_size to its totals.  If the
+ * allocation fails, an error is logged and the file is left uncounted.
+ */
+static void inc_file_extension_count(const char *dname, unsigned int len,
+ unsigned long occupied_size, unsigned long original_size)
{
- char *postfix = memrchr(dname, '.', len);
- int type;
-
- if (!postfix) {
- type = OTHERFILETYPE - 1;
+ unsigned int hash, sf_len;
+ struct erofsdump_extension_statistics *es;
+ char sf[sizeof(es->suffix)] = {0};
+ const char *suffix = memrchr(dname, '.', len);
+
+ if (suffix) {
+ sf_len = len - (suffix - dname);
+ /*
+ * Truncate long extensions but always keep the last byte as a NUL
+ * terminator: strhash() is handed no length, and the suffix is
+ * printed with "%s" later on.
+ */
+ if (sf_len > sizeof(es->suffix) - 1)
+ sf_len = sizeof(es->suffix) - 1;
} else {
- for (type = 0; type < OTHERFILETYPE - 1; ++type)
- if (!strncmp(postfix, file_types[type],
- len - (postfix - dname)))
- break;
+ /* no_ext is shorter than the buffer, so it needs no clamping */
+ suffix = no_ext;
+ sf_len = strlen(no_ext);
}
- ++stats.file_type_stat[type];
+
+ memcpy(sf, suffix, sf_len);
+ hash = strhash(sf);
+ es = hashmap_get_from_hash(&stats.suffix_hashmap, hash, sf);
+ if (es) {
+ es->count++;
+ es->occupied_size += occupied_size;
+ es->original_size += original_size;
+ return;
+ }
+ es = malloc(sizeof(*es));
+ if (!es) {
+ erofs_err("memory allocation failed!");
+ return;
+ }
+
+ es->count = 1;
+ es->occupied_size = occupied_size;
+ es->original_size = original_size;
+ /* sf is zero-padded to its full size, so this also terminates suffix */
+ memcpy(es->suffix, sf, sizeof(es->suffix));
+ hashmap_entry_init(&es->ent, hash);
+ hashmap_add(&stats.suffix_hashmap, es);
}
static void update_file_size_statatics(erofs_off_t occupied_size,
@@ -298,12 +348,12 @@ static int erofsdump_readdir(struct erofs_dir_context *ctx)
if (S_ISREG(vi.i_mode)) {
stats.files_total_origin_size += vi.i_size;
- inc_file_extension_count(ctx->dname, ctx->de_namelen);
+ inc_file_extension_count(ctx->dname, ctx->de_namelen, occupied_size, vi.i_size);
stats.files_total_size += occupied_size;
update_file_size_statatics(occupied_size, vi.i_size);
}
- /* XXXX: the dir depth should be restricted in order to avoid loops */
+ /* XXXX: the dir depth should be restricted in order to avoid loops */
if (S_ISDIR(vi.i_mode)) {
struct erofs_dir_context nctx = {
.flags = ctx->dir ? EROFS_READDIR_VALID_PNID : 0,
@@ -481,27 +531,50 @@ static void erofsdump_filesize_distribution(const char *title,
}
}
-static void erofsdump_filetype_distribution(char **file_types, unsigned int len)
+/*
+ * qsort() comparator for struct erofsdump_extension_statistics: orders the
+ * records by file count, largest first.  Records with equal counts compare
+ * as equal.
+ */
+static int comp_erofsdump_extension_statistics(const void *a, const void *b)
+{
+ const struct erofsdump_extension_statistics *lhs = a;
+ const struct erofsdump_extension_statistics *rhs = b;
+
+ if (lhs->count < rhs->count)
+ return 1;
+ if (lhs->count > rhs->count)
+ return -1;
+ return 0;
+}
+
+/*
+ * Print the per-extension statistics table for the @topk extensions with
+ * the most files.
+ *
+ * The records of stats.suffix_hashmap are copied into a temporary array and
+ * sorted by descending file count.  Each printed row shows the extension,
+ * its file count, its share of all regular files and its compression rate
+ * (on-disk size as a percentage of the original size).  If the temporary
+ * array cannot be allocated, an error is logged and nothing is printed.
+ */
+static void erofsdump_filetype_distribution(int topk)
{
char col1[30];
- unsigned int col2, i;
- double col3;
+ unsigned int col2, i, pos, nr;
+ double col3, compression_rate;
char col4[401];
-
+ struct erofsdump_extension_statistics *es_arr;
+ struct erofsdump_extension_statistics *es;
+ struct hashmap_iter iter;
+
+ pos = 0;
+ nr = stats.suffix_hashmap.size;
+ /*
+ * calloc() guards the element count multiplication against overflow;
+ * ask for at least one element so that a NULL result always means
+ * failure, even when no extension has been recorded.
+ */
+ es_arr = calloc(nr ? nr : 1, sizeof(*es_arr));
+ if (!es_arr) {
+ erofs_err("memory allocation failed!");
+ return;
+ }
+ hashmap_iter_init(&stats.suffix_hashmap, &iter);
+ while ((es = hashmap_iter_next(&iter)))
+ es_arr[pos++] = *es;
+
+ DBG_BUGON(pos != nr);
+ qsort(es_arr, pos, sizeof(*es_arr),
+ comp_erofsdump_extension_statistics);
fprintf(stdout, "\nFile type distribution:\n");
fprintf(stdout, header_format, "type", "count", "ratio",
- "distribution");
- for (i = 0; i < len; i++) {
+ "compression rate");
+ for (i = 0; i < (unsigned int)topk && i < pos; i++) {
memset(col1, 0, sizeof(col1));
- memset(col4, 0, sizeof(col4));
- sprintf(col1, "%-17s", file_types[i]);
- col2 = stats.file_type_stat[i];
+ snprintf(col1, sizeof(col1), "%-17s", es_arr[i].suffix);
+ col2 = es_arr[i].count;
if (stats.file_category_stat[EROFS_FT_REG_FILE])
col3 = (double)(100 * col2) /
stats.file_category_stat[EROFS_FT_REG_FILE];
else
col3 = 0.0;
- memset(col4, '#', col3 / 2);
+ /* only empty files: report 0% instead of dividing by zero */
+ if (es_arr[i].original_size)
+ compression_rate = 100.0 * (double)es_arr[i].occupied_size /
+ (double)es_arr[i].original_size;
+ else
+ compression_rate = 0.0;
+ snprintf(col4, sizeof(col4), "%.2f%%", compression_rate);
fprintf(stdout, chart_format, col1, col2, col3, col4);
}
+ free(es_arr);
}
@@ -543,10 +616,11 @@ static void erofsdump_print_statistic(void)
.de_namelen = 0,
};
+ hashmap_init(&stats.suffix_hashmap, erofs_suffix_hashmap_cmp, 0);
err = erofsdump_readdir(&ctx);
if (err) {
erofs_err("read dir failed");
- return;
+ goto exit;
}
erofsdump_file_statistic();
erofsdump_filesize_distribution("Original",
@@ -555,7 +629,9 @@ static void erofsdump_print_statistic(void)
erofsdump_filesize_distribution("On-disk",
stats.file_comp_size,
ARRAY_SIZE(stats.file_comp_size));
- erofsdump_filetype_distribution(file_types, OTHERFILETYPE);
+ erofsdump_filetype_distribution(dumpcfg.show_ext_count);
+exit:
+ hashmap_free(&stats.suffix_hashmap, 1);
}
static void erofsdump_show_superblock(void)
@@ -624,9 +700,11 @@ int main(int argc, char **argv)
if (dumpcfg.show_superblock)
erofsdump_show_superblock();
- if (dumpcfg.show_statistics)
+ if (dumpcfg.show_statistics) {
+ if (dumpcfg.show_ext_count == 0)
+ dumpcfg.show_ext_count = 10;
erofsdump_print_statistic();
-
+ }
if (dumpcfg.show_extent && !dumpcfg.show_inode) {
usage();
goto exit_dev_close;
--
2.30.2
More information about the Linux-erofs
mailing list