[PATCH] erofs-utils: dump: add --ext # option to show top# file extensions
Xiang Gao
hsiangkao at linux.alibaba.com
Wed Jan 4 14:22:22 AEDT 2023
Hi Qi,
On 2022/12/26 11:32, Qi Wang wrote:
> The --ext # option is used to let the user specify top# file extensions
> with higher occurrence, instead of hardcode some file extensions ahead.
>
> Signed-off-by: Qi Wang <mpiglet at outlook.com>
> ---
> dump/main.c | 131 ++++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 101 insertions(+), 30 deletions(-)
>
> diff --git a/dump/main.c b/dump/main.c
> index 49ff2b7..188ab34 100644
> --- a/dump/main.c
> +++ b/dump/main.c
> @@ -6,6 +6,7 @@
> * Guo Xuenan <guoxuenan at huawei.com>
> */
> #define _GNU_SOURCE
> +#include <string.h>
> #include <stdlib.h>
> #include <getopt.h>
> #include <time.h>
> @@ -15,6 +16,7 @@
> #include "erofs/io.h"
> #include "erofs/dir.h"
> #include "../lib/liberofs_private.h"
> +#include "erofs/hashmap.h"
>
> #ifdef HAVE_LIBUUID
> #include <uuid.h>
> @@ -29,17 +31,34 @@ struct erofsdump_cfg {
> bool show_subdirectories;
> erofs_nid_t nid;
> const char *inode_path;
> + unsigned int show_ext_count;
> };
> static struct erofsdump_cfg dumpcfg;
>
> static const char chart_format[] = "%-16s %-11d %8.2f%% |%-50s|\n";
> static const char header_format[] = "%-16s %11s %16s |%-50s|\n";
> -static char *file_types[] = {
> - ".txt", ".so", ".xml", ".apk",
> - ".odex", ".vdex", ".oat", ".rc",
> - ".otf", ".txt", "others",
> +
> +struct postfix_statistics {
Thanks for the patch. However, what does "postfix" mean?
I think erofsdump_extension_statistics would be a better name.
> + struct hashmap_entry ent;
> + char postfix[16];
char suffix[16];
> + unsigned int count;
> + unsigned long occupied_size;
> + unsigned long original_size;
> };
> -#define OTHERFILETYPE ARRAY_SIZE(file_types)
> +
> +static int erofs_postfix_hashmap_cmp(const void *a, const void *b,
> + const void *key)
> +{
> + const struct postfix_statistics *ps1 =
> + container_of((struct hashmap_entry *)a,
> + struct postfix_statistics, ent);
> + const struct postfix_statistics *ps2 =
> + container_of((struct hashmap_entry *)b,
> + struct postfix_statistics, ent);
> +
> + return strncmp(ps1->postfix, key ? key : ps2->postfix, sizeof(ps1->postfix));
> +}
> +
> /* (1 << FILE_MAX_SIZE_BITS)KB */
> #define FILE_MAX_SIZE_BITS 16
>
> @@ -65,7 +84,7 @@ struct erofs_statistics {
> /* [statistics] # of files based on inode_info->flags */
> unsigned long file_category_stat[EROFS_FT_MAX];
> /* [statistics] # of files based on file name extensions */
> - unsigned int file_type_stat[OTHERFILETYPE];
> + struct hashmap postfix_hashmap;
> /* [statistics] # of files based on the original size of files */
> unsigned int file_original_size[FILE_MAX_SIZE_BITS + 1];
> /* [statistics] # of files based on the compressed size of files */
> @@ -79,6 +98,7 @@ static struct option long_options[] = {
> {"device", required_argument, NULL, 3},
> {"path", required_argument, NULL, 4},
> {"ls", no_argument, NULL, 5},
> + {"ext", required_argument, NULL, 6},
{"extensions" ... } ?
> {0, 0, 0, 0},
> };
>
> @@ -111,6 +131,7 @@ static void usage(void)
> " --ls show directory contents (INODE required)\n"
> " --nid=# show the target inode info of nid #\n"
> " --path=X show the target inode info of path X\n"
> + " --ext=# show the top # extension file info\n"
show the file statistics for the top # extensions?
> " --help display this help and exit.\n",
> stderr);
> }
> @@ -164,6 +185,9 @@ static int erofsdump_parse_options_cfg(int argc, char **argv)
> case 5:
> dumpcfg.show_subdirectories = true;
> break;
> + case 6:
> + dumpcfg.show_ext_count = atoi(optarg);
> + break;
> default:
> return -EINVAL;
> }
> @@ -208,20 +232,41 @@ static int erofsdump_get_occupied_size(struct erofs_inode *inode,
> return 0;
> }
>
> -static void inc_file_extension_count(const char *dname, unsigned int len)
> +static void inc_file_extension_count(const char *dname, unsigned int len,
> + unsigned long occupied_size, unsigned long original_size)
> {
> char *postfix = memrchr(dname, '.', len);
> - int type;
> + unsigned int hash, plen;
> + struct postfix_statistics *ps;
> + char pf[sizeof(ps->postfix)] = {0};
> +
> + plen = len - (postfix - dname);
> + if (plen > sizeof(ps->postfix))
> + plen = sizeof(ps->postfix);
Do we need to have one entry for files without an extension?
> + if (postfix) {
> + memcpy(pf, postfix, plen);
> + hash = strhash(pf);
> + ps = hashmap_get_from_hash(&stats.postfix_hashmap, hash, pf);
> + if (ps) {
> + ps->count++;
> + ps->occupied_size += occupied_size;
> + ps->original_size += original_size;
> + return;
> + }
> + ps = malloc(sizeof(struct postfix_statistics));
> + if (!ps) {
> + erofs_err("memory allocation failed!");
> + return;
> + }
>
> - if (!postfix) {
> - type = OTHERFILETYPE - 1;
> - } else {
> - for (type = 0; type < OTHERFILETYPE - 1; ++type)
> - if (!strncmp(postfix, file_types[type],
> - len - (postfix - dname)))
> - break;
> + ps->count = 1;
> + ps->occupied_size = occupied_size;
> + ps->original_size = original_size;
> + memset(ps->postfix, 0, sizeof(ps->postfix));
> + strncpy(ps->postfix, pf, plen);
> + hashmap_entry_init(&ps->ent, hash);
> + hashmap_add(&stats.postfix_hashmap, ps);
> }
> - ++stats.file_type_stat[type];
> }
>
> static void update_file_size_statatics(erofs_off_t occupied_size,
> @@ -298,7 +343,7 @@ static int erofsdump_readdir(struct erofs_dir_context *ctx)
>
> if (S_ISREG(vi.i_mode)) {
> stats.files_total_origin_size += vi.i_size;
> - inc_file_extension_count(ctx->dname, ctx->de_namelen);
> + inc_file_extension_count(ctx->dname, ctx->de_namelen, occupied_size, vi.i_size);
> stats.files_total_size += occupied_size;
> update_file_size_statatics(occupied_size, vi.i_size);
> }
> @@ -481,27 +526,50 @@ static void erofsdump_filesize_distribution(const char *title,
> }
> }
>
> -static void erofsdump_filetype_distribution(char **file_types, unsigned int len)
> +static int comp_postfix_statistics(const void *a, const void *b)
> +{
> + const struct postfix_statistics *psa, *psb;
> +
> + psa = a;
> + psb = b;
> + return psa->count < psb->count ? 1 :
> + (psa->count > psb->count) ? -1 : 0;
> +}
> +
> +static void erofsdump_filetype_distribution(int topk)
> {
> char col1[30];
> - unsigned int col2, i;
> - double col3;
> + unsigned int col2, i, pos;
> + double col3, compression_rate;
> char col4[401];
> -
> + struct postfix_statistics *ps_array;
> + struct postfix_statistics *ps;
> + struct hashmap_iter iter;
> +
> + pos = 0;
> + ps_array = malloc(sizeof(struct postfix_statistics) * stats.postfix_hashmap.size);
> + hashmap_iter_init(&stats.postfix_hashmap, &iter);
> + while ((ps = hashmap_iter_next(&iter)))
> + ps_array[pos++] = *ps;
> +
> + DBG_BUGON(pos != stats.postfix_hashmap.size);
> + qsort(ps_array, pos, sizeof(struct postfix_statistics),
> + comp_postfix_statistics);
> fprintf(stdout, "\nFile type distribution:\n");
> fprintf(stdout, header_format, "type", "count", "ratio",
> - "distribution");
> - for (i = 0; i < len; i++) {
> + "compression rate");
> + for (i = 0; i < topk && i < pos; i++) {
> memset(col1, 0, sizeof(col1));
> - memset(col4, 0, sizeof(col4));
> - sprintf(col1, "%-17s", file_types[i]);
> - col2 = stats.file_type_stat[i];
> + sprintf(col1, "%-17s", ps_array[i].postfix);
> + col2 = ps_array[i].count;
> if (stats.file_category_stat[EROFS_FT_REG_FILE])
> col3 = (double)(100 * col2) /
> stats.file_category_stat[EROFS_FT_REG_FILE];
> else
> col3 = 0.0;
> - memset(col4, '#', col3 / 2);
> + compression_rate = 100.0 * (double)ps_array[i].occupied_size /
> + (double)ps_array[i].original_size;
> + sprintf(col4, "%.2f%%", compression_rate);
> fprintf(stdout, chart_format, col1, col2, col3, col4);
> }
> }
> @@ -543,6 +611,7 @@ static void erofsdump_print_statistic(void)
> .de_namelen = 0,
> };
>
> + hashmap_init(&stats.postfix_hashmap, erofs_postfix_hashmap_cmp, 0);
> err = erofsdump_readdir(&ctx);
> if (err) {
> erofs_err("read dir failed");
> @@ -555,7 +624,7 @@ static void erofsdump_print_statistic(void)
> erofsdump_filesize_distribution("On-disk",
> stats.file_comp_size,
> ARRAY_SIZE(stats.file_comp_size));
> - erofsdump_filetype_distribution(file_types, OTHERFILETYPE);
> + erofsdump_filetype_distribution(dumpcfg.show_ext_count);
> }
>
> static void erofsdump_show_superblock(void)
> @@ -624,9 +693,11 @@ int main(int argc, char **argv)
> if (dumpcfg.show_superblock)
> erofsdump_show_superblock();
>
> - if (dumpcfg.show_statistics)
> + if (dumpcfg.show_statistics) {
> + if (dumpcfg.show_ext_count == 0)
> + dumpcfg.show_ext_count = 10;
> erofsdump_print_statistic();
> -
> + }
Also, do we have some logic to free the hashmap and the extension entries?
Thanks,
Gao Xiang
> if (dumpcfg.show_extent && !dumpcfg.show_inode) {
> usage();
> goto exit_dev_close;
More information about the Linux-erofs
mailing list