[PATCH] erofs-utils: mkfs: support tgz streams for tarerofs

Gao Xiang hsiangkao at linux.alibaba.com
Wed Sep 20 04:59:47 AEST 2023


Introduce iostream to wrap up the input tarball stream for tarerofs.

Besides, add built-in tgz support if zlib is linked into mkfs.

Signed-off-by: Gao Xiang <hsiangkao at linux.alibaba.com>
---
 include/erofs/tar.h |  21 +++
 lib/tar.c           | 309 ++++++++++++++++++++++++++++++++------------
 mkfs/main.c         |  34 ++++-
 3 files changed, 278 insertions(+), 86 deletions(-)

diff --git a/include/erofs/tar.h b/include/erofs/tar.h
index b50db1d..a76f740 100644
--- a/include/erofs/tar.h
+++ b/include/erofs/tar.h
@@ -7,6 +7,9 @@ extern "C"
 {
 #endif
 
+#if defined(HAVE_ZLIB)
+#include <zlib.h>
+#endif
 #include <sys/stat.h>
 
 #include "internal.h"
@@ -21,8 +24,24 @@ struct erofs_pax_header {
 	char *path, *link;
 };
 
+#define EROFS_IOS_DECODER_NONE		0
+#define EROFS_IOS_DECODER_GZIP		1
+
+struct erofs_iostream {
+	union {
+		int fd;			/* original fd */
+		void *handler;
+	};
+	u64 sz;
+	char *buffer;
+	unsigned int head, tail, bufsize;
+	int decoder;
+	bool feof;
+};
+
 struct erofs_tarfile {
 	struct erofs_pax_header global;
+	struct erofs_iostream ios;
 	char *mapfile;
 
 	int fd;
@@ -30,6 +49,8 @@ struct erofs_tarfile {
 	bool index_mode, aufs;
 };
 
+void erofs_iostream_close(struct erofs_iostream *ios);
+int erofs_iostream_open(struct erofs_iostream *ios, int fd, int decoder);
 int tarerofs_parse_tar(struct erofs_inode *root, struct erofs_tarfile *tar);
 
 #ifdef __cplusplus
diff --git a/lib/tar.c b/lib/tar.c
index f6320b0..8e71f11 100644
--- a/lib/tar.c
+++ b/lib/tar.c
@@ -3,6 +3,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
+#if defined(HAVE_ZLIB)
+#include <zlib.h>
+#endif
 #include "erofs/print.h"
 #include "erofs/cache.h"
 #include "erofs/diskbuf.h"
@@ -14,8 +17,6 @@
 #include "erofs/blobchunk.h"
 #include "erofs/rebuild.h"
 
-static char erofs_libbuf[16384];
-
 struct tar_header {
 	char name[100];		/*   0-99 */
 	char mode[8];		/* 100-107 */
@@ -60,35 +61,165 @@ s64 erofs_read_from_fd(int fd, void *buf, u64 bytes)
         return i;
 }
 
-/*
- * skip this many bytes of input. Return 0 for success, >0 means this much
- * left after input skipped.
- */
-u64 erofs_lskip(int fd, u64 sz)
+void erofs_iostream_close(struct erofs_iostream *ios)
+{
+	free(ios->buffer);
+	if (ios->decoder == EROFS_IOS_DECODER_GZIP) {
+#if defined(HAVE_ZLIB)
+		gzclose(ios->handler);
+#endif
+		return;
+	}
+	close(ios->fd);
+}
+
+int erofs_iostream_open(struct erofs_iostream *ios, int fd, int decoder)
 {
-	s64 cur = lseek(fd, 0, SEEK_CUR);
+	s64 fsz;
+
+	ios->tail = ios->head = 0;
+	ios->decoder = decoder;
+	if (decoder == EROFS_IOS_DECODER_GZIP) {
+#if defined(HAVE_ZLIB)
+		ios->handler = gzdopen(fd, "r");
+		if (!ios->handler)
+			return -ENOMEM;
+		ios->sz = fsz = 0;
+		ios->bufsize = 32768;
+#else
+		return -EOPNOTSUPP;
+#endif
+	} else {
+		ios->fd = fd;
+		fsz = lseek(fd, 0, SEEK_END);
+		if (fsz <= 0) {
+			ios->feof = !fsz;
+			ios->sz = 0;
+		} else {
+			ios->feof = false;
+			ios->sz = fsz;
+			if (lseek(fd, 0, SEEK_SET))
+				return -EIO;
+			if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL))
+				erofs_warn("failed to fadvise: %s, ignored.",
+					   erofs_strerror(errno));
+		}
+		ios->bufsize = 16384;
+	}
+
+	do {
+		ios->buffer = malloc(ios->bufsize);
+		if (ios->buffer)
+			break;
+		ios->bufsize >>= 1;
+	} while (ios->bufsize >= 1024);
 
-	if (cur >= 0) {
-		s64 end = lseek(fd, 0, SEEK_END) - cur;
+	if (!ios->buffer)
+		return -ENOMEM;
+	return 0;
+}
+
+int erofs_iostream_read(struct erofs_iostream *ios, void **buf, u64 bytes)
+{
+	unsigned int rabytes = ios->tail - ios->head;
+	int ret;
 
-		if (end > 0 && end < sz)
-			return sz - end;
+	if (rabytes >= bytes) {
+		*buf = ios->buffer + ios->head;
+		ios->head += bytes;
+		return bytes;
+	}
 
-		end = cur + sz;
-		if (end == lseek(fd, end, SEEK_SET))
-			return 0;
+	if (ios->head) {
+		memmove(ios->buffer, ios->buffer + ios->head, rabytes);
+		ios->head = 0;
+		ios->tail = rabytes;
 	}
 
-	while (sz) {
-		int try = min_t(u64, sz, sizeof(erofs_libbuf));
-		int or;
+	if (!ios->feof) {
+		if (ios->decoder == EROFS_IOS_DECODER_GZIP) {
+#if defined(HAVE_ZLIB)
+			ret = gzread(ios->handler, ios->buffer + rabytes,
+				     ios->bufsize - rabytes);
+			if (!ret) {
+				int errnum;
+				const char *errstr;
 
-		or = read(fd, erofs_libbuf, try);
-		if (or <= 0)
-			break;
-		else
-			sz -= or;
+				errstr = gzerror(ios->handler, &errnum);
+				if (errnum != Z_STREAM_END) {
+					erofs_err("failed to gzread: %s", errstr);
+					return -EIO;
+				}
+				ios->feof = true;
+			}
+			ios->tail += ret;
+#else
+			return -EOPNOTSUPP;
+#endif
+		} else {
+			ret = erofs_read_from_fd(ios->fd, ios->buffer + rabytes,
+						 ios->bufsize - rabytes);
+			if (ret < 0)
+				return ret;
+			ios->tail += ret;
+			if (ret < ios->bufsize - rabytes)
+				ios->feof = true;
+		}
 	}
+	*buf = ios->buffer;
+	ret = min_t(int, ios->tail, bytes);
+	ios->head = ret;
+	return ret;
+}
+
+int erofs_iostream_bread(struct erofs_iostream *ios, void *buf, u64 bytes)
+{
+	u64 rem = bytes;
+	void *src;
+	int ret;
+
+	do {
+		ret = erofs_iostream_read(ios, &src, rem);
+		if (ret < 0)
+			return ret;
+		memcpy(buf, src, ret);
+		rem -= ret;
+	} while (rem && ret);
+
+	return rem;
+}
+
+int erofs_iostream_lskip(struct erofs_iostream *ios, u64 sz)
+{
+	unsigned int rabytes = ios->tail - ios->head;
+	int ret;
+	void *dummy;
+
+	if (rabytes >= sz) {
+		ios->head += sz;
+		return 0;
+	}
+
+	sz -= rabytes;
+	ios->head = ios->tail = 0;
+	if (ios->feof)
+		return sz;
+
+	if (ios->sz) {
+		s64 cur = lseek(ios->fd, sz, SEEK_CUR);
+
+		if (cur > ios->sz)
+			return cur - ios->sz;
+		return 0;
+	}
+
+	do {
+		ret = erofs_iostream_read(ios, &dummy, sz);
+		if (ret < 0)
+			return ret;
+		sz -= ret;
+	} while (!(ios->feof || !ret || !sz));
+
 	return sz;
 }
 
@@ -251,7 +382,8 @@ static int base64_decode(const char *src, int len, u8 *dst)
 	return cp - dst;
 }
 
-int tarerofs_parse_pax_header(int fd, struct erofs_pax_header *eh, u32 size)
+int tarerofs_parse_pax_header(struct erofs_iostream *ios,
+			      struct erofs_pax_header *eh, u32 size)
 {
 	char *buf, *p;
 	int ret;
@@ -261,7 +393,7 @@ int tarerofs_parse_pax_header(int fd, struct erofs_pax_header *eh, u32 size)
 		return -ENOMEM;
 	p = buf;
 
-	ret = erofs_read_from_fd(fd, buf, size);
+	ret = erofs_iostream_bread(ios, buf, size);
 	if (ret != size)
 		goto out;
 
@@ -407,10 +539,10 @@ void tarerofs_remove_inode(struct erofs_inode *inode)
 static int tarerofs_write_file_data(struct erofs_inode *inode,
 				    struct erofs_tarfile *tar)
 {
-	unsigned int j, rem;
-	int fd;
+	unsigned int j;
+	void *buf;
+	int fd, nread;
 	u64 off;
-	char buf[65536];
 
 	if (!inode->i_diskbuf) {
 		inode->i_diskbuf = calloc(1, sizeof(*inode->i_diskbuf));
@@ -425,12 +557,14 @@ static int tarerofs_write_file_data(struct erofs_inode *inode,
 		return -EBADF;
 
 	for (j = inode->i_size; j; ) {
-		rem = min_t(unsigned int, sizeof(buf), j);
-
-		if (erofs_read_from_fd(tar->fd, buf, rem) != rem ||
-		    write(fd, buf, rem) != rem)
-			return -EIO;
-		j -= rem;
+		nread = erofs_iostream_read(&tar->ios, &buf, j);
+		if (nread < 0)
+			break;
+		if (write(fd, buf, nread) != nread) {
+			nread = -EIO;
+			break;
+		}
+		j -= nread;
 	}
 	erofs_diskbuf_commit(inode->i_diskbuf, inode->i_size);
 	inode->with_diskbuf = true;
@@ -445,7 +579,7 @@ static int tarerofs_write_file_index(struct erofs_inode *inode,
 	ret = tarerofs_write_chunkes(inode, data_offset);
 	if (ret)
 		return ret;
-	if (erofs_lskip(tar->fd, inode->i_size))
+	if (erofs_iostream_lskip(&tar->ios, inode->i_size))
 		return -EIO;
 	return 0;
 }
@@ -459,7 +593,7 @@ int tarerofs_parse_tar(struct erofs_inode *root, struct erofs_tarfile *tar)
 	struct stat st;
 	erofs_off_t tar_offset, data_offset;
 
-	struct tar_header th;
+	struct tar_header *th;
 	struct erofs_dentry *d;
 	struct erofs_inode *inode;
 	unsigned int j, csum, cksum;
@@ -474,7 +608,7 @@ int tarerofs_parse_tar(struct erofs_inode *root, struct erofs_tarfile *tar)
 restart:
 	rem = tar->offset & 511;
 	if (rem) {
-		if (erofs_lskip(tar->fd, 512 - rem)) {
+		if (erofs_iostream_lskip(&tar->ios, 512 - rem)) {
 			ret = -EIO;
 			goto out;
 		}
@@ -482,11 +616,14 @@ restart:
 	}
 
 	tar_offset = tar->offset;
-	ret = erofs_read_from_fd(tar->fd, &th, sizeof(th));
-	if (ret != sizeof(th))
+	ret = erofs_iostream_read(&tar->ios, (void **)&th, sizeof(*th));
+	if (ret != sizeof(*th)) {
+		erofs_err("failed to read header block @ %llu", tar_offset);
+		ret = -EIO;
 		goto out;
-	tar->offset += sizeof(th);
-	if (*th.name == '\0') {
+	}
+	tar->offset += sizeof(*th);
+	if (*th->name == '\0') {
 		if (e) {	/* end of tar 2 empty blocks */
 			ret = 1;
 			goto out;
@@ -495,14 +632,14 @@ restart:
 		goto restart;
 	}
 
-	if (strncmp(th.magic, "ustar", 5)) {
+	if (memcmp(th->magic, "ustar", 5)) {
 		erofs_err("invalid tar magic @ %llu", tar_offset);
 		ret = -EIO;
 		goto out;
 	}
 
 	/* chksum field itself treated as ' ' */
-	csum = tarerofs_otoi(th.chksum, sizeof(th.chksum));
+	csum = tarerofs_otoi(th->chksum, sizeof(th->chksum));
 	if (errno) {
 		erofs_err("invalid chksum @ %llu", tar_offset);
 		ret = -EBADMSG;
@@ -513,12 +650,12 @@ restart:
 		cksum += (unsigned int)' ';
 	ckksum = cksum;
 	for (j = 0; j < 148; ++j) {
-		cksum += (unsigned int)((u8*)&th)[j];
-		ckksum += (int)((char*)&th)[j];
+		cksum += (unsigned int)((u8*)th)[j];
+		ckksum += (int)((char*)th)[j];
 	}
 	for (j = 156; j < 500; ++j) {
-		cksum += (unsigned int)((u8*)&th)[j];
-		ckksum += (int)((char*)&th)[j];
+		cksum += (unsigned int)((u8*)th)[j];
+		ckksum += (int)((char*)th)[j];
 	}
 	if (csum != cksum && csum != ckksum) {
 		erofs_err("chksum mismatch @ %llu", tar_offset);
@@ -526,14 +663,14 @@ restart:
 		goto out;
 	}
 
-	st.st_mode = tarerofs_otoi(th.mode, sizeof(th.mode));
+	st.st_mode = tarerofs_otoi(th->mode, sizeof(th->mode));
 	if (errno)
 		goto invalid_tar;
 
 	if (eh.use_uid) {
 		st.st_uid = eh.st.st_uid;
 	} else {
-		st.st_uid = tarerofs_parsenum(th.uid, sizeof(th.uid));
+		st.st_uid = tarerofs_parsenum(th->uid, sizeof(th->uid));
 		if (errno)
 			goto invalid_tar;
 	}
@@ -541,7 +678,7 @@ restart:
 	if (eh.use_gid) {
 		st.st_gid = eh.st.st_gid;
 	} else {
-		st.st_gid = tarerofs_parsenum(th.gid, sizeof(th.gid));
+		st.st_gid = tarerofs_parsenum(th->gid, sizeof(th->gid));
 		if (errno)
 			goto invalid_tar;
 	}
@@ -549,7 +686,7 @@ restart:
 	if (eh.use_size) {
 		st.st_size = eh.st.st_size;
 	} else {
-		st.st_size = tarerofs_parsenum(th.size, sizeof(th.size));
+		st.st_size = tarerofs_parsenum(th->size, sizeof(th->size));
 		if (errno)
 			goto invalid_tar;
 	}
@@ -560,25 +697,25 @@ restart:
 		ST_MTIM_NSEC(&st) = ST_MTIM_NSEC(&eh.st);
 #endif
 	} else {
-		st.st_mtime = tarerofs_parsenum(th.mtime, sizeof(th.mtime));
+		st.st_mtime = tarerofs_parsenum(th->mtime, sizeof(th->mtime));
 		if (errno)
 			goto invalid_tar;
 	}
 
-	if (th.typeflag <= '7' && !eh.path) {
+	if (th->typeflag <= '7' && !eh.path) {
 		eh.path = path;
 		j = 0;
-		if (*th.prefix) {
-			memcpy(path, th.prefix, sizeof(th.prefix));
-			path[sizeof(th.prefix)] = '\0';
+		if (*th->prefix) {
+			memcpy(path, th->prefix, sizeof(th->prefix));
+			path[sizeof(th->prefix)] = '\0';
 			j = strlen(path);
 			if (path[j - 1] != '/') {
 				path[j] = '/';
 				path[++j] = '\0';
 			}
 		}
-		memcpy(path + j, th.name, sizeof(th.name));
-		path[j + sizeof(th.name)] = '\0';
+		memcpy(path + j, th->name, sizeof(th->name));
+		path[j + sizeof(th->name)] = '\0';
 		j = strlen(path);
 		while (path[j - 1] == '/')
 			path[--j] = '\0';
@@ -586,20 +723,30 @@ restart:
 
 	data_offset = tar->offset;
 	tar->offset += st.st_size;
-	if (th.typeflag == '0' || th.typeflag == '7' || th.typeflag == '1') {
+	switch(th->typeflag) {
+	case '0':
+	case '7':
+	case '1':
 		st.st_mode |= S_IFREG;
-	} else if (th.typeflag == '2') {
+		break;
+	case '2':
 		st.st_mode |= S_IFLNK;
-	} else if (th.typeflag == '3') {
+		break;
+	case '3':
 		st.st_mode |= S_IFCHR;
-	} else if (th.typeflag == '4') {
+		break;
+	case '4':
 		st.st_mode |= S_IFBLK;
-	} else if (th.typeflag == '5') {
+		break;
+	case '5':
 		st.st_mode |= S_IFDIR;
-	} else if (th.typeflag == '6') {
+		break;
+	case '6':
 		st.st_mode |= S_IFIFO;
-	} else if (th.typeflag == 'g') {
-		ret = tarerofs_parse_pax_header(tar->fd, &tar->global, st.st_size);
+		break;
+	case 'g':
+		ret = tarerofs_parse_pax_header(&tar->ios, &tar->global,
+						st.st_size);
 		if (ret)
 			goto out;
 		if (tar->global.path) {
@@ -611,31 +758,31 @@ restart:
 			eh.link = strdup(tar->global.link);
 		}
 		goto restart;
-	} else if (th.typeflag == 'x') {
-		ret = tarerofs_parse_pax_header(tar->fd, &eh, st.st_size);
+	case 'x':
+		ret = tarerofs_parse_pax_header(&tar->ios, &eh, st.st_size);
 		if (ret)
 			goto out;
 		goto restart;
-	} else if (th.typeflag == 'L') {
+	case 'L':
 		free(eh.path);
 		eh.path = malloc(st.st_size + 1);
-		if (st.st_size != erofs_read_from_fd(tar->fd, eh.path,
-						     st.st_size))
+		if (st.st_size != erofs_iostream_bread(&tar->ios, eh.path,
+						       st.st_size))
 			goto invalid_tar;
 		eh.path[st.st_size] = '\0';
 		goto restart;
-	} else if (th.typeflag == 'K') {
+	case 'K':
 		free(eh.link);
 		eh.link = malloc(st.st_size + 1);
 		if (st.st_size > PATH_MAX || st.st_size !=
-		    erofs_read_from_fd(tar->fd, eh.link, st.st_size))
+		    erofs_iostream_bread(&tar->ios, eh.link, st.st_size))
 			goto invalid_tar;
 		eh.link[st.st_size] = '\0';
 		goto restart;
-	} else {
+	default:
 		erofs_info("unrecognized typeflag %xh @ %llu - ignoring",
-			   th.typeflag, tar_offset);
-		(void)erofs_lskip(tar->fd, st.st_size);
+			   th->typeflag, tar_offset);
+		(void)erofs_iostream_lskip(&tar->ios, st.st_size);
 		ret = 0;
 		goto out;
 	}
@@ -644,22 +791,22 @@ restart:
 	if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
 		int major, minor;
 
-		major = tarerofs_parsenum(th.devmajor, sizeof(th.devmajor));
+		major = tarerofs_parsenum(th->devmajor, sizeof(th->devmajor));
 		if (errno) {
 			erofs_err("invalid device major @ %llu", tar_offset);
 			goto out;
 		}
 
-		minor = tarerofs_parsenum(th.devminor, sizeof(th.devminor));
+		minor = tarerofs_parsenum(th->devminor, sizeof(th->devminor));
 		if (errno) {
 			erofs_err("invalid device minor @ %llu", tar_offset);
 			goto out;
 		}
 
 		st.st_rdev = (major << 8) | (minor & 0xff) | ((minor & ~0xff) << 12);
-	} else if (th.typeflag == '1' || th.typeflag == '2') {
+	} else if (th->typeflag == '1' || th->typeflag == '2') {
 		if (!eh.link)
-			eh.link = strndup(th.linkname, sizeof(th.linkname));
+			eh.link = strndup(th->linkname, sizeof(th->linkname));
 	}
 
 	if (tar->index_mode && !tar->mapfile &&
@@ -689,7 +836,7 @@ restart:
 		DBG_BUGON(!d->inode);
 		ret = erofs_set_opaque_xattr(d->inode);
 		goto out;
-	} else if (th.typeflag == '1') {	/* hard link cases */
+	} else if (th->typeflag == '1') {	/* hard link cases */
 		struct erofs_dentry *d2;
 		bool dumb;
 
diff --git a/mkfs/main.c b/mkfs/main.c
index ea868bb..6d2b700 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -66,6 +66,9 @@ static struct option long_options[] = {
 	{"block-list-file", required_argument, NULL, 515},
 #endif
 	{"ovlfs-strip", optional_argument, NULL, 516},
+#ifdef HAVE_ZLIB
+	{"gzip", no_argument, NULL, 517},
+#endif
 	{0, 0, 0, 0},
 };
 
@@ -111,6 +114,9 @@ static void usage(void)
 	      " --force-gid=#         set all file gids to # (# = GID)\n"
 	      " --uid-offset=#        add offset # to all file uids (# = id offset)\n"
 	      " --gid-offset=#        add offset # to all file gids (# = id offset)\n"
+#ifdef HAVE_ZLIB
+	      " --gzip                try to filter the tarball stream through gzip\n"
+#endif
 	      " --help                display this help and exit\n"
 	      " --ignore-mtime        use build time instead of strict per-file modification time\n"
 	      " --max-extent-bytes=#  set maximum decompressed extent size # in bytes\n"
@@ -139,7 +145,7 @@ static unsigned int pclustersize_packed, pclustersize_max;
 static struct erofs_tarfile erofstar = {
 	.global.xattrs = LIST_HEAD_INIT(erofstar.global.xattrs)
 };
-static bool tar_mode, rebuild_mode;
+static bool tar_mode, rebuild_mode, gzip_supported;
 
 static unsigned int rebuild_src_count;
 static LIST_HEAD(rebuild_src_list);
@@ -525,6 +531,9 @@ static int mkfs_parse_options_cfg(int argc, char *argv[])
 			else
 				cfg.c_ovlfs_strip = false;
 			break;
+		case 517:
+			gzip_supported = true;
+			break;
 		case 1:
 			usage();
 			exit(0);
@@ -560,7 +569,17 @@ static int mkfs_parse_options_cfg(int argc, char *argv[])
 			erofs_err("missing argument: SOURCE(s)");
 			return -EINVAL;
 		} else {
-			erofstar.fd = STDIN_FILENO;
+			int dupfd;
+
+			dupfd = dup(STDIN_FILENO);
+			if (dupfd < 0) {
+				erofs_err("failed to duplicate STDIN_FILENO: %s",
+					  strerror(errno));
+				return -errno;
+			}
+			err = erofs_iostream_open(&erofstar.ios, dupfd, gzip_supported);
+			if (err)
+				return err;
 		}
 	} else {
 		struct stat st;
@@ -573,12 +592,15 @@ static int mkfs_parse_options_cfg(int argc, char *argv[])
 		}
 
 		if (tar_mode) {
-			erofstar.fd = open(cfg.c_src_path, O_RDONLY);
-			if (erofstar.fd < 0) {
+			int fd = open(cfg.c_src_path, O_RDONLY);
+
+			if (fd < 0) {
 				erofs_err("failed to open file: %s", cfg.c_src_path);
-				usage();
 				return -errno;
 			}
+			err = erofs_iostream_open(&erofstar.ios, fd, gzip_supported);
+			if (err)
+				return err;
 		} else {
 			err = lstat(cfg.c_src_path, &st);
 			if (err)
@@ -1182,6 +1204,8 @@ exit:
 	erofs_rebuild_cleanup();
 	erofs_diskbuf_exit();
 	erofs_exit_configure();
+	if (tar_mode)
+		erofs_iostream_close(&erofstar.ios);
 
 	if (err) {
 		erofs_err("\tCould not format the device : %s\n",
-- 
2.39.3



More information about the Linux-erofs mailing list