From 4ba42acfea72bbb378808bbf033396cd6a0e3d22 Mon Sep 17 00:00:00 2001 From: Konstantin Demin Date: Tue, 27 May 2025 11:36:06 +0300 Subject: [PATCH] initial commit --- .gitignore | 8 + Makefile | 40 +++ coreutils-sort.cc | 163 +++++++++ coreutils-sort.hh | 10 + main.cc | 174 ++++++++++ overlay-common.hh | 43 +++ overlay.cc | 842 ++++++++++++++++++++++++++++++++++++++++++++++ overlay.hh | 90 +++++ print.cc | 62 ++++ print.hh | 22 ++ sort.cc | 29 ++ sort.hh | 26 ++ xxhash.cc | 26 ++ xxhash.hh | 49 +++ 14 files changed, 1584 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 coreutils-sort.cc create mode 100644 coreutils-sort.hh create mode 100644 main.cc create mode 100644 overlay-common.hh create mode 100644 overlay.cc create mode 100644 overlay.hh create mode 100644 print.cc create mode 100644 print.hh create mode 100644 sort.cc create mode 100644 sort.hh create mode 100644 xxhash.cc create mode 100644 xxhash.hh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..78660df --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +## local copy +xxhash.h + +## build objects +*.o + +## build artefact +overlaydirs diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6d98eba --- /dev/null +++ b/Makefile @@ -0,0 +1,40 @@ +#!/usr/bin/make -f +SHELL :=/bin/sh +.SHELLFLAGS :=-efc +MAKEFLAGS +=--no-print-directory + +SRC = main.cc coreutils-sort.cc overlay.cc print.cc sort.cc xxhash.cc + +CROSS ?= +CC =$(CROSS)gcc +CXX =$(CROSS)g++ +STRIP =$(CROSS)strip + +CFLAGS_LTO ?=-flto=2 -fuse-linker-plugin -ffat-lto-objects -flto-partition=none +CFLAGS_COMMON ?=-O2 -g -fPIE -fstack-protector-strong +CFLAGS ?=$(CFLAGS_COMMON) $(CFLAGS_LTO) +CPPFLAGS ?=-Wall -Wextra -Werror=format-security -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 +CXXFLAGS ?=$(CFLAGS_COMMON) $(CFLAGS_LTO) -std=gnu++20 +LDFLAGS ?=-Wl,-z,relro -Wl,-z,now -pie + +NO_WARN = attributes class-memaccess unused-function unused-result +CPPFLAGS += $(foreach 
w,$(NO_WARN),-Wno-$(w))
+
+NO_CXX = rtti exceptions
+CXXFLAGS +=$(foreach f,$(NO_CXX),-fno-$(f))
+
+OBJ = $(SRC:.cc=.cc.o)
+
+.DEFAULT: all
+.PHONY: all build clean
+all build: overlaydirs
+
+%.cc.o: %.cc
+	$(CXX) -c $(CXXFLAGS) $(CPPFLAGS) -o $@ $^
+
+overlaydirs: $(OBJ)
+	$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $^ \
+	$(if $(filter $(RELMODE),1),&& $(STRIP) --strip-debug --strip-unneeded $@)
+
+clean:
+	rm -f overlaydirs $(OBJ)
diff --git a/coreutils-sort.cc b/coreutils-sort.cc
new file mode 100644
index 0000000..3bda749
--- /dev/null
+++ b/coreutils-sort.cc
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later
+   Origin: coreutils-9.7/lib/filevercmp.c
+   Copyright (C) 1995 Ian Jackson
+   Copyright (C) 2001 Anthony Towns
+   Copyright (C) 2008-2025 Free Software Foundation, Inc.
+*/
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include
+#include
+#include
+#include
+
+#include "coreutils-sort.hh"
+
+/* Return the length of a prefix of S that corresponds to the suffix
+   defined by this extended regular expression in the C locale:
+     (\.[A-Za-z~][A-Za-z0-9~]*)*$
+   Use the longest suffix matching this regular expression,
+   except do not use all of S as a suffix if S is nonempty.
+   If *LEN is -1, S is a string; set *LEN to S's length.
+   Otherwise, *LEN should be nonnegative, S is a char array,
+   and *LEN does not change.  Bytes are classified as unsigned char: a negative plain char is undefined for isalpha()/isalnum(). */
+static
+ptrdiff_t file_prefixlen(char const * s, ptrdiff_t * len)
+{
+	size_t n = *len; /* SIZE_MAX if N == -1. */
+	ptrdiff_t prefixlen = 0;
+
+	for (ptrdiff_t i = 0; ; ) {
+		if ((*len < 0) ? !s[i] : (i == n)) {
+			*len = i;
+			return prefixlen;
+		}
+
+		i++;
+		prefixlen = i;
+		while ((i + 1 < n) && (s[i] == '.') && (isalpha((unsigned char) s[i + 1]) || (s[i + 1] == '~'))) {
+			for (i += 2; (i < n) && (isalnum((unsigned char) s[i]) || (s[i] == '~')); i++) {
+				continue;
+			}
+		}
+	}
+}
+
+/* Return a version sort comparison value for S's byte at position POS.
+   S has length LEN. If POS == LEN, sort before all non-'~' bytes.
*/
+static
+int order(char const * s, ptrdiff_t pos, ptrdiff_t len)
+{
+	if (pos == len) return -1;
+
+	unsigned char c = s[pos];
+	if (isdigit(c)) return 0;
+	else if (isalpha(c)) return c;
+	else if (c == '~') return -2;
+	else
+	{
+		static_assert (UCHAR_MAX <= (INT_MAX - 1 - 2) / 2);
+		return c + UCHAR_MAX + 1;
+	}
+}
+
+/* slightly modified verrevcmp function from dpkg
+   S1, S2 - compared char array
+   S1_LEN, S2_LEN - length of arrays to be scanned
+   Returns <0, 0 or >0 as S1 sorts before, equal to or after S2; bytes reach isdigit() as unsigned char (negative char values are undefined there).
+   This implements the algorithm for comparison of version strings
+   specified by Debian and now widely adopted. The detailed
+   specification can be found in the Debian Policy Manual in the
+   section on the 'Version' control field. This version of the code
+   implements that from s5.6.12 of Debian Policy v3.8.0.1
+   https://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version */
+static
+int verrevcmp(const char * s1, ptrdiff_t s1_len, const char * s2, ptrdiff_t s2_len)
+{
+	ptrdiff_t s1_pos = 0;
+	ptrdiff_t s2_pos = 0;
+
+	while (s1_pos < s1_len || s2_pos < s2_len) {
+		int first_diff = 0;
+		while (((s1_pos < s1_len) && !isdigit((unsigned char) s1[s1_pos])) || ((s2_pos < s2_len) && !isdigit((unsigned char) s2[s2_pos]))) {
+			int s1_c = order(s1, s1_pos, s1_len);
+			int s2_c = order(s2, s2_pos, s2_len);
+			if (s1_c != s2_c) return s1_c - s2_c;
+			s1_pos++;
+			s2_pos++;
+		}
+
+		while ((s1_pos < s1_len) && (s1[s1_pos] == '0')) {
+			s1_pos++;
+		}
+		while ((s2_pos < s2_len) && (s2[s2_pos] == '0')) {
+			s2_pos++;
+		}
+		while ((s1_pos < s1_len) && (s2_pos < s2_len) && isdigit((unsigned char) s1[s1_pos]) && isdigit((unsigned char) s2[s2_pos])) {
+			if (!first_diff) first_diff = s1[s1_pos] - s2[s2_pos];
+			s1_pos++;
+			s2_pos++;
+		}
+
+		if ((s1_pos < s1_len) && isdigit((unsigned char) s1[s1_pos])) return 1;
+		if ((s2_pos < s2_len) && isdigit((unsigned char) s2[s2_pos])) return -1;
+		if (first_diff) return first_diff;
+	}
+	return 0;
+}
+
+static
+int filenvercmp (char const * a, ptrdiff_t alen, char const * b, ptrdiff_t blen)
+{
+	/* Special case for empty versions. */
+	bool aempty = (alen < 0) ?
!a[0] : !alen; + bool bempty = (blen < 0) ? !b[0] : !blen; + if (aempty) return -!bempty; + if (bempty) return 1; + + /* Special cases for leading ".": "." sorts first, then "..", then + other names with leading ".", then other names. */ + if (a[0] == '.') { + if (b[0] != '.') return -1; + + bool adot = alen < 0 ? !a[1] : alen == 1; + bool bdot = blen < 0 ? !b[1] : blen == 1; + if (adot) return -!bdot; + if (bdot) return 1; + + bool adotdot = (a[1] == '.') && ((alen < 0) ? !a[2] : (alen == 2)); + bool bdotdot = (b[1] == '.') && ((blen < 0) ? !b[2] : (blen == 2)); + if (adotdot) return -!bdotdot; + if (bdotdot) return 1; + } + else if (b[0] == '.') { + return 1; + } + + /* Cut file suffixes. */ + ptrdiff_t aprefixlen = file_prefixlen(a, &alen); + ptrdiff_t bprefixlen = file_prefixlen(b, &blen); + + /* If both suffixes are empty, a second pass would return the same thing. */ + bool one_pass_only = (aprefixlen == alen) && (bprefixlen == blen); + + int result = verrevcmp(a, aprefixlen, b, bprefixlen); + + /* Return the initial result if nonzero, or if no second pass is needed. + Otherwise, restore the suffixes and try again. */ + return (result || one_pass_only) ? 
result : verrevcmp(a, alen, b, blen);
+}
+
+static inline
+int filevercmp(const char * s1, const char * s2)
+{
+	return filenvercmp(s1, -1, s2, -1);
+}
+
+int coreutils_version_sort(char const * a, char const * b)
+{
+	return filevercmp(a, b);
+}
diff --git a/coreutils-sort.hh b/coreutils-sort.hh
new file mode 100644
index 0000000..4fb287f
--- /dev/null
+++ b/coreutils-sort.hh
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * (c) 2025, Konstantin Demin
+ */
+
+#ifndef INCLUDE_COREUTILS_SORT_HH
+#define INCLUDE_COREUTILS_SORT_HH 1
+
+int coreutils_version_sort(char const * a, char const * b);
+
+#endif /* INCLUDE_COREUTILS_SORT_HH */
diff --git a/main.cc b/main.cc
new file mode 100644
index 0000000..10c04f8
--- /dev/null
+++ b/main.cc
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * (c) 2025, Konstantin Demin
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include
+#include
+#include
+
+extern "C" {
+	#include
+}
+
+#include "overlay.hh"
+/* Print the usage text to stderr and terminate the process with exit status RETCODE (never returns). */
+static
+void usage(int retcode = 0)
+{
+	static const char usage_msg[] =
+		"overlaydirs 0.0.1\n"
+		"Usage:\n"
+		" overlaydirs --help\n"
+		" show this message\n"
+		" overlaydirs --list [--no-sort|--zero] [..]\n"
+		// TODO: shell-escape mode
+		/* " overlaydirs --list [--no-sort|--zero|--escape] [..]\n" */
+		" list entries\n"
+		" overlaydirs --merge [..]\n"
+		" symlinks entries into \n"
+		"\n"
+		" --no-sort - don't sort entries\n"
+		" --zero - separate entries with NUL instead of LF\n"
+		// TODO: shell-escape mode
+		/*
+		" --escape - shell-escape strings\n"
+		"\n"
+		"Notes:\n"
+		" - flags \"--zero\" and \"--escape\" are mutually exclusive.\n"
+		*/
+		;
+	/* sizeof() also counts the terminating NUL, which must not be written to stderr */
+	(void) write(STDERR_FILENO, usage_msg, sizeof(usage_msg) - 1);
+
+	exit(retcode);
+}
+
+static int main_list(int argc, char * argv[]);
+static int main_merge(int argc, char * argv[]);
+
+int main(int argc, char * argv[])
+{
+	if (argc < 2) usage(0);
+
+	if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0))
+		usage(0);
+	else if
(strcmp(argv[1], "--list") == 0) + return main_list(argc, argv); + else if (strcmp(argv[1], "--merge") == 0) + return main_merge(argc, argv); + else + usage(EINVAL); + + return 0; +} + +static int main_list(int argc, char * argv[]) +{ + if (argc < 3) usage(EINVAL); + + print_mode print_m = print_mode::_default; + sort_mode sort_m = sort_mode::_default; + + int arg_start = 2; + while (arg_start < argc) { + char * _arg = argv[arg_start]; + + if (strncmp(_arg, "--", 2) != 0) break; + + if (strcmp(_arg, "--") == 0) { + arg_start++; + break; + } + else if (strcmp(_arg, "--no-sort") == 0) { + arg_start++; + if (sort_m != sort_mode::_default) { + (void) fprintf(stderr, "overlaydirs: no-sort mode already set\n"); + } + sort_m = sort_mode::none; + } + else if (strcmp(_arg, "--zero") == 0) { + arg_start++; + if (print_m != print_mode::_default) { + (void) fprintf(stderr, "overlaydirs: output mode already set\n"); + } + print_m = print_mode::zero; + } + // TODO + /* + else if (strcmp(_arg, "--escape") == 0) { + arg_start++; + if (print_m != print_mode::_default) { + (void) fprintf(stderr, "overlaydirs: output mode already set\n"); + } + print_m = print_mode::shell_escape; + } + */ + else { + (void) fprintf(stderr, "overlaydirs: unknown option \"%s\"\n", _arg); + // nevertheless, continue and try argument as a directory + break; + } + } + + auto roots = std::vector(); + for (int i = arg_start; i < argc; i++) { + ovl_dirspec_t t; + if (!arg_to_rootspec(argv[i], &t, ovl_rootspec_flags::relative)) continue; + roots.push_back(t); + } + if (roots.empty()) { + (void) fprintf(stderr, "overlaydirs: no usable sources found\n"); + return EINVAL; + } + + auto ovl = process_overlay(roots, sort_m); + // not needed anymore + roots.clear(); + roots.shrink_to_fit(); + + list_overlay(ovl, print_m); + return 0; +} + +static int main_merge(int argc, char * argv[]) +{ + if (argc < 4) usage(EINVAL); + + ovl_dirspec_t target; + if (!arg_to_rootspec(argv[2], &target, ovl_rootspec_flags::relative)) { 
+ (void) fprintf(stderr, "overlaydirs: not a directory: \"%s\"\n", argv[2]); + return EINVAL; + } + if (!is_empty_dir(target.fd)) { + (void) fprintf(stderr, "overlaydirs: target directory is not empty\n"); + return EEXIST; + } + + auto roots = std::vector(); + for (int i = 3; i < argc; i++) { + ovl_dirspec_t t; + if (!arg_to_rootspec(argv[i], &t)) continue; + roots.push_back(t); + } + if (roots.empty()) { + (void) fprintf(stderr, "overlaydirs: no usable sources found\n"); + return EINVAL; + } + + auto ovl = process_overlay(roots); + // not needed anymore + roots.clear(); + roots.shrink_to_fit(); + + int rv = merge_overlay(ovl, target); + if (rv == -1) { + (void) fprintf(stderr, "overlaydirs: target directory is not empty\n"); + return EEXIST; + } + + return rv; +} diff --git a/overlay-common.hh b/overlay-common.hh new file mode 100644 index 0000000..53b61e1 --- /dev/null +++ b/overlay-common.hh @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#ifndef INCLUDE_OVERLAY_COMMON_HH +#define INCLUDE_OVERLAY_COMMON_HH 1 + +#include + +extern "C" { + #include +} + +constexpr unsigned int ovl_name_max = sizeof(dirent::d_name); +constexpr unsigned int ovl_path_max = (PATH_MAX); +/* +constexpr unsigned int ovl_path_max = +#ifdef PATH_MAX + #if (PATH_MAX > 0) && (PATH_MAX <= 8192) + (PATH_MAX); + #else + 8192; + #endif +#else + 4096; +#endif +*/ + +constexpr unsigned int ovl_path_name_ratio = ovl_path_max / ovl_name_max; +constexpr unsigned int ovl_depth_max = ovl_path_name_ratio - 1; + +struct ovl_name_t { + size_t len; + char str[ovl_name_max]; +}; + +struct ovl_path_t { + size_t len; + char str[ovl_path_max]; +}; +typedef std::vector ovl_path_vec; + +#endif /* INCLUDE_OVERLAY_COMMON_HH */ diff --git a/overlay.cc b/overlay.cc new file mode 100644 index 0000000..2db9d41 --- /dev/null +++ b/overlay.cc @@ -0,0 +1,842 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#include "overlay.hh" + +#include 
+#include +#include +#include +#include + +extern "C" { + #include + #include + #include + + #include + #include + + #include +} + +struct x_dirspec_t { + dev_t dev; + ino_t ino; + int fd; +}; +typedef std::vector dirspec_vec; + +typedef std::set> dev_ino_set; + +struct x_entry_t { + int32_t root_id; + uint32_t type; + xxhash_t name_hash; + dev_t dev; + ino_t ino; + ovl_name_t name; +}; + +static +size_t procfs_fd2name(int fd, char * buffer, size_t size); + +template +static inline +bool set_contains(const std::set & set, const T & needle); + +static inline +bool filter_out_dots(const char * name); + +static inline +bool handle_file_type(int type, bool allow_symlinks = false); + +static inline +ovl_entry_kind dirent_to_entry_kind(typeof(dirent::d_type) d_type); + +bool arg_to_rootspec(const char * arg, ovl_dirspec_t * rootspec, uint32_t flags) +{ + if ((!arg) || (!rootspec)) return false; + if (!(arg[0])) return false; + + int fd, fd_real; + fd = open(arg, O_DIRECTORY | O_RDONLY); + // here is relatively small window for race/TOCTOU + fd_real = open(arg, O_PATH | O_NOFOLLOW | O_RDONLY); + + if (fd < 0) { + if (fd_real >= 0) (void) close(fd_real); + return false; + } + + (void) memset(rootspec, 0, sizeof(ovl_dirspec_t)); + + do { + struct stat st; + (void) memset(&st, 0, sizeof(st)); + if (fstat(fd, &st) != 0) break; + + if (!S_ISDIR(st.st_mode)) break; + + rootspec->dev = st.st_dev; + rootspec->ino = st.st_ino; + + rootspec->fd = fd; + rootspec->root.len = procfs_fd2name((fd_real < 0) ? 
fd : fd_real, rootspec->root.str, ovl_path_max); + } while (false); + + if ((fd_real >= 0) && (fd_real != fd)) { + (void) close(fd_real); + } + + // basic sanity checks: + // - path length is non-zero + // - either dev_t or ino_t are non-zero + if ((!rootspec->root.len) || ((!rootspec->dev) && (!rootspec->ino))) { + // not needed anymore + (void) close(fd); + + (void) memset(rootspec, 0, sizeof(ovl_dirspec_t)); + return false; + } + + // try to adjust/shorten path against current working directory + do { + bool relative = !!(flags & ovl_rootspec_flags::relative); + if (!relative) break; + + ovl_path_t cwd; + (void) memset(&cwd, 0, sizeof(cwd)); + + if (!getcwd(cwd.str, ovl_path_max)) break; + cwd.len = strnlen(cwd.str, ovl_path_max); + if ((!cwd.len) || (cwd.len == ovl_path_max)) break; + + // cwd is root - nothing to do here + if ((cwd.len == 1) && (cwd.str[0] == '/')) break; + + // ensure that rootpath has prefix of cwd + "/" + if (cwd.len >= rootspec->root.len) break; + if (strncmp(rootspec->root.str, cwd.str, cwd.len) != 0) break; + if (rootspec->root.str[cwd.len] != '/') break; + + // "cwd" isn't needed anymore so reuse it as temporary container + (void) memset(cwd.str, 0, cwd.len); + size_t prefix_len = cwd.len + 1; + cwd.len = rootspec->root.len - prefix_len; + (void) memcpy(cwd.str, &rootspec->root.str[prefix_len], cwd.len); + // finally replace with adjusted value + (void) memcpy(&rootspec->root, &cwd, sizeof(ovl_path_t)); + } while (false); + + return true; +} + +bool is_empty_dir(int dirfd) +{ + // internal method, parameters are already checked + /* if (dirfd < 0) return false; */ + + // reset handle to beginning + (void) lseek(dirfd, 0, SEEK_SET); + + int fd_dir = dup(dirfd); + if (fd_dir < 0) return false; + + DIR * p_dir = fdopendir(fd_dir); + if (!p_dir) { + (void) close(fd_dir); + return false; + } + + // read current directory entries + bool empty = true; + for (struct dirent * dent = nullptr; (dent = readdir(p_dir)) != nullptr; ) { + if 
(!filter_out_dots(dent->d_name)) continue;
+		empty = false;
+		break;
+	}
+
+	// rewind directory
+	rewinddir(p_dir);
+	(void) lseek(fd_dir, 0, SEEK_SET);
+
+	// close handles
+	(void) closedir(p_dir);
+	// fdopendir() handed fd_dir over to the DIR stream, so closedir() above has
+	// already closed it; a second close() could hit an unrelated, reused descriptor
+
+	return empty;
+}
+
+class overlay_processor {
+	sort_mode _sort;
+	unsigned int _depth;
+	ovl_dirspec_vec _roots;
+
+	// we should report problems when they arise but there's common problem:
+	// name is relative to almost unknown path and we don't know the full path.
+	// furthermore we won't track path due to performance reasons.
+	ovl_entry_vec _impl(const dirspec_vec & paths, unsigned int depth) {
+		ovl_entry_vec rv = ovl_entry_vec();
+
+		if (!depth) return rv;
+
+		int n_usable_paths = 0;
+		for (const auto & p : paths) {
+			if (p.fd < 0) continue;
+			n_usable_paths++;
+		}
+		if (!n_usable_paths) return rv;
+
+		bool multiple_layers = n_usable_paths > 1;
+
+		auto seen_paths = dev_ino_set();
+		auto dirs = std::list();
+		auto nondirs = std::list();
+
+		for (size_t i = 0; i < paths.size(); i++) {
+			const auto & p = paths[i];
+			if (p.fd < 0) continue;
+
+			if (multiple_layers) {
+				auto needle = std::make_tuple(p.dev, p.ino);
+				if (set_contains(seen_paths, needle)) {
+					continue;
+				}
+				(void) seen_paths.emplace(needle);
+			}
+
+			// reset handle to beginning
+			(void) lseek(p.fd, 0, SEEK_SET);
+
+			int fd_dir = dup(p.fd);
+			if (fd_dir < 0) {
+				(void) fprintf(stderr, "overlaydirs: dup(%d): %s\n", p.fd, strerror(errno));
+				continue;
+			}
+
+			DIR * p_dir = fdopendir(fd_dir);
+			if (!p_dir) {
+				(void) fprintf(stderr, "overlaydirs: fdopendir(%d): %s\n", fd_dir, strerror(errno));
+				(void) close(fd_dir);
+				continue;
+			}
+
+			// read current directory entries
+			auto ent_all = std::vector();
+			for (struct dirent * dent = nullptr; (dent = readdir(p_dir)) != nullptr; ) {
+				if (!filter_out_dots(dent->d_name)) continue;
+				if (!handle_file_type(dent->d_type)) continue;
+
+				
x_entry_t t;
+				(void) memset(&t, 0, sizeof(t));
+
+				// 1. only basic info is filled - name and type
+				// 2. directories and symlinks are handled later
+				// 3. dev_t and ino_t are relevant only for directories
+
+				t.name.len = strnlen(dent->d_name, sizeof(dirent::d_name));
+				if (t.name.len == sizeof(dirent::d_name)) {
+					// TODO: report warning about too long name
+					// (see function's comment)
+					continue;
+				}
+
+				t.type = dent->d_type;
+				(void) memcpy(t.name.str, dent->d_name, t.name.len);
+
+				ent_all.push_back(t);
+			}
+
+			// rewind directory
+			rewinddir(p_dir);
+			(void) lseek(fd_dir, 0, SEEK_SET);
+
+			// close handles
+			(void) closedir(p_dir); /* p_dir = nullptr; */
+			// fd_dir was handed over to the DIR stream by fdopendir(), so closedir() has
+			// already closed it; closing it again could hit an unrelated, reused descriptor
+
+			// process entries
+			auto hash_skip = std::set();
+			auto ent_dirs = std::list();
+			auto ent_nondirs = std::list();
+			for (auto & x : ent_all) {
+				bool is_skip = false;
+				do {
+					if (x.name.len < 2) break;
+					/* if (strncmp(&x.name.name[x.name.len - 2], ".-", 2) != 0) break; */
+					if (x.name.str[x.name.len - 2] != '.') break;
+					if (x.name.str[x.name.len - 1] != '-') break;
+
+					// current entry is "skip" entry
+					is_skip = true;
+				} while (false);
+				if (is_skip) {
+					// skip file is found but we're not using them
+					if (!multiple_layers) continue;
+
+					// don't use "bare skip" entries (e.g.
named exactly ".-") + if (x.name.len == 2) continue; + + (void) hash_skip.emplace(xxhash(x.name.str, x.name.len - 2)); + continue; + } + + if ((x.type == DT_DIR) || (x.type == DT_LNK)) { + struct stat st; + (void) memset(&st, 0, sizeof(st)); + + if (fstatat(p.fd, x.name.str, &st, 0) != 0) { + // TODO: report warning about unavailable entry + // (see function' comment) + continue; + } + + // handle (new) entry type + typeof(x_entry_t::type) new_type = IFTODT(st.st_mode); + if ((x.type == DT_DIR) && (new_type != DT_DIR)) { + // TODO: report warning about inconsistent entry type + // (see function' comment) + // PS: this is probably a bug + continue; + } + + // skip dangling symlinks along with unknown file types + if (!handle_file_type(new_type, true)) { + // TODO: report warning about unapplicable entry type + // (see function' comment) + continue; + } + + // adjust entry type + x.type = new_type; + + if (x.type == DT_DIR) { + if (depth > 1) { + // we should track precise dev_t and ino_t for directories + x.dev = st.st_dev; + x.ino = st.st_ino; + } + else { + // end-of-depth directories are handled as regular files + x.type = DT_REG; + } + } + } + + x.root_id = i; + x.name_hash = xxhash(x.name.str, x.name.len); + + if (x.type == DT_DIR) { + ent_dirs.push_back(x); + } else { + ent_nondirs.push_back(x); + } + } + // not needed anymore + ent_all.clear(); + ent_all.shrink_to_fit(); + + // remove "skip" entries + if (!hash_skip.empty()) { + nondirs.remove_if( + [&hash_skip](const x_entry_t & x) -> bool { + return set_contains(hash_skip, x.name_hash); + } + ); + + dirs.remove_if( + [&hash_skip](const x_entry_t & x) -> bool { + return set_contains(hash_skip, x.name_hash); + } + ); + + // not needed anymore + hash_skip.clear(); + } + + // "hash_skip" is now empty and doesn't contain exactly "skip" entries + // but is going to be reused further + + // override all entries with current "non-dir" entries + if (!ent_nondirs.empty()) { + for (const auto & x : ent_nondirs) { + 
(void) hash_skip.emplace(x.name_hash); + } + + nondirs.remove_if( + [&hash_skip](const x_entry_t & x) -> bool { + return set_contains(hash_skip, x.name_hash); + } + ); + + dirs.remove_if( + [&hash_skip](const x_entry_t & x) -> bool { + return set_contains(hash_skip, x.name_hash); + } + ); + + // not needed anymore + hash_skip.clear(); + } + + // override "non-dir" entries with "dir" entries + if (!ent_dirs.empty()) { + for (const auto & x : ent_dirs) { + (void) hash_skip.emplace(x.name_hash); + } + + nondirs.remove_if( + [&hash_skip](const x_entry_t & x) -> bool { + return set_contains(hash_skip, x.name_hash); + } + ); + + // not needed anymore + hash_skip.clear(); + } + + // merge current entries with accumulated "all" entries + for (const auto & x : ent_nondirs) { + nondirs.push_back(x); + } + ent_nondirs.clear(); + for (const auto & x : ent_dirs) { + dirs.push_back(x); + } + ent_dirs.clear(); + } + + // sort "dir" entries + if (_sort != sort_mode::none) { + dirs.sort( + [this](const x_entry_t & a, const x_entry_t & b) -> bool { + if (a.name_hash == b.name_hash) return a.root_id < b.root_id; + + return custom_sort(a.name.str, b.name.str, _sort) < 0; + } + ); + } + + // merge entries + auto ent_all = std::list(); + for (const auto & x : dirs) { + ent_all.push_back(x); + } + for (const auto & x : nondirs) { + ent_all.push_back(x); + } + // not needed anymore + nondirs.clear(); + + // sort entries + if (_sort != sort_mode::none) { + ent_all.sort( + [this](const x_entry_t & a, const x_entry_t & b) -> bool { + // in case of directory entries + if (a.name_hash == b.name_hash) return a.root_id < b.root_id; + + return custom_sort(a.name.str, b.name.str, _sort) < 0; + } + ); + } + + auto seen_dirs = std::set(); + for (const auto & x : ent_all) { + if (x.root_id < 0) continue; + + ovl_entry_kind etype = dirent_to_entry_kind(x.type); + if (etype == ovl_entry_kind::unknown) continue; + if (etype == ovl_entry_kind::directory) { + // skip if directory with the same name is 
already handled + if (set_contains(seen_dirs, x.name_hash)) continue; + (void) seen_dirs.emplace(x.name_hash); + } + + ovl_entry_t t; + t.type = etype; + t.root_id = x.root_id; + t.name_hash = x.name_hash; + t.name.len = x.name.len; + (void) memcpy(t.name.str, x.name.str, t.name.len); + + if (t.type == ovl_entry_kind::directory) { + // root_id is meaningless for directories + t.root_id = 0; + + auto child_paths = dirspec_vec(); + for (size_t i = 0; i < paths.size(); i++) { + child_paths.push_back(x_dirspec_t{ .dev = 0, .ino = 0, .fd = -1 }); + } + + for (const auto & x : dirs) { + if (x.name_hash != t.name_hash) continue; + + int fd = openat(paths[x.root_id].fd, t.name.str, O_DIRECTORY | O_RDONLY); + if (fd < 0) { + // TODO: report warning about unsuccessful openat() + // (see function' comment) + continue; + } + + child_paths[x.root_id].dev = x.dev; + child_paths[x.root_id].ino = x.ino; + child_paths[x.root_id].fd = fd; + } + + t.children = _impl(child_paths, depth - 1); + + for (const auto & x : child_paths) { + if (x.fd < 0) continue; + (void) close(x.fd); + } + child_paths.clear(); + child_paths.shrink_to_fit(); + } + + rv.push_back(t); + } + // not needed anymore + dirs.clear(); + + return rv; + } + +public: + overlay_processor(const ovl_dirspec_vec & roots, unsigned int depth, sort_mode sort) { + _roots = ovl_dirspec_vec(); + for (const auto & r : roots) { + _roots.push_back(r); + } + _sort = sort; + _depth = (depth > ovl_depth_max) ? 
ovl_depth_max : depth; + } + + ovl_result run(void) { + auto rv = ovl_result(); + + auto paths = dirspec_vec(); + for (const auto & r : _roots) { + rv.roots.push_back(r.root); + paths.push_back(x_dirspec_t{ .dev = r.dev, .ino = r.ino, .fd = r.fd }); + } + rv.entries = _impl(paths, _depth); + return rv; + } +}; + +ovl_result process_overlay(ovl_dirspec_vec & roots, sort_mode sort, unsigned int depth) +{ + auto rv = ovl_result(); + if (!depth) return rv; + if (roots.empty()) return rv; + + rv = overlay_processor(roots, depth, sort).run(); + + // close all open file descriptors + for (auto & r : roots) { + (void) close(r.fd); + r.fd = -1; + } + + return rv; +} + +class overlay_printer { + print_mode _print; + ovl_result _ovl; + + void _impl(const ovl_entry_vec & entries, const ovl_path_t * subpath = nullptr) { + for (const auto & e : entries) { + switch (e.type) { + case ovl_entry_kind::file: + print_path(_print, &(_ovl.roots[e.root_id]), subpath, &(e.name)); + break; + + case ovl_entry_kind::directory: + // NOT printing directory name + + ovl_path_t n_subpath; + (void) memset(&n_subpath, 0, sizeof(n_subpath)); + if (subpath) { + n_subpath.len = subpath->len + 1 + e.name.len; + if (n_subpath.len >= ovl_path_max) { + (void) fprintf(stderr, "subpath too long: /%s/%s/\n", subpath->str, e.name.str); + continue; + } + + /* (void) snprintf(n_subpath.str, ovl_path_max, "%s/%s", subpath->str, e.name.str); */ + size_t off = 0; + (void) memcpy(n_subpath.str, subpath->str, subpath->len); + off = subpath->len; + n_subpath.str[off++] = '/'; + (void) memcpy(n_subpath.str + off, e.name.str, e.name.len); + } else { + // ovl_name_max is definitely smaller than ovl_path_max + + n_subpath.len = e.name.len; + (void) memcpy(n_subpath.str, e.name.str, e.name.len); + } + + _impl(e.children, &n_subpath); + break; + + default: + break; + } + } + } + +public: + overlay_printer(const ovl_result & ovl, print_mode print) { + _ovl = ovl; + _print = print; + } + + void run(void) { + 
_impl(_ovl.entries); + } +}; + +void list_overlay(const ovl_result & overlay, print_mode print) +{ + overlay_printer(overlay, print).run(); +} + +class overlay_merger { + ovl_result _ovl; + ovl_dirspec_t _target; + + int _impl(const ovl_entry_vec & entries, int dirfd, const ovl_path_t * subpath = nullptr) { + int rv = 0; + + int new_fd; + ovl_path_t t; + for (const auto & e : entries) { + switch (e.type) { + case ovl_entry_kind::file: + (void) memset(&t, 0, sizeof(t)); + + if (subpath) { + t.len = _ovl.roots[e.root_id].len + 1 + subpath->len + 1 + e.name.len; + if (t.len >= ovl_path_max) { + (void) fprintf(stderr, "name too long: %s/%s/%s\n", _ovl.roots[e.root_id].str, subpath->str, e.name.str); + return ENAMETOOLONG; + } + + /* (void) snprintf(t.str, ovl_path_max, "%s/%s/%s\n", _ovl.roots[e.root_id].str, subpath->str, e.name.str); */ + size_t off = 0; + (void) memcpy(t.str, _ovl.roots[e.root_id].str, _ovl.roots[e.root_id].len); + off = _ovl.roots[e.root_id].len; + t.str[off++] = '/'; + (void) memcpy(t.str + off, subpath->str, subpath->len); + off += subpath->len; + t.str[off++] = '/'; + (void) memcpy(t.str + off, e.name.str, e.name.len); + } else { + t.len = _ovl.roots[e.root_id].len + 1 + e.name.len; + if (t.len >= ovl_path_max) { + (void) fprintf(stderr, "name too long: %s/%s\n", _ovl.roots[e.root_id].str, e.name.str); + return ENAMETOOLONG; + } + + /* (void) snprintf(t.str, ovl_path_max, "%s/%s\n", _ovl.roots[e.root_id].str, e.name.str); */ + size_t off = 0; + (void) memcpy(t.str, _ovl.roots[e.root_id].str, _ovl.roots[e.root_id].len); + off = _ovl.roots[e.root_id].len; + t.str[off++] = '/'; + (void) memcpy(t.str + off, e.name.str, e.name.len); + } + + if (symlinkat(t.str, dirfd, e.name.str) < 0) { + rv = errno; + if (subpath) { + (void) fprintf(stderr, "symlinkat() error: \"%s\" with %s/%s/%s -> %s\n", strerror(rv), _target.root.str, subpath->str, e.name.str, t.str); + } else { + (void) fprintf(stderr, "symlinkat() error: \"%s\" with %s/%s -> %s\n", 
strerror(rv), _target.root.str, e.name.str, t.str); + } + return rv; + } + break; + + case ovl_entry_kind::directory: + (void) memset(&t, 0, sizeof(t)); + + if (subpath) { + t.len = subpath->len + 1 /* "/" */ + e.name.len; + if (t.len >= ovl_path_max) { + (void) fprintf(stderr, "subpath too long: /%s/%s/\n", subpath->str, e.name.str); + return ENAMETOOLONG; + } + + /* (void) snprintf(t.str, ovl_path_max, "%s/%s", subpath->str, e.name.str); */ + (void) memcpy(t.str, subpath->str, subpath->len); + t.str[subpath->len] = '/'; + (void) memcpy(t.str + subpath->len + 1, e.name.str, e.name.len); + } else { + // ovl_name_max is definitely smaller than ovl_path_max + + t.len = e.name.len; + (void) memcpy(t.str, e.name.str, e.name.len); + } + + if (mkdirat(dirfd, e.name.str, 0755) < 0) { + rv = errno; + if (subpath) { + (void) fprintf(stderr, "mkdirat() error: \"%s\" with path %s/%s/%s\n", strerror(rv), _target.root.str, subpath->str, e.name.str); + } else { + (void) fprintf(stderr, "mkdirat() error: \"%s\" with path %s/%s\n", strerror(rv), _target.root.str, e.name.str); + } + return rv; + } + + new_fd = openat(dirfd, e.name.str, O_DIRECTORY | O_RDONLY); + if (new_fd < 0) { + rv = errno; + if (subpath) { + (void) fprintf(stderr, "openat() error: \"%s\" with path %s/%s/%s\n", strerror(rv), _target.root.str, subpath->str, e.name.str); + } else { + (void) fprintf(stderr, "openat() error: \"%s\" with path %s/%s\n", strerror(rv), _target.root.str, e.name.str); + } + return rv; + } + + rv = _impl(e.children, new_fd, &t); + (void) close(new_fd); + + if (rv != 0) return rv; + + break; + + default: + break; + } + } + + return rv; + } + +public: + overlay_merger(const ovl_result & ovl, const ovl_dirspec_t & target) { + _ovl = ovl; + _target = target; + } + + int run(void) { + return _impl(_ovl.entries, _target.fd); + } +}; + +int merge_overlay(const ovl_result & overlay, const ovl_dirspec_t & target) +{ + if (!is_empty_dir(target.fd)) return -1; + + return overlay_merger(overlay, 
target).run(); +} + +static +size_t procfs_fd2name(int fd, char * buffer, size_t size) +{ + // internal method, parameters are already checked + /* if ((fd < 0) || (!buffer) || (!size)) return 0; */ + + // "/proc/self/fd/" - 14, "%d" - up to 10 + char procfs_link[32]; + (void) memset(procfs_link, 0, sizeof(procfs_link)); + (void) snprintf(procfs_link, sizeof(procfs_link) - 1, "/proc/self/fd/%d", fd); + + (void) memset(buffer, 0, size); + ssize_t x = readlink(procfs_link, buffer, size - 1); + x = (x > 0) ? x : 0; + // not really needed + buffer[x] = 0; + + // already clamped + return (size_t) x; +} + +#ifndef HAVE_CPP_SET_CONTAINS + #ifdef __cplusplus + #if __cplusplus >= 202002L + #define HAVE_CPP_SET_CONTAINS 1 + #endif + #endif +#endif + +#ifndef HAVE_CPP_SET_CONTAINS + #define HAVE_CPP_SET_CONTAINS 0 +#endif + +template +static inline +bool set_contains(const std::set & set, const T & needle) +{ +#if HAVE_CPP_SET_CONTAINS + return set.contains(needle); +#else + return (set.find(needle) != set.cend()); +#endif +} + +static inline +bool filter_out_dots(const char * name) +{ + /* + if (strcmp(name, ".") == 0) return false; + if (strcmp(name, "..") == 0) return false; + return true; + */ + + if (name[0] != '.') return true; + switch (name[1]) { + case 0: return false; + case '.': return (name[2] != 0); + } + return true; +} + +static inline +bool handle_file_type(int type, bool allow_symlinks) +{ + switch (type) { + // common file types + case DT_REG: /* -fallthrough */ + case DT_DIR: /* -fallthrough */ + + // less common types (but also allowed) + case DT_BLK: /* -fallthrough */ + case DT_CHR: /* -fallthrough */ + case DT_FIFO: /* -fallthrough */ + case DT_SOCK: + return true; + + // symlinks should be handled separately! 
+ case DT_LNK: + return !allow_symlinks; + + default: + return false; + } +} + +static inline +ovl_entry_kind dirent_to_entry_kind(typeof(dirent::d_type) d_type) { + switch (d_type) { + case DT_DIR: + return ovl_entry_kind::directory; + + case DT_REG: /* -fallthrough */ + case DT_BLK: /* -fallthrough */ + case DT_CHR: /* -fallthrough */ + case DT_FIFO: /* -fallthrough */ + case DT_SOCK: + return ovl_entry_kind::file; + + // symlinks should be already handled! + + } + return ovl_entry_kind::unknown; +} diff --git a/overlay.hh b/overlay.hh new file mode 100644 index 0000000..103160f --- /dev/null +++ b/overlay.hh @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#ifndef INCLUDE_OVERLAY_HH +#define INCLUDE_OVERLAY_HH 1 + +#include + +extern "C" { + #include +} + +#include "overlay-common.hh" + +#include "print.hh" +#include "sort.hh" +#include "xxhash.hh" + +struct ovl_dirspec_t { + dev_t dev; + ino_t ino; + int fd; + ovl_path_t root; +}; +typedef std::vector<ovl_dirspec_t> ovl_dirspec_vec; + +enum ovl_rootspec_flags : uint32_t { + none = 0, + + // relative path (if possible) + relative = (1 << 0), +}; + +// forward declaration +struct ovl_entry_t; +typedef std::vector<ovl_entry_t> ovl_entry_vec; + +enum struct ovl_entry_kind : uint32_t { + unknown = 0, + + // entry may have its own entries + directory, + + // entry may NOT have its own entries + // this applies to: + // - regular files + // - directories at maximum depth (!)
+ // - block and character devices + // - pipes (FIFO) + // - sockets + file, +}; + +struct ovl_entry_t { + ovl_entry_kind type; + int32_t root_id; + xxhash_t name_hash; + ovl_entry_vec children; + ovl_name_t name; + + ovl_entry_t() { + root_id = -1; + type = ovl_entry_kind::unknown; + name_hash = 0; + children = ovl_entry_vec(); + (void) memset(&name, 0, sizeof(name)); + } +}; + +struct ovl_result { + ovl_path_vec roots; + ovl_entry_vec entries; + + ovl_result() { + roots = ovl_path_vec(); + entries = ovl_entry_vec(); + } +}; + +bool arg_to_rootspec(const char * arg, ovl_dirspec_t * path, uint32_t flags = ovl_rootspec_flags::none); + +bool is_empty_dir(int dirfd); + +ovl_result process_overlay(ovl_dirspec_vec & roots, sort_mode sort = sort_mode::none, unsigned int depth = ovl_depth_max); + +void list_overlay(const ovl_result & overlay, print_mode print = print_mode::normal); +int merge_overlay(const ovl_result & overlay, const ovl_dirspec_t & target); + +#endif /* INCLUDE_OVERLAY_HH */ diff --git a/print.cc b/print.cc new file mode 100644 index 0000000..75a136a --- /dev/null +++ b/print.cc @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#include "print.hh" + +#include +#include + +extern "C" { + #include +} + +static +void print_simple(print_mode mode, const ovl_path_t * root, const ovl_path_t * subpath, const ovl_name_t * name); + +// TODO: shell-escape mode +/* +static +void print_escape(print_mode mode, const ovl_path_t * root, const ovl_path_t * subpath, const ovl_name_t * name); +*/ + +void print_path(print_mode mode, const ovl_path_t * root, const ovl_path_t * subpath, const ovl_name_t * name) +{ + // "root" and "name" are mandatory + /* if ((!root) || (!name)) return; */ + + switch (mode) { + case print_mode::normal: /* -fallthrough */ + case print_mode::zero: + print_simple(mode, root, subpath, name); + return; + // TODO: shell-escape mode + /* + case print_mode::shell_escape: + print_escape(mode, root, 
subpath, name); + return; + */ + } +} + +static +void print_simple(print_mode mode, const ovl_path_t * root, const ovl_path_t * subpath, const ovl_name_t * name) +{ + // internal method, parameters are already checked + /* if ((!root) || (!name)) return; */ + + char eol = (mode == print_mode::zero) ? 0 : '\n'; + if (subpath) { + (void) fprintf(stdout, "%s/%s/%s%c", root->str, subpath->str, name->str, eol); + } else { + (void) fprintf(stdout, "%s/%s%c", root->str, name->str, eol); + } +} + +// TODO: shell-escape mode +/* +static +void print_escape(print_mode mode, const ovl_path_t * root, const ovl_path_t * subpath, const ovl_name_t * name) +{ +} +*/ diff --git a/print.hh b/print.hh new file mode 100644 index 0000000..b985684 --- /dev/null +++ b/print.hh @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#ifndef INCLUDE_LIST_MODE_HH +#define INCLUDE_LIST_MODE_HH 1 + +#include "overlay-common.hh" + +enum struct print_mode : unsigned int { + normal = 0, + zero, + + // TODO + /* shell_escape, */ + + _default = normal, +}; + +void print_path(print_mode mode, const ovl_path_t * root, const ovl_path_t * subpath, const ovl_name_t * name); + +#endif /* INCLUDE_LIST_MODE_HH */ diff --git a/sort.cc b/sort.cc new file mode 100644 index 0000000..f3c1865 --- /dev/null +++ b/sort.cc @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#include "sort.hh" + +#include + +#include "coreutils-sort.hh" + +int custom_sort(const char * a, const char * b, sort_mode mode) +{ + switch (mode) { + case sort_mode::glibc_version_sort: + return strverscmp(a, b); + case sort_mode::coreutils_version_sort: + return coreutils_version_sort(a, b); + + // not an actual version sort + + /* + case sort_mode::simple_sort: + return strcmp(a, b); + */ + + default: + return strcmp(a, b); + } +} diff --git a/sort.hh b/sort.hh new file mode 100644 index 0000000..1b7d665 --- /dev/null +++ b/sort.hh @@ -0,0 +1,26 @@ +/*
SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#ifndef INCLUDE_SORT_MODE_HH +#define INCLUDE_SORT_MODE_HH 1 + +enum struct sort_mode : unsigned int { + // don't sort entries at all + none = 0, + + // sort entries using version sort from coreutils + coreutils_version_sort, + + // sort entries using version sort from glibc + glibc_version_sort, + + // simplest sort via strcmp() + simple_sort, + + _default = coreutils_version_sort, +}; + +int custom_sort(const char * a, const char * b, sort_mode mode); + +#endif /* INCLUDE_SORT_MODE_HH */ diff --git a/xxhash.cc b/xxhash.cc new file mode 100644 index 0000000..e69f39f --- /dev/null +++ b/xxhash.cc @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#define XXH_IMPLEMENTATION + +#include "xxhash.hh" + +xxhash_t xxhash(const void * input, size_t len, xxhash_t seed) +{ +#ifndef XXH_NO_LONG_LONG + XXH64_state_t st; + XXH64_reset(&st, seed); + XXH64_update(&st, input, len); + return XXH64_digest(&st); +#else + XXH32_state_t st; + XXH32_reset(&st, seed); + XXH32_update(&st, input, len); + return XXH32_digest(&st); +#endif +} diff --git a/xxhash.hh b/xxhash.hh new file mode 100644 index 0000000..b0c6f3c --- /dev/null +++ b/xxhash.hh @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: Apache-2.0 + * (c) 2025, Konstantin Demin + */ + +#ifndef INCLUDE_XXHASH_HH +#define INCLUDE_XXHASH_HH 1 + +#include +#if LONG_MAX == INT_MAX + #define XXH_NO_LONG_LONG +#endif + +// #define XXH_NO_EXTERNC_GUARD + +#define XXH_NO_STDLIB +#define XXH_STATIC_LINKING_ONLY + +#define XXH_NO_XXH3 + +#define XXH_NO_INLINE_HINTS 1 +#define XXH_SIZE_OPT 2 + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define XXH_CPU_LITTLE_ENDIAN 1 + #endif +#endif + +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #define
XXH_CPU_LITTLE_ENDIAN 0 + #endif +#endif + +#ifdef USE_BUNDLED_XXHASH + #include "xxhash.h" +#else /* !USE_BUNDLED_XXHASH */ + #include +#endif /* USE_BUNDLED_XXHASH */ + +#ifndef XXH_NO_LONG_LONG + #define xxhash_t XXH64_hash_t +#else + #define xxhash_t XXH32_hash_t +#endif + +xxhash_t xxhash(const void * input, size_t len, xxhash_t seed = 0); + +#endif /* INCLUDE_XXHASH_HH */