Thank you for your comments. I'm currently writing a v2 of this patch with what I've learned working on the netlink interface. I will fix it accordingly all your comments.
[ Now, I can boot a PPC64 container with systemd on an x86_64 host: https://asciinema.org/a/7sgwvok72m34v7hg08hfspg4s ] Laurent Le 14/07/2015 22:50, Peter Maydell a écrit : > On 29 May 2015 at 23:50, Laurent Vivier <laur...@vivier.eu> wrote: >> This patch introduces a system very similar to the one used in the kernel >> to attach specific functions to a given file descriptor. >> >> In this case, we attach a specific "read()" to the fd returned by >> signalfd() to be able to byte-swap the signalfd_siginfo structure >> provided by read(). >> >> This system could be also used to introduce netlink sockets. > > Thanks for this patch; apologies for the delayed review. > >> This patch allows to execute the example program given by >> man signalfd(2): >> >> do { perror(msg); exit(EXIT_FAILURE); } while (0) > > Your commit message seems to have lost all the lines beginning > '#' in this example program. I blame git :-) > >> int >> main(int argc, char *argv[]) >> { >> sigset_t mask; >> int sfd; >> struct signalfd_siginfo fdsi; >> ssize_t s; >> >> sigemptyset(&mask); >> sigaddset(&mask, SIGINT); >> sigaddset(&mask, SIGQUIT); >> >> /* Block signals so that they aren't handled >> according to their default dispositions */ >> >> if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) >> handle_error("sigprocmask"); >> >> sfd = signalfd(-1, &mask, 0); >> if (sfd == -1) >> handle_error("signalfd"); >> >> for (;;) { >> s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo)); >> if (s != sizeof(struct signalfd_siginfo)) >> handle_error("read"); >> >> if (fdsi.ssi_signo == SIGINT) { >> printf("Got SIGINT\n"); >> } else if (fdsi.ssi_signo == SIGQUIT) { >> printf("Got SIGQUIT\n"); >> exit(EXIT_SUCCESS); >> } else { >> printf("Read unexpected signal\n"); >> } >> } >> } >> >> $ ./signalfd_demo >> ^CGot SIGINT >> ^CGot SIGINT >> ^\Got SIGQUIT >> >> Signed-off-by: Laurent Vivier <laur...@vivier.eu> >> --- >> linux-user/syscall.c | 117 >> ++++++++++++++++++++++++++++++++++++++++++++++++++- >> 1 file changed, 116 insertions(+), 1 deletion(-) >> >> diff --git a/linux-user/syscall.c b/linux-user/syscall.c >> index 1622ad6..c72c440 100644 >> --- a/linux-user/syscall.c >> +++ b/linux-user/syscall.c >> @@ -60,6 +60,7 @@ int __clone2(int (*fn)(void *), void *child_stack_base, >> #include <sys/statfs.h> >> #include <utime.h> >> #include <sys/sysinfo.h> >> +#include <sys/signalfd.h> >> //#include <sys/user.h> >> #include <netinet/ip.h> >> #include <netinet/tcp.h> >> @@ -294,6 +295,15 @@ static bitmask_transtbl fcntl_flags_tbl[] = { >> { 0, 0, 0, 0 } >> }; >> >> +struct target_fd_ops { >> + abi_long (*read) (int, void *, size_t); > > Maybe could use a brief comment defining the API here (for instance > that you need to return -target_errno isn't necessarily obvious). > >> +}; >> + >> +typedef struct target_fd_ops target_fd_ops_t; > > QEMU style generally prefers CamelCase for struct and > struct typedef names. > >> + >> +static int target_fd_max; >> +static target_fd_ops_t **target_fd_ops; >> + >> static int sys_getcwd1(char *buf, size_t size) >> { >> if (getcwd(buf, size) == NULL) { >> @@ -4878,6 +4888,7 @@ void syscall_init(void) >> const argtype *arg_type; >> int size; >> int i; >> + struct rlimit rlim; >> >> #define STRUCT(name, ...) thunk_register_struct(STRUCT_ ## name, #name, >> struct_ ## name ## _def); >> #define STRUCT_SPECIAL(name) thunk_register_struct_direct(STRUCT_ ## name, >> #name, &struct_ ## name ## _def); >> @@ -4885,6 +4896,14 @@ void syscall_init(void) >> #undef STRUCT >> #undef STRUCT_SPECIAL >> >> + /* allocate an array to store fd ops */ >> + >> + if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) { >> + target_fd_max = rlim.rlim_cur; >> + target_fd_ops = g_malloc0(target_fd_max * sizeof(target_fd_ops_t >> *)); >> + } > > ...and what if the guest raises the rlimit after we've started? > I think it would be better to just dynamically allocate/reallocate > the table when the guest does a signalfd syscall, rather than > trying to allocate it once at startup. > >> + >> + >> /* Build target_to_host_errno_table[] table from >> * host_to_target_errno_table[]. */ >> for (i = 0; i < ERRNO_TABLE_SIZE; i++) { >> @@ -5512,6 +5531,51 @@ static target_timer_t get_timer_id(abi_long arg) >> return timerid; >> } >> >> +/* signalfd siginfo conversion */ >> + >> +static void >> +host_to_target_signalfd_siginfo(struct signalfd_siginfo *tinfo, >> + const struct signalfd_siginfo *info) >> +{ >> + int sig = host_to_target_signal(info->ssi_signo); >> + tinfo->ssi_signo = tswap32(sig); >> + tinfo->ssi_errno = 0; /* unused */ > > (a) Why not swap it anyway? > (b) the kernel seems to write something to this field: > http://lxr.free-electrons.com/source/fs/signalfd.c#L97 > >> + tinfo->ssi_code = tswap32(info->ssi_code); >> + tinfo->ssi_pid = tswap32(info->ssi_pid); >> + tinfo->ssi_uid = tswap32(info->ssi_uid); >> + tinfo->ssi_fd = tswap32(info->ssi_fd); >> + tinfo->ssi_tid = tswap32(info->ssi_tid); >> + tinfo->ssi_band = tswap32(info->ssi_band); >> + tinfo->ssi_overrun = tswap32(info->ssi_overrun); >> + tinfo->ssi_trapno = tswap32(info->ssi_trapno); >> + tinfo->ssi_status = tswap32(info->ssi_status); >> + tinfo->ssi_int = tswap32(info->ssi_int); >> + tinfo->ssi_ptr = tswap64(info->ssi_ptr); >> + tinfo->ssi_utime = tswap64(info->ssi_utime); >> + tinfo->ssi_stime = tswap64(info->ssi_stime); >> + tinfo->ssi_addr = tswap64(info->ssi_addr); > > Missing > tinfo->ssi_addr_lsb = tswap16(info->ssi_addr_lsb); > >> +} >> + >> +static abi_long target_signalfd_read(int fd, void *buf, size_t count) >> +{ >> + int ret; >> + int i; >> + >> + ret = get_errno(read(fd, buf, count)); > > The return value from read might not fit into an int... > >> + if (ret < 0) { >> + return ret; >> + } >> + >> + for (i = 0; i < ret; i += sizeof(struct signalfd_siginfo)) { >> + host_to_target_signalfd_siginfo(buf + i, buf + i); > > This is implicitly assuming that none of the fields in > the target and host arrangements of signalfd_siginfo > overlap, so that all we're doing is swapping each one > in place. That's true, but it's a bit subtle... > >> + } >> + return ret; >> +} >> + >> +static target_fd_ops_t target_signalfd_ops = { >> + .read = target_signalfd_read, >> +}; >> + >> /* do_syscall() should always have a single exit point at the end so >> that actions, such as logging of syscall results, can be performed. >> All errnos that do_syscall() returns must be -TARGET_<errcode>. */ >> @@ -5571,7 +5635,13 @@ abi_long do_syscall(void *cpu_env, int num, abi_long >> arg1, >> else { >> if (!(p = lock_user(VERIFY_WRITE, arg2, arg3, 0))) >> goto efault; >> - ret = get_errno(read(arg1, p, arg3)); >> + if (arg1 < target_fd_max && >> + target_fd_ops[arg1] && >> + target_fd_ops[arg1]->read) { >> + ret = target_fd_ops[arg1]->read(arg1, p, arg3); >> + } else { >> + ret = get_errno(read(arg1, p, arg3)); >> + } > > This won't do the right thing in the case of: > emulated-under-QEMU program A does a signalfd() with CLOEXEC clear > program A exec()s program B, also emulated under QEMU > program B reads its inherited signalfd descriptor > > That's a bit of an edge case, of course, and practically impossible > to get right. It is mentioned specifically as working in the > signalfd manpage, though; maybe worth a comment somewhere that we > don't do the right thing. > >> unlock_user(p, arg2, ret); >> } >> break; >> @@ -5598,6 +5668,10 @@ abi_long do_syscall(void *cpu_env, int num, abi_long >> arg1, >> unlock_user(p, arg2, 0); >> break; >> case TARGET_NR_close: >> + if (arg1 < target_fd_max && >> + target_fd_ops[arg1]) { >> + target_fd_ops[arg1] = NULL; >> + } > > This isn't the only place an fd can get closed; for instance > dup2() can implicitly close an fd. I think you need a wider set > of changes to either (a) NULL out the entries in all the places > where we can close an fd or (b) NULL out the entries in all the > places where we can open an fd. > >> ret = get_errno(close(arg1)); >> break; >> case TARGET_NR_brk: >> @@ -9451,6 +9525,26 @@ abi_long do_syscall(void *cpu_env, int num, abi_long >> arg1, >> break; >> #endif >> #endif >> +#if defined(TARGET_NR_signalfd4) >> + case TARGET_NR_signalfd4: >> + { >> + target_sigset_t *target_mask; >> + sigset_t mask; >> + if (!lock_user_struct(VERIFY_READ, target_mask, arg2, 1)) { >> + goto efault; >> + } >> + >> + target_to_host_sigset(&mask, target_mask); >> + >> + ret = get_errno(signalfd(arg1, &mask, arg4)); > > The signalfd flags are SFD_NONBLOCK and SFD_CLOEXEC, which the > kernel defines (and asserts) to be equal to O_NONBLOCK and > O_CLOEXEC. Those are not the same for all architectures, so we > need to convert the flags argument here. > >> + if (ret >= 0) { >> + target_fd_ops[ret] = &target_signalfd_ops; > > We checked the fd against target_fds_max earlier, but we don't here? > (Moot if you take my suggestion of auto-reallocating the fd_ops > array here.) > >> + } >> + >> + unlock_user_struct(target_mask, arg2, 0); > > You might as well do the unlock as soon as you've done > the target_to_host_sigset() conversion. > >> + } >> + break; >> +#endif >> #if defined(CONFIG_EPOLL) >> #if defined(TARGET_NR_epoll_create) >> case TARGET_NR_epoll_create: >> @@ -9743,6 +9837,27 @@ abi_long do_syscall(void *cpu_env, int num, abi_long >> arg1, >> } >> #endif >> >> +#if defined(TARGET_NR_signalfd) > > Can we put this next to the signalfd4 code rather than separated by > other cases? > >> + case TARGET_NR_signalfd: >> + { >> + target_sigset_t *target_mask; >> + sigset_t mask; >> + if (!lock_user_struct(VERIFY_READ, target_mask, arg2, 1)) { >> + goto efault; >> + } >> + >> + target_to_host_sigset(&mask, target_mask); >> + >> + ret = get_errno(signalfd(arg1, &mask, arg3)); > > The syscall signalfd() has the arguments (fd, mask, sizemask), > whereas the libc function takes (fd, mask, flags), so passing arg3 > here looks wrong. Should be const 0 ? > >> + if (ret >= 0) { >> + target_fd_ops[ret] = &target_signalfd_ops; >> + } >> + >> + unlock_user_struct(target_mask, arg2, 0); > > This is very similar to the signalfd4 implementation, > unsurprisingly. I would suggest factoring it out into a > do_signalfd4() function to be called from both places. > >> + } >> + break; >> +#endif >> + >> #if defined(TARGET_NR_timerfd_create) && defined(CONFIG_TIMERFD) >> case TARGET_NR_timerfd_create: >> ret = get_errno(timerfd_create(arg1, >> -- >> 2.4.1 > > thanks > -- PMM >