The branch main has been updated by dchagin:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=3d2fec7db856c67e1a94a87a846d8ffe6f48b61f

commit 3d2fec7db856c67e1a94a87a846d8ffe6f48b61f
Author:     Dmitry Chagin <[email protected]>
AuthorDate: 2023-05-29 08:15:28 +0000
Commit:     Dmitry Chagin <[email protected]>
CommitDate: 2023-05-29 08:15:28 +0000

    namei: Add the ability for the ABI to specify an alternate root path
    
    For now a non-native ABI (i.e., Linux) uses the kern_alternate_path()
    facility to dynamically reroot lookups. First, an attempt is made to
    look up the file in /compat/linux/original-path. If that fails, the
    lookup is done in /original-path. That requires a bit of code in
    every ABI syscall implementation where path name translation is needed.
    Also, our kern_alternate_path() does not properly look up absolute
    symlinks in the second attempt, i.e., it does not append the
    /compat/linux part to the resolved link.
    The change is intended to avoid this by specifying the ABI root directory
    for namei(), using one call to pwd_altroot() during exec-time into the ABI.
    In that case namei() will dynamically reroot lookups as mentioned above.
    
    PR:                     72920
    Reviewed by:            kib
    Differential revision:  https://reviews.freebsd.org/D38933
    MFC after:              2 months
---
 sys/kern/kern_descrip.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++--
 sys/kern/vfs_cache.c    |  4 ++--
 sys/kern/vfs_lookup.c   | 30 +++++++++++++++++++++---
 sys/sys/filedesc.h      |  4 ++++
 sys/sys/namei.h         | 11 +++++++--
 5 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 0be59e930dd4..908c3352514b 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -3839,6 +3839,11 @@ pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
                vrefact(oldpwd->pwd_jdir);
                newpwd->pwd_jdir = oldpwd->pwd_jdir;
        }
+
+       if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) {
+               vrefact(oldpwd->pwd_adir);
+               newpwd->pwd_adir = oldpwd->pwd_adir;
+       }
 }
 
 struct pwd *
@@ -3930,6 +3935,8 @@ pwd_drop(struct pwd *pwd)
                vrele(pwd->pwd_rdir);
        if (pwd->pwd_jdir != NULL)
                vrele(pwd->pwd_jdir);
+       if (pwd->pwd_adir != NULL)
+               vrele(pwd->pwd_adir);
        uma_zfree_smr(pwd_zone, pwd);
 }
 
@@ -3967,6 +3974,8 @@ pwd_chroot(struct thread *td, struct vnode *vp)
 
        vrefact(vp);
        newpwd->pwd_rdir = vp;
+       vrefact(vp);
+       newpwd->pwd_adir = vp;
        if (oldpwd->pwd_jdir == NULL) {
                vrefact(vp);
                newpwd->pwd_jdir = vp;
@@ -3997,6 +4006,40 @@ pwd_chdir(struct thread *td, struct vnode *vp)
        pwd_drop(oldpwd);
 }
 
+/*
+ * Process is transitioning to/from a non-native ABI.
+ */
+void
+pwd_altroot(struct thread *td, struct vnode *altroot_vp)
+{
+       struct pwddesc *pdp;
+       struct pwd *newpwd, *oldpwd;
+
+       newpwd = pwd_alloc();
+       pdp = td->td_proc->p_pd;
+       PWDDESC_XLOCK(pdp);
+       oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
+       if (altroot_vp != NULL) {
+               /*
+                * Native process to a non-native ABI.
+                */
+
+               vrefact(altroot_vp);
+               newpwd->pwd_adir = altroot_vp;
+       } else {
+               /*
+                * Non-native process to the native ABI.
+                */
+
+               vrefact(oldpwd->pwd_rdir);
+               newpwd->pwd_adir = oldpwd->pwd_rdir;
+       }
+       pwd_fill(oldpwd, newpwd);
+       pwd_set(pdp, newpwd);
+       PWDDESC_XUNLOCK(pdp);
+       pwd_drop(oldpwd);
+}
+
 /*
  * jail_attach(2) changes both root and working directories.
  */
@@ -4030,6 +4073,8 @@ pwd_chroot_chdir(struct thread *td, struct vnode *vp)
                vrefact(vp);
                newpwd->pwd_jdir = vp;
        }
+       vrefact(vp);
+       newpwd->pwd_adir = vp;
        pwd_fill(oldpwd, newpwd);
        pwd_set(pdp, newpwd);
        PWDDESC_XUNLOCK(pdp);
@@ -4046,7 +4091,8 @@ pwd_ensure_dirs(void)
        pdp = curproc->p_pd;
        PWDDESC_XLOCK(pdp);
        oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
-       if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
+       if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL &&
+           oldpwd->pwd_adir != NULL) {
                PWDDESC_XUNLOCK(pdp);
                return;
        }
@@ -4064,6 +4110,10 @@ pwd_ensure_dirs(void)
                vrefact(rootvnode);
                newpwd->pwd_rdir = rootvnode;
        }
+       if (newpwd->pwd_adir == NULL) {
+               vrefact(rootvnode);
+               newpwd->pwd_adir = rootvnode;
+       }
        pwd_set(pdp, newpwd);
        PWDDESC_XUNLOCK(pdp);
        pwd_drop(oldpwd);
@@ -4084,6 +4134,8 @@ pwd_set_rootvnode(void)
        newpwd->pwd_cdir = rootvnode;
        vrefact(rootvnode);
        newpwd->pwd_rdir = rootvnode;
+       vrefact(rootvnode);
+       newpwd->pwd_adir = rootvnode;
        pwd_fill(oldpwd, newpwd);
        pwd_set(pdp, newpwd);
        PWDDESC_XUNLOCK(pdp);
@@ -4119,7 +4171,8 @@ mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
                if (oldpwd == NULL ||
                    (oldpwd->pwd_cdir != olddp &&
                    oldpwd->pwd_rdir != olddp &&
-                   oldpwd->pwd_jdir != olddp)) {
+                   oldpwd->pwd_jdir != olddp &&
+                   oldpwd->pwd_adir != olddp)) {
                        PWDDESC_XUNLOCK(pdp);
                        pddrop(pdp);
                        continue;
@@ -4136,6 +4189,10 @@ mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
                        vrefact(newdp);
                        newpwd->pwd_jdir = newdp;
                }
+               if (oldpwd->pwd_adir == olddp) {
+                       vrefact(newdp);
+                       newpwd->pwd_adir = newdp;
+               }
                pwd_fill(oldpwd, newpwd);
                pwd_set(pdp, newpwd);
                PWDDESC_XUNLOCK(pdp);
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index e4977392349f..8daaf5bc53ad 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -4349,7 +4349,7 @@ cache_fpl_terminated(struct cache_fpl *fpl)
        (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT 
| \
         FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | WILLBEDIR | \
         ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | 
OPENREAD | \
-        OPENWRITE | WANTIOCTLCAPS)
+        OPENWRITE | WANTIOCTLCAPS | ISRESTARTED)
 
 #define CACHE_FPL_INTERNAL_CN_FLAGS \
        (ISDOTDOT | MAKEENTRY | ISLASTCN)
@@ -6238,7 +6238,7 @@ cache_fplookup(struct nameidata *ndp, enum 
cache_fpl_status *status,
        fpl.pwd = pwdp;
        pwd = pwd_get_smr();
        *(fpl.pwd) = pwd;
-       ndp->ni_rootdir = pwd->pwd_rdir;
+       namei_setup_rootdir(ndp, cnp, pwd);
        ndp->ni_topdir = pwd->pwd_jdir;
 
        if (cnp->cn_pnbuf[0] == '/') {
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index a75ea4ca16d6..593e1e487c6f 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -81,6 +81,13 @@ static void NDVALIDATE_impl(struct nameidata *, int);
 #define NDVALIDATE(ndp)
 #endif
 
+#define        NDRESTART(ndp) do {                                             
\
+       NDREINIT_DBG(ndp);                                              \
+       ndp->ni_resflags = 0;                                           \
+       ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS;                  \
+       ndp->ni_cnd.cn_flags |= ISRESTARTED;                            \
+} while (0)
+
 SDT_PROVIDER_DEFINE(vfs);
 SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
     "unsigned long", "bool");
@@ -334,7 +341,7 @@ namei_setup(struct nameidata *ndp, struct vnode **dpp, 
struct pwd **pwdp)
         * The reference on ni_rootdir is acquired in the block below to avoid
         * back-to-back atomics for absolute lookups.
         */
-       ndp->ni_rootdir = pwd->pwd_rdir;
+       namei_setup_rootdir(ndp, cnp, pwd);
        ndp->ni_topdir = pwd->pwd_jdir;
 
        if (cnp->cn_pnbuf[0] == '/') {
@@ -594,6 +601,7 @@ namei(struct nameidata *ndp)
        MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
            ndp->ni_startdir->v_type == VBAD);
 
+restart:
        ndp->ni_lcf = 0;
        ndp->ni_loopcnt = 0;
        ndp->ni_vp = NULL;
@@ -628,6 +636,12 @@ namei(struct nameidata *ndp)
        case CACHE_FPL_STATUS_HANDLED:
                if (error == 0)
                        NDVALIDATE(ndp);
+               else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
+                   (cnp->cn_flags & ISRESTARTED) == 0)) {
+                       namei_cleanup_cnp(cnp);
+                       NDRESTART(ndp);
+                       goto restart;
+               }
                return (error);
        case CACHE_FPL_STATUS_PARTIAL:
                TAILQ_INIT(&ndp->ni_cap_tracker);
@@ -668,8 +682,18 @@ namei(struct nameidata *ndp)
        for (;;) {
                ndp->ni_startdir = dp;
                error = vfs_lookup(ndp);
-               if (error != 0)
-                       goto out;
+               if (error != 0) {
+                       if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
+                           error == ENOENT &&
+                           (cnp->cn_flags & ISRESTARTED) == 0)) {
+                               nameicap_cleanup(ndp);
+                               pwd_drop(pwd);
+                               namei_cleanup_cnp(cnp);
+                               NDRESTART(ndp);
+                               goto restart;
+                       } else
+                               goto out;
+               }
 
                /*
                 * If not a symbolic link, we're done.
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index 578b84696663..bba12d08287c 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -89,6 +89,8 @@ struct fdescenttbl {
 /*
  * This struct is copy-on-write and allocated from an SMR zone.
  * All fields are constant after initialization apart from the reference count.
+ * The ABI root directory is initialized as the root directory and changed
+ * during process transiting to or from non-native ABI.
  *
  * Check pwd_* routines for usage.
  */
@@ -97,6 +99,7 @@ struct pwd {
        struct  vnode   *pwd_cdir;      /* current directory */
        struct  vnode   *pwd_rdir;      /* root directory */
        struct  vnode   *pwd_jdir;      /* jail root directory */
+       struct  vnode   *pwd_adir;      /* abi root directory */
 };
 typedef SMR_POINTER(struct pwd *) smrpwd_t;
 
@@ -342,6 +345,7 @@ struct pwddesc *pdinit(struct pwddesc *pdp, bool keeplock);
 struct pwddesc *pdshare(struct pwddesc *pdp);
 void   pdunshare(struct thread *td);
 
+void   pwd_altroot(struct thread *td, struct vnode *altroot_vp);
 void   pwd_chdir(struct thread *td, struct vnode *vp);
 int    pwd_chroot(struct thread *td, struct vnode *vp);
 int    pwd_chroot_chdir(struct thread *td, struct vnode *vp);
diff --git a/sys/sys/namei.h b/sys/sys/namei.h
index e12d79b19c6e..88ddb0f13458 100644
--- a/sys/sys/namei.h
+++ b/sys/sys/namei.h
@@ -159,7 +159,7 @@ int cache_fplookup(struct nameidata *ndp, enum 
cache_fpl_status *status,
  * Namei parameter descriptors.
  */
 #define        RDONLY          0x00000200 /* lookup with read-only semantics */
-/* UNUSED              0x00000400 */
+#define        ISRESTARTED     0x00000400 /* restarted namei */
 /* UNUSED              0x00000800 */
 #define        ISWHITEOUT      0x00001000 /* found whiteout */
 #define        DOWHITEOUT      0x00002000 /* do whiteouts */
@@ -187,7 +187,7 @@ int cache_fplookup(struct nameidata *ndp, enum 
cache_fpl_status *status,
  */
 #define NAMEI_INTERNAL_FLAGS   \
        (NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | \
-        TRAILINGSLASH)
+        TRAILINGSLASH | ISRESTARTED)
 
 /*
  * Namei results flags
@@ -293,6 +293,13 @@ int        namei(struct nameidata *ndp);
 int    vfs_lookup(struct nameidata *ndp);
 int    vfs_relookup(struct vnode *dvp, struct vnode **vpp,
            struct componentname *cnp, bool refstart);
+
+#define namei_setup_rootdir(ndp, cnp, pwd) do {                                
        \
+       if (__predict_true((cnp->cn_flags & ISRESTARTED) == 0))                 
\
+               ndp->ni_rootdir = pwd->pwd_adir;                                
\
+       else                                                                    
\
+               ndp->ni_rootdir = pwd->pwd_rdir;                                
\
+} while (0)
 #endif
 
 /*

Reply via email to