Hi Frank,

On Tue, 2020-03-24 at 16:32 -0400, Frank Ch. Eigler via Elfutils-devel wrote:
> The following patch generalizes the webapi slightly, which allows
> debuggers like lldb to operate against packages/binaries with
> source files that include posix pathname noise.
> 
> commit 8ac3883f36c3c3d48bc90891aad6718a798bdd7a (HEAD -> fche/pr25548)
> Author: Frank Ch. Eigler <f...@redhat.com>
> Date:   Mon Mar 23 15:33:56 2020 -0400
> 
>     PR25548: support canonicalized source-path names in debuginfod webapi
>     
>     Programs are sometimes compiled with source path names containing
>     noise like /./ or // or /foo/../, and these survive into DWARF.  This
>     code allows either raw or canonicalized pathnames in the webapi, by
>     letting the client pass things verbatim, and letting the server
>     store/accept both raw and canonicalized path names for source files.
>     Tests included & docs updated.

This looks good. And seems very useful. Thanks.
Just a comment below about the documentation (and some silly whitespace).
    
>     Signed-off-by: Frank Ch. Eigler <f...@redhat.com>
> 
> diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
> index 2410430c2361..4d687e75bdf8 100644
> --- a/debuginfod/ChangeLog
> +++ b/debuginfod/ChangeLog
> @@ -1,3 +1,14 @@
> +2020-03-24  Frank Ch. Eigler  <f...@redhat.com>
> +
> +     * debuginfod-client.c (debuginfod_query_server): Set
> +     CURLOPT_PATH_AS_IS, to propagate file names verbatim.
> +     * debuginfod.cxx (canon_pathname): Implement RFC3986
> +     style pathname canonicalization.
> +     (handle_buildid): Canonicalize incoming webapi source
> +     paths, accept either one.
> +     (scan_source_file, archive_classify): Store both
> +     original and canonicalized dwarf-source file names.
> +
>  2020-03-22  Frank Ch. Eigler  <f...@redhat.com>
>  
>       * debuginfod-client.c (struct debuginfod_client): Add url field.
> diff --git a/debuginfod/debuginfod-client.c b/debuginfod/debuginfod-client.c
> index 58a04b9a734b..3d6f645d56f2 100644
> --- a/debuginfod/debuginfod-client.c
> +++ b/debuginfod/debuginfod-client.c
> @@ -681,6 +681,7 @@ debuginfod_query_server (debuginfod_client *c,
>        curl_easy_setopt(data[i].handle, CURLOPT_FOLLOWLOCATION, (long) 1);
>        curl_easy_setopt(data[i].handle, CURLOPT_FAILONERROR, (long) 1);
>        curl_easy_setopt(data[i].handle, CURLOPT_NOSIGNAL, (long) 1);
> +      curl_easy_setopt(data[i].handle, CURLOPT_PATH_AS_IS, (long) 1);
>        curl_easy_setopt(data[i].handle, CURLOPT_AUTOREFERER, (long) 1);
>        curl_easy_setopt(data[i].handle, CURLOPT_ACCEPT_ENCODING, "");
>        add_extra_headers(data[i].handle);
> diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
> index 7c7e85eb6d14..dc62eb088c02 100644
> --- a/debuginfod/debuginfod.cxx
> +++ b/debuginfod/debuginfod.cxx
> @@ -944,6 +944,82 @@ shell_escape(const string& str)
>  }
>  
>  
> +// PR25548: Perform POSIX / RFC3986 style path canonicalization on the input 
> string.
> +//
> +// Namely:
> +//    //         ->   /
> +//    /foo/../   ->   /
> +//    /./        ->   /
> +//
> +// This mapping is done on dwarf-side source path names, which may
> +// include these constructs, so we can deal with debuginfod clients
> +// that accidentally canonicalize the paths.
> +//
> +// realpath(3) is close but not quite right, because it also resolves
> +// symbolic links.  Symlinks at the debuginfod server have nothing to
> +// do with the build-time symlinks, thus they must not be considered.
> +//
> +// see also curl Curl_dedotdotify() aka RFC3986, which we mostly follow here
> +// see also libc __realpath()
> +// see also llvm llvm::sys::path::remove_dots()

It would be good to add (part of) this to the documentation.

> +static string
> +canon_pathname (const string& input)
> +{
> +  string i = input; // 5.2.4 (1)
> +  string o;
> +
> +  while (i.size() != 0)
> +    {
> +      // 5.2.4 (2) A
> +      if (i.substr(0,3) == "../")
> +        i = i.substr(3);
> +      else if(i.substr(0,2) == "./")
> +        i = i.substr(2);
> +
> +      // 5.2.4 (2) B
> +      else if (i.substr(0,3) == "/./")
> +        i = i.substr(2);
> +      else if (i == "/.")
> +        i = ""; // no need to handle "/." complete-path-segment case; we're 
> dealing with file names
> +
> +      // 5.2.4 (2) C
> +      else if (i.substr(0,4) == "/../") {
> +        i = i.substr(3);
> +        string::size_type sl = o.rfind("/");
> +        if (sl != string::npos)
> +          o = o.substr(0, sl);
> +        else
> +          o = "";
> +      } else if (i == "/..")
> +        i = ""; // no need to handle "/.." complete-path-segment case; we're 
> dealing with file names
> +
> +      // 5.2.4 (2) D
> +      // no need to handle these cases; we're dealing with file names
> +      else if (i == ".")
> +        i = "";
> +      else if (i == "..")
> +        i = "";
> +
> +      // POSIX special: map // to /
> +      else if (i.substr(0,2) == "//")
> +        i = i.substr(1);
> +
> +      // 5.2.4 (2) E
> +      else {
> +        string::size_type next_slash = i.find("/", (i[0]=='/' ? 1 : 0)); // 
> skip first slash
> +        o += i.substr(0, next_slash);
> +        if (next_slash == string::npos)
> +          i = "";
> +        else
> +          i = i.substr(next_slash);
> +      }
> +    }
> +
> +  return o;
> +}
> +
> +
> +
>  // A map-like class that owns a cache of file descriptors (indexed by
>  // file / content names).
>  //
> @@ -1393,12 +1469,17 @@ static struct MHD_Response* handle_buildid (const 
> string& buildid /* unsafe */,
>      }
>    else if (atype_code == "S")
>      {
> +      // PR25548
> +      // Incoming source queries may come in with either dwarf-level OR 
> canonicalized paths.
> +      // We let the query pass with either one.
> +
>        pp = new sqlite_ps (db, "mhd-query-s",
> -                          "select mtime, sourcetype, source0, source1 from " 
> BUILDIDS "_query_s where buildid = ? and artifactsrc = ? "
> +                          "select mtime, sourcetype, source0, source1 from " 
> BUILDIDS "_query_s where buildid = ? and artifactsrc in (?,?) "
>                            "order by sharedprefix(source0,source0ref) desc, 
> mtime desc");
>        pp->reset();
>        pp->bind(1, buildid);
>        pp->bind(2, suffix);
> +      pp->bind(3, canon_pathname(suffix));
>      }
>    unique_ptr<sqlite_ps> ps_closer(pp); // release pp if exception or return
>  
> @@ -2083,6 +2164,27 @@ scan_source_file (const string& rps, const stat_t& st,
>              .bind(4, sfs.st_mtime)
>              .step_ok_done();
>  
> +          // PR25548: also store canonicalized source path
> +          string dwarfsrc_canon = canon_pathname (dwarfsrc);
> +          if (dwarfsrc_canon != dwarfsrc)
> +            {
> +              if (verbose > 3)
> +                obatched(clog) << "canonicalized src=" << dwarfsrc << " 
> alias=" << dwarfsrc_canon << endl;
> +
> +              ps_upsert_files
> +                .reset()
> +                .bind(1, dwarfsrc_canon)
> +                .step_ok_done();
> +
> +              ps_upsert_s
> +                .reset()
> +                .bind(1, buildid)
> +                .bind(2, dwarfsrc_canon)
> +                .bind(3, srps)
> +                .bind(4, sfs.st_mtime)
> +                .step_ok_done();
> +            }
> +
>            inc_metric("found_sourcerefs_total","source","files");
>          }
>      }
> @@ -2244,6 +2346,26 @@ archive_classify (const string& rps, string& 
> archive_extension,
>                      .bind(2, s)
>                      .step_ok_done();
>  
> +                  // PR25548: also store canonicalized source path
> +                  const string& dwarfsrc = s;
> +                  string dwarfsrc_canon = canon_pathname (dwarfsrc);
> +                  if (dwarfsrc_canon != dwarfsrc)
> +                    {
> +                      if (verbose > 3)
> +                        obatched(clog) << "canonicalized src=" << dwarfsrc 
> << " alias=" << dwarfsrc_canon << endl;
> +
> +                      ps_upsert_files
> +                        .reset()
> +                        .bind(1, dwarfsrc_canon)
> +                        .step_ok_done();
> +
> +                      ps_upsert_sref
> +                        .reset()
> +                        .bind(1, buildid)
> +                        .bind(2, dwarfsrc_canon)
> +                        .step_ok_done();
> +                    }
> +
>                    fts_sref ++;
>                  }
>              }

Looks good.

> diff --git a/doc/ChangeLog b/doc/ChangeLog
> index 59809ea8a1e2..564644f41907 100644
> --- a/doc/ChangeLog
> +++ b/doc/ChangeLog
> @@ -1,3 +1,8 @@
> +2020-03-24  Frank Ch. Eigler  <f...@redhat.com>
> +
> +     * debuginfod-find.1, debuginfod_find_debuginfo.3: Document
> +     source path canonicalization.
> +
>  2020-03-22  Frank Ch. Eigler  <f...@redhat.com>
>  
>       * debuginfod_get_url.3: New function, documented ...
> diff --git a/doc/debuginfod-find.1 b/doc/debuginfod-find.1
> index e71ca29be96e..10c609f2c18e 100644
> --- a/doc/debuginfod-find.1
> +++ b/doc/debuginfod-find.1
> @@ -78,10 +78,9 @@ is made up of multiple CUs.  Therefore, to disambiguate, 
> debuginfod
>  expects source queries to prefix relative path names with the CU
>  compilation-directory, followed by a mandatory "/".
>  
> -Note: the user should not elide \fB../\fP or \fB/./\fP or extraneous
> -\fB///\fP sorts of path components in the directory names, because if
> -this is how those names appear in the DWARF files, that is what
> -debuginfod needs to see too.
> +Note: the caller may or may not elide \fB../\fP or \fB/./\fP or extraneous
> +\fB///\fP sorts of path components in the directory names.  debuginfod
> +accepts both forms.
>  
>  For example:
>  .TS
> diff --git a/doc/debuginfod_find_debuginfo.3 b/doc/debuginfod_find_debuginfo.3
> index f7ec6cc134ba..48846119b3aa 100644
> --- a/doc/debuginfod_find_debuginfo.3
> +++ b/doc/debuginfod_find_debuginfo.3
> @@ -87,10 +87,9 @@ paths, and yet an executable is made up of multiple CUs. 
> Therefore, to
>  disambiguate, debuginfod expects source queries to prefix relative path
>  names with the CU compilation-directory, followed by a mandatory "/".
>  
> -Note: the caller should not elide \fB../\fP or \fB/./\fP or extraneous
> -\fB///\fP sorts of path components in the directory names, because if
> -this is how those names appear in the DWARF files, that is what
> -debuginfod needs to see too.
> +Note: the caller may or may not elide \fB../\fP or \fB/./\fP or extraneous
> +\fB///\fP sorts of path components in the directory names.  debuginfod
> +accepts both forms.
>  
>  If \fIpath\fP is not NULL and the query is successful, \fIpath\fP is set
>  to the path of the file in the cache. The caller must \fBfree\fP() this 
> value.

Could you add something that describes the exact form of
canonicalization done? Maybe just:

"Canonicalization is done according to RFC3986 section 5.2.4 (Remove
Dot Segments), plus reducing any '//' to '/' in the path."

> diff --git a/tests/ChangeLog b/tests/ChangeLog
> index cefbceb4f3a8..47b40f1c18a9 100644
> --- a/tests/ChangeLog
> +++ b/tests/ChangeLog
> @@ -1,3 +1,9 @@
> +2020-03-24  Frank Ch. Eigler  <f...@redhat.com>
> +
> +     * debuginfod-rpms/hello3.spec., /fedora31/*: New files with
> +     uncanonicalized source paths.
> +     * run-debuginfod-find.sh: Test them.
> +
>  2020-03-22  Frank Ch. Eigler  <f...@redhat.com>
>  
>       * run-debuginfod-find.sh: Look for URL in default progressfn
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm 
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm
> new file mode 100644
> index 000000000000..d0b3454083d8
> Binary files /dev/null and 
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm differ
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm 
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..8b2fe9bbd9e7
> Binary files /dev/null and 
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm differ
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm 
> b/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..ee479ecb4909
> Binary files /dev/null and 
> b/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm differ
> diff --git 
> a/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm 
> b/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..890478e429ab
> Binary files /dev/null and 
> b/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm differ
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm 
> b/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..73fd939d1597
> Binary files /dev/null and 
> b/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm differ
> diff --git 
> a/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm 
> b/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..0cc24073aa3e

When using git format-patch instead of git show you'll get something
that people can apply and that shows how big the files are. I assume
they are small? The existing ones are between the 4K and 12K.

> Binary files /dev/null and 
> b/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm differ
> diff --git a/tests/debuginfod-rpms/hello3.spec. 
> b/tests/debuginfod-rpms/hello3.spec.
> new file mode 100644
> index 000000000000..ffb95134e2b6
> --- /dev/null
> +++ b/tests/debuginfod-rpms/hello3.spec.
> @@ -0,0 +1,60 @@
> +Summary: hello3 -- double hello, world rpm
> +Name: hello3
> +Version: 1.0
> +Release: 2
> +Group: Utilities
> +License: GPL
> +Distribution: RPM ^W Elfutils test suite.
> +Vendor: Red Hat Software
> +Packager: Red Hat Software <b...@redhat.com>
> +URL: http://www.redhat.com
> +BuildRequires: gcc make
> +Source0: hello-1.0.tar.gz
> +
> +%description
> +Simple rpm demonstration with an eye to consumption by debuginfod.
> +
> +%package two
> +Summary: hello3two
> +License: GPL
> +
> +%description two
> +Dittoish.
> +
> +%prep
> +%setup -q -n hello-1.0
> +
> +%build
> +mkdir foobar
> +gcc -g -O1 foobar///./../hello.c -o hello
> +gcc -g -O2 -D_FORTIFY_SOURCE=2 foobar///./../hello.c -o hello3
> +
> +%install
> +rm -rf $RPM_BUILD_ROOT
> +mkdir -p $RPM_BUILD_ROOT/usr/local/bin
> +cp hello $RPM_BUILD_ROOT/usr/local/bin/
> +cp hello3 $RPM_BUILD_ROOT/usr/local/bin/
> +
> +%clean
> +rm -rf $RPM_BUILD_ROOT
> +
> +%files 
> +%defattr(-,root,root)
> +%attr(0751,root,root)   /usr/local/bin/hello
> +
> +%files two
> +%defattr(-,root,root)
> +%attr(0751,root,root)   /usr/local/bin/hello3
> +
> +%changelog
> +* Tue Mar 24 2020 Frank Ch. Eigler <f...@redhat.com>
> +- New variant of hello2, with crazy source file paths
> +
> +* Thu Nov 14 2019 Frank Ch. Eigler <f...@redhat.com>
> +- Dropped misc files not relevant to debuginfod testing.
> +
> +* Wed May 18 2016 Mark Wielaard <m...@redhat.com>
> +- Add hello2 for dwz testing support.
> +
> +* Tue Oct 20 1998 Jeff Johnson <j...@redhat.com>
> +- create.
> diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh
> index b64130282d86..e0a9beb9c11c 100755
> --- a/tests/run-debuginfod-find.sh
> +++ b/tests/run-debuginfod-find.sh
> @@ -23,8 +23,8 @@ type rpm2cpio 2>/dev/null || (echo "need rpm2cpio"; exit 77)
>  type bzcat 2>/dev/null || (echo "need bzcat"; exit 77)
>  
>  # for test case debugging, uncomment:
> -# set -x
> -# VERBOSE=-vvvv
> +#set -x
> +#VERBOSE=-vvvv

whitespace change?

>  DB=${PWD}/.debuginfod_tmp.sqlite
>  tempfiles $DB
> @@ -40,7 +40,7 @@ cleanup()
>    if [ $PID2 -ne 0 ]; then kill $PID2; wait $PID2; fi
>    if [ $PID3 -ne 0 ]; then kill $PID3; wait $PID3; fi
>  
> -  rm -rf F R D L Z ${PWD}/mocktree ${PWD}/.client_cache* ${PWD}/tmp*
> +  rm -rf F R D L Z ${PWD}/foobar ${PWD}/mocktree ${PWD}/.client_cache* 
> ${PWD}/tmp*
>    exit_cleanup
>  }
>  
> @@ -112,7 +112,9 @@ export DEBUGINFOD_TIMEOUT=10
>  # cannot find it without debuginfod.
>  echo "int main() { return 0; }" > ${PWD}/prog.c
>  tempfiles prog.c
> -gcc -Wl,--build-id -g -o prog ${PWD}/prog.c
> +# Create a subdirectory to confound source path names
> +mkdir foobar
> +gcc -Wl,--build-id -g -o prog ${PWD}/foobar///./../prog.c
>  testrun ${abs_top_builddir}/src/strip -g -f prog.debug ${PWD}/prog
>  BUILDID=`env LD_LIBRARY_PATH=$ldpath ${abs_builddir}/../src/readelf \
>            -a prog | grep 'Build ID' | cut -d ' ' -f 7`
> @@ -142,9 +144,15 @@ cmp $filename F/prog.debug
>  filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable 
> $BUILDID`
>  cmp $filename F/prog
>  
> +# raw source filename
> +filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find source 
> $BUILDID ${PWD}/foobar///./../prog.c`
> +cmp $filename  ${PWD}/prog.c
> +
> +# and also the canonicalized one
>  filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find source 
> $BUILDID ${PWD}/prog.c`
>  cmp $filename  ${PWD}/prog.c
>  
> +
>  ########################################################################

And another?

>  # Test whether the cache default locations are correct
> @@ -291,6 +299,9 @@ archive_test() {
>  
>  # common source file sha1
>  SHA=f4a1a8062be998ae93b8f1cd744a398c6de6dbb1
> +# fedora31
> +archive_test 420e9e3308971f4b817cc5bf83928b41a6909d88 
> /usr/src/debug/hello3-1.0-2.x86_64/foobar////./../hello.c $SHA
> +archive_test 87c08d12c78174f1082b7c888b3238219b0eb265 
> /usr/src/debug/hello3-1.0-2.x86_64///foobar/./..//hello.c $SHA
>  # fedora30
>  archive_test c36708a78618d597dee15d0dc989f093ca5f9120 
> /usr/src/debug/hello2-1.0-2.x86_64/hello.c $SHA
>  archive_test 41a236eb667c362a1c4196018cc4581e09722b1b 
> /usr/src/debug/hello2-1.0-2.x86_64/hello.c $SHA

Nice testcases.

Thanks,

Mark

Reply via email to