Hi Frank, On Tue, 2020-03-24 at 16:32 -0400, Frank Ch. Eigler via Elfutils-devel wrote: > The following patch generalizes the webapi slightly, which allows > debuggers like lldb to operate against packages/binaries with > source files that include posix pathname noise. > > commit 8ac3883f36c3c3d48bc90891aad6718a798bdd7a (HEAD -> fche/pr25548) > Author: Frank Ch. Eigler <f...@redhat.com> > Date: Mon Mar 23 15:33:56 2020 -0400 > > PR25548: support canonicalized source-path names in debuginfod webapi > > Programs are sometimes compiled with source path names containing > noise like /./ or // or /foo/../, and these survive into DWARF. This > code allows either raw or canonicalized pathnames in the webapi, by > letting the client pass things verbatim, and letting the server > store/accept both raw and canonicalized path names for source files. > Tests included & docs updated.
This looks good. And seems very useful. Thanks. Just a comment below about the documentation (and some silly whitespace). > Signed-off-by: Frank Ch. Eigler <f...@redhat.com> > > diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog > index 2410430c2361..4d687e75bdf8 100644 > --- a/debuginfod/ChangeLog > +++ b/debuginfod/ChangeLog > @@ -1,3 +1,14 @@ > +2020-03-24 Frank Ch. Eigler <f...@redhat.com> > + > + * debuginfod-client.c (debuginfod_query_server): Set > + CURLOPT_PATH_AS_IS, to propagate file names verbatim. > + * debuginfod.cxx (canon_pathname): Implement RFC3986 > + style pathname canonicalization. > + (handle_buildid): Canonicalize incoming webapi source > + paths, accept either one. > + (scan_source_file, archive_classify): Store both > + original and canonicalized dwarf-source file names. > + > 2020-03-22 Frank Ch. Eigler <f...@redhat.com> > > * debuginfod-client.c (struct debuginfod_client): Add url field. > diff --git a/debuginfod/debuginfod-client.c b/debuginfod/debuginfod-client.c > index 58a04b9a734b..3d6f645d56f2 100644 > --- a/debuginfod/debuginfod-client.c > +++ b/debuginfod/debuginfod-client.c > @@ -681,6 +681,7 @@ debuginfod_query_server (debuginfod_client *c, > curl_easy_setopt(data[i].handle, CURLOPT_FOLLOWLOCATION, (long) 1); > curl_easy_setopt(data[i].handle, CURLOPT_FAILONERROR, (long) 1); > curl_easy_setopt(data[i].handle, CURLOPT_NOSIGNAL, (long) 1); > + curl_easy_setopt(data[i].handle, CURLOPT_PATH_AS_IS, (long) 1); > curl_easy_setopt(data[i].handle, CURLOPT_AUTOREFERER, (long) 1); > curl_easy_setopt(data[i].handle, CURLOPT_ACCEPT_ENCODING, ""); > add_extra_headers(data[i].handle); > diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx > index 7c7e85eb6d14..dc62eb088c02 100644 > --- a/debuginfod/debuginfod.cxx > +++ b/debuginfod/debuginfod.cxx > @@ -944,6 +944,82 @@ shell_escape(const string& str) > } > > > +// PR25548: Perform POSIX / RFC3986 style path canonicalization on the input > string. > +// > +// Namely: > +// // -> / > +// /foo/../ -> / > +// /./ -> / > +// > +// This mapping is done on dwarf-side source path names, which may > +// include these constructs, so we can deal with debuginfod clients > +// that accidentally canonicalize the paths. > +// > +// realpath(3) is close but not quite right, because it also resolves > +// symbolic links. Symlinks at the debuginfod server have nothing to > +// do with the build-time symlinks, thus they must not be considered. > +// > +// see also curl Curl_dedotdotify() aka RFC3986, which we mostly follow here > +// see also libc __realpath() > +// see also llvm llvm::sys::path::remove_dots() It would be good to add (part of) this to the documentation. > +static string > +canon_pathname (const string& input) > +{ > + string i = input; // 5.2.4 (1) > + string o; > + > + while (i.size() != 0) > + { > + // 5.2.4 (2) A > + if (i.substr(0,3) == "../") > + i = i.substr(3); > + else if(i.substr(0,2) == "./") > + i = i.substr(2); > + > + // 5.2.4 (2) B > + else if (i.substr(0,3) == "/./") > + i = i.substr(2); > + else if (i == "/.") > + i = ""; // no need to handle "/." complete-path-segment case; we're > dealing with file names > + > + // 5.2.4 (2) C > + else if (i.substr(0,4) == "/../") { > + i = i.substr(3); > + string::size_type sl = o.rfind("/"); > + if (sl != string::npos) > + o = o.substr(0, sl); > + else > + o = ""; > + } else if (i == "/..") > + i = ""; // no need to handle "/.." complete-path-segment case; we're > dealing with file names > + > + // 5.2.4 (2) D > + // no need to handle these cases; we're dealing with file names > + else if (i == ".") > + i = ""; > + else if (i == "..") > + i = ""; > + > + // POSIX special: map // to / > + else if (i.substr(0,2) == "//") > + i = i.substr(1); > + > + // 5.2.4 (2) E > + else { > + string::size_type next_slash = i.find("/", (i[0]=='/' ? 1 : 0)); // > skip first slash > + o += i.substr(0, next_slash); > + if (next_slash == string::npos) > + i = ""; > + else > + i = i.substr(next_slash); > + } > + } > + > + return o; > +} > + > + > + > // A map-like class that owns a cache of file descriptors (indexed by > // file / content names). > // > @@ -1393,12 +1469,17 @@ static struct MHD_Response* handle_buildid (const > string& buildid /* unsafe */, > } > else if (atype_code == "S") > { > + // PR25548 > + // Incoming source queries may come in with either dwarf-level OR > canonicalized paths. > + // We let the query pass with either one. > + > pp = new sqlite_ps (db, "mhd-query-s", > - "select mtime, sourcetype, source0, source1 from " > BUILDIDS "_query_s where buildid = ? and artifactsrc = ? " > + "select mtime, sourcetype, source0, source1 from " > BUILDIDS "_query_s where buildid = ? and artifactsrc in (?,?) " > "order by sharedprefix(source0,source0ref) desc, > mtime desc"); > pp->reset(); > pp->bind(1, buildid); > pp->bind(2, suffix); > + pp->bind(3, canon_pathname(suffix)); > } > unique_ptr<sqlite_ps> ps_closer(pp); // release pp if exception or return > > @@ -2083,6 +2164,27 @@ scan_source_file (const string& rps, const stat_t& st, > .bind(4, sfs.st_mtime) > .step_ok_done(); > > + // PR25548: also store canonicalized source path > + string dwarfsrc_canon = canon_pathname (dwarfsrc); > + if (dwarfsrc_canon != dwarfsrc) > + { > + if (verbose > 3) > + obatched(clog) << "canonicalized src=" << dwarfsrc << " > alias=" << dwarfsrc_canon << endl; > + > + ps_upsert_files > + .reset() > + .bind(1, dwarfsrc_canon) > + .step_ok_done(); > + > + ps_upsert_s > + .reset() > + .bind(1, buildid) > + .bind(2, dwarfsrc_canon) > + .bind(3, srps) > + .bind(4, sfs.st_mtime) > + .step_ok_done(); > + } > + > inc_metric("found_sourcerefs_total","source","files"); > } > } > @@ -2244,6 +2346,26 @@ archive_classify (const string& rps, string& > archive_extension, > .bind(2, s) > .step_ok_done(); > > + // PR25548: also store canonicalized source path > + const string& dwarfsrc = s; > + string dwarfsrc_canon = canon_pathname (dwarfsrc); > + if (dwarfsrc_canon != dwarfsrc) > + { > + if (verbose > 3) > + obatched(clog) << "canonicalized src=" << dwarfsrc > << " alias=" << dwarfsrc_canon << endl; > + > + ps_upsert_files > + .reset() > + .bind(1, dwarfsrc_canon) > + .step_ok_done(); > + > + ps_upsert_sref > + .reset() > + .bind(1, buildid) > + .bind(2, dwarfsrc_canon) > + .step_ok_done(); > + } > + > fts_sref ++; > } > } Looks good. > diff --git a/doc/ChangeLog b/doc/ChangeLog > index 59809ea8a1e2..564644f41907 100644 > --- a/doc/ChangeLog > +++ b/doc/ChangeLog > @@ -1,3 +1,8 @@ > +2020-03-24 Frank Ch. Eigler <f...@redhat.com> > + > + * debuginfod-find.1, debuginfod_find_debuginfo.3: Document > + source path canonicalization. > + > 2020-03-22 Frank Ch. Eigler <f...@redhat.com> > > * debuginfod_get_url.3: New function, documented ... > diff --git a/doc/debuginfod-find.1 b/doc/debuginfod-find.1 > index e71ca29be96e..10c609f2c18e 100644 > --- a/doc/debuginfod-find.1 > +++ b/doc/debuginfod-find.1 > @@ -78,10 +78,9 @@ is made up of multiple CUs. Therefore, to disambiguate, > debuginfod > expects source queries to prefix relative path names with the CU > compilation-directory, followed by a mandatory "/". > > -Note: the user should not elide \fB../\fP or \fB/./\fP or extraneous > -\fB///\fP sorts of path components in the directory names, because if > -this is how those names appear in the DWARF files, that is what > -debuginfod needs to see too. > +Note: the caller may or may not elide \fB../\fP or \fB/./\fP or extraneous > +\fB///\fP sorts of path components in the directory names. debuginfod > +accepts both forms. > > For example: > .TS > diff --git a/doc/debuginfod_find_debuginfo.3 b/doc/debuginfod_find_debuginfo.3 > index f7ec6cc134ba..48846119b3aa 100644 > --- a/doc/debuginfod_find_debuginfo.3 > +++ b/doc/debuginfod_find_debuginfo.3 > @@ -87,10 +87,9 @@ paths, and yet an executable is made up of multiple CUs. > Therefore, to > disambiguate, debuginfod expects source queries to prefix relative path > names with the CU compilation-directory, followed by a mandatory "/". > > -Note: the caller should not elide \fB../\fP or \fB/./\fP or extraneous > -\fB///\fP sorts of path components in the directory names, because if > -this is how those names appear in the DWARF files, that is what > -debuginfod needs to see too. > +Note: the caller may or may not elide \fB../\fP or \fB/./\fP or extraneous > +\fB///\fP sorts of path components in the directory names. debuginfod > +accepts both forms. > > If \fIpath\fP is not NULL and the query is successful, \fIpath\fP is set > to the path of the file in the cache. The caller must \fBfree\fP() this > value. Could you add something that describes the exact form of canonicalization done? Maybe just: "Canonicalization is done according to RFC3986 section 5.2.4 (Remove Dot Segments), plus reducing any '//' to '/' in the path." > diff --git a/tests/ChangeLog b/tests/ChangeLog > index cefbceb4f3a8..47b40f1c18a9 100644 > --- a/tests/ChangeLog > +++ b/tests/ChangeLog > @@ -1,3 +1,9 @@ > +2020-03-24 Frank Ch. Eigler <f...@redhat.com> > + > + * debuginfod-rpms/hello3.spec., /fedora31/*: New files with > + uncanonicalized source paths. > + * run-debuginfod-find.sh: Test them. > + > 2020-03-22 Frank Ch. Eigler <f...@redhat.com> > > * run-debuginfod-find.sh: Look for URL in default progressfn > diff --git a/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm > b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm > new file mode 100644 > index 000000000000..d0b3454083d8 > Binary files /dev/null and > b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm differ > diff --git a/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm > b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm > new file mode 100644 > index 000000000000..8b2fe9bbd9e7 > Binary files /dev/null and > b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm differ > diff --git a/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm > b/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm > new file mode 100644 > index 000000000000..ee479ecb4909 > Binary files /dev/null and > b/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm differ > diff --git > a/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm > b/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm > new file mode 100644 > index 000000000000..890478e429ab > Binary files /dev/null and > b/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm differ > diff --git a/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm > b/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm > new file mode 100644 > index 000000000000..73fd939d1597 > Binary files /dev/null and > b/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm differ > diff --git > a/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm > b/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm > new file mode 100644 > index 000000000000..0cc24073aa3e When using git format-patch instead of git show you'll get something that people can apply and that shows how big the files are. I assume they are small? The existing ones are between the 4K and 12K. > Binary files /dev/null and > b/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm differ > diff --git a/tests/debuginfod-rpms/hello3.spec. > b/tests/debuginfod-rpms/hello3.spec. > new file mode 100644 > index 000000000000..ffb95134e2b6 > --- /dev/null > +++ b/tests/debuginfod-rpms/hello3.spec. > @@ -0,0 +1,60 @@ > +Summary: hello3 -- double hello, world rpm > +Name: hello3 > +Version: 1.0 > +Release: 2 > +Group: Utilities > +License: GPL > +Distribution: RPM ^W Elfutils test suite. > +Vendor: Red Hat Software > +Packager: Red Hat Software <b...@redhat.com> > +URL: http://www.redhat.com > +BuildRequires: gcc make > +Source0: hello-1.0.tar.gz > + > +%description > +Simple rpm demonstration with an eye to consumption by debuginfod. > + > +%package two > +Summary: hello3two > +License: GPL > + > +%description two > +Dittoish. > + > +%prep > +%setup -q -n hello-1.0 > + > +%build > +mkdir foobar > +gcc -g -O1 foobar///./../hello.c -o hello > +gcc -g -O2 -D_FORTIFY_SOURCE=2 foobar///./../hello.c -o hello3 > + > +%install > +rm -rf $RPM_BUILD_ROOT > +mkdir -p $RPM_BUILD_ROOT/usr/local/bin > +cp hello $RPM_BUILD_ROOT/usr/local/bin/ > +cp hello3 $RPM_BUILD_ROOT/usr/local/bin/ > + > +%clean > +rm -rf $RPM_BUILD_ROOT > + > +%files > +%defattr(-,root,root) > +%attr(0751,root,root) /usr/local/bin/hello > + > +%files two > +%defattr(-,root,root) > +%attr(0751,root,root) /usr/local/bin/hello3 > + > +%changelog > +* Tue Mar 24 2020 Frank Ch. Eigler <f...@redhat.com> > +- New variant of hello2, with crazy source file paths > + > +* Thu Nov 14 2019 Frank Ch. Eigler <f...@redhat.com> > +- Dropped misc files not relevant to debuginfod testing. > + > +* Wed May 18 2016 Mark Wielaard <m...@redhat.com> > +- Add hello2 for dwz testing support. > + > +* Tue Oct 20 1998 Jeff Johnson <j...@redhat.com> > +- create. > diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh > index b64130282d86..e0a9beb9c11c 100755 > --- a/tests/run-debuginfod-find.sh > +++ b/tests/run-debuginfod-find.sh > @@ -23,8 +23,8 @@ type rpm2cpio 2>/dev/null || (echo "need rpm2cpio"; exit 77) > type bzcat 2>/dev/null || (echo "need bzcat"; exit 77) > > # for test case debugging, uncomment: > -# set -x > -# VERBOSE=-vvvv > +#set -x > +#VERBOSE=-vvvv whitespace change? > DB=${PWD}/.debuginfod_tmp.sqlite > tempfiles $DB > @@ -40,7 +40,7 @@ cleanup() > if [ $PID2 -ne 0 ]; then kill $PID2; wait $PID2; fi > if [ $PID3 -ne 0 ]; then kill $PID3; wait $PID3; fi > > - rm -rf F R D L Z ${PWD}/mocktree ${PWD}/.client_cache* ${PWD}/tmp* > + rm -rf F R D L Z ${PWD}/foobar ${PWD}/mocktree ${PWD}/.client_cache* > ${PWD}/tmp* > exit_cleanup > } > > @@ -112,7 +112,9 @@ export DEBUGINFOD_TIMEOUT=10 > # cannot find it without debuginfod. > echo "int main() { return 0; }" > ${PWD}/prog.c > tempfiles prog.c > -gcc -Wl,--build-id -g -o prog ${PWD}/prog.c > +# Create a subdirectory to confound source path names > +mkdir foobar > +gcc -Wl,--build-id -g -o prog ${PWD}/foobar///./../prog.c > testrun ${abs_top_builddir}/src/strip -g -f prog.debug ${PWD}/prog > BUILDID=`env LD_LIBRARY_PATH=$ldpath ${abs_builddir}/../src/readelf \ > -a prog | grep 'Build ID' | cut -d ' ' -f 7` > @@ -142,9 +144,15 @@ cmp $filename F/prog.debug > filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable > $BUILDID` > cmp $filename F/prog > > +# raw source filename > +filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find source > $BUILDID ${PWD}/foobar///./../prog.c` > +cmp $filename ${PWD}/prog.c > + > +# and also the canonicalized one > filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find source > $BUILDID ${PWD}/prog.c` > cmp $filename ${PWD}/prog.c > > + > ######################################################################## And another? > # Test whether the cache default locations are correct > @@ -291,6 +299,9 @@ archive_test() { > > # common source file sha1 > SHA=f4a1a8062be998ae93b8f1cd744a398c6de6dbb1 > +# fedora31 > +archive_test 420e9e3308971f4b817cc5bf83928b41a6909d88 > /usr/src/debug/hello3-1.0-2.x86_64/foobar////./../hello.c $SHA > +archive_test 87c08d12c78174f1082b7c888b3238219b0eb265 > /usr/src/debug/hello3-1.0-2.x86_64///foobar/./..//hello.c $SHA > # fedora30 > archive_test c36708a78618d597dee15d0dc989f093ca5f9120 > /usr/src/debug/hello2-1.0-2.x86_64/hello.c $SHA > archive_test 41a236eb667c362a1c4196018cc4581e09722b1b > /usr/src/debug/hello2-1.0-2.x86_64/hello.c $SHA Nice testcases. Thanks, Mark