On 2023-02-15 06:05, George Valkov wrote:
gcc d.c && ./a.out
src 3 dst 4
c 14053376 p 20344832 h 20344832 d 6291456
total bytes copied 14053376 / 27551296
Thanks, this is due to a known incompatibility in macOS lseek that
coreutils is supposed to work around. See
<https://www.gnu.org/software/gnulib/manual/html_node/lseek.html>, which
says, "On some platforms, lseek (fd, offset, SEEK_DATA) returns a value
greater than offset even when offset addresses data: macOS 12".
I guess that somehow, the way you're building coreutils defeats the
workaround. If so, we'll need to change how coreutils is built in your
environment, or fix coreutils 'configure' so that the workaround isn't
defeated in your environment. In <https://bugs.gnu.org/61386#128>
Pádraig was dubious about this guess, reasoning that the bug is likely
specific to APFS rather than an API mismatch. That reasoning could be
wrong, though: I think HFS either doesn't support SEEK_DATA at all or
reports trivial answers, so coreutils is not likely to run into the
problem on HFS even if the bug is an API issue.
Here are some things we can do to test this guess.
1. Please try the attached program e.c in place of your d.c program. e.c
is like d.c, except it attempts to use the coreutils workaround. What
symptoms do you observe? If e.c works then it's almost surely a problem
in how coreutils is built (compiler options or whatnot), not in the
coreutils workaround. If e.c does not work it's likely that the Gnulib
workaround does not suffice on your macOS platform, in which case we
need to improve the workaround by hacking further on e.c and porting the
result back to Gnulib. (There are other possibilities.)
2. Please verify that coreutils cp is using the Gnulib workaround. In
the src directory, the shell command "nm -o *.o | grep lseek" should
output only lines containing "rpl_lseek"; there shouldn't be any lines
saying just "lseek". Also, please run the command "objdump -d
lib/libcoreutils_a-lseek.o" and verify that the replacement lseek is
actually doing something nontrivial (you should get maybe three dozen
lines of assembly language; if it's much less then this is the problem).
3. Please confirm that _DARWIN_C_SOURCE is defined to 1 in lib/config.h.
4. What is the output of the following commands, in the coreutils build
directory?
rm lib/libcoreutils_a-lseek.o
make V=1 lib/libcoreutils_a-lseek.o
gcc -E -Ilib lib/lseek.c
#define _DARWIN_C_SOURCE 1
#define _GNU_SOURCE 1
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/clonefile.h>
/* Replacement for lseek that papers over the macOS SEEK_DATA quirk.

   FD, OFFSET and WHENCE have the same meaning as for lseek, and the
   return value is whatever the underlying lseek call returns, except
   as described below.  On platforms other than macOS (or when
   SEEK_DATA is not defined) this is a plain pass-through.  */
off_t
rpl_lseek (int fd, off_t offset, int whence)
{
#if defined __APPLE__ && defined __MACH__ && defined SEEK_DATA
  if (whence == SEEK_DATA)
    {
      /* When OFFSET already addresses data, macOS lseek+SEEK_DATA
         answers with the start S of the first data region beginning
         *after* OFFSET; the span from OFFSET to S is possibly-empty
         data followed by a possibly-empty hole.  So first ask where
         the next hole is: if it is not at OFFSET itself, OFFSET lies
         inside data and a plain SEEK_SET to OFFSET is the right
         answer.  Contrary to the macOS documentation, lseek+SEEK_HOLE
         may also fail with ENXIO when there is no hole on or after
         OFFSET.  */
      off_t hole_pos = lseek (fd, offset, SEEK_HOLE);

      if (hole_pos < 0)
        {
          /* NOTE(review): on ENXIO this hands back OFFSET without
             issuing any further seek, so the file position is left
             wherever the failed call left it -- confirm that callers
             do not rely on the position here.  */
          if (errno == ENXIO)
            return offset;
          return hole_pos;
        }

      /* OFFSET is inside data: just position there.  */
      if (hole_pos != offset)
        return lseek (fd, offset, SEEK_SET);

      /* OFFSET is the start of a hole: the native SEEK_DATA is fine.  */
      return lseek (fd, offset, SEEK_DATA);
    }
#endif
  return lseek (fd, offset, whence);
}
/* From here on, every call spelled "lseek" is routed through rpl_lseek.
   The macro is defined after rpl_lseek itself so that the calls inside
   that function still reach the system lseek. */
#define lseek rpl_lseek
/* Size in bytes of the copy buffer (100 MiB); main refuses to copy a
   single data region larger than this. */
#define MAX_SIZE (1024 * 1024 * 100)
/* Copy buffer used by main for read()/write(). */
char msg[MAX_SIZE];
/* Sparse-copy test driver.

   Copies the data regions of the file "cc1" into "cc1-sparse", locating
   each region with lseek SEEK_DATA / SEEK_HOLE, and prints the offsets
   seen at every step so that the lseek behaviour can be inspected.  When
   SEEK_DATA finally fails with ENXIO the destination is extended to the
   source size with ftruncate.

   argc and argv are unused.

   Returns 0 only when the copy ran to the ENXIO end condition, the
   ftruncate succeeded and the destination closed cleanly; returns 1 on
   every other path (open failure, unexpected lseek/read/write result,
   iteration limit reached, ...). */
int main(int argc, char ** argv)
{
    (void)argc;
    (void)argv;

    // sparse copy test
    int src = open("cc1", O_RDONLY);
    if (src == -1)
    {
        int eo = errno;
        printf("open(cc1) failed %2i %s\n", eo, strerror(eo));
        return 1;
    }
    /* O_TRUNC: start from an empty destination, otherwise bytes left over
       from a previous run would survive in the regions that are holes in
       the source and are therefore never written. */
    int dst = open("cc1-sparse", O_CREAT | O_RDWR | O_TRUNC, 0700);
    if (dst == -1)
    {
        int eo = errno;
        printf("open(cc1-sparse) failed %2i %s\n", eo, strerror(eo));
        close(src);
        return 1;
    }
    printf("src %i dst %i\n", src, dst);

    long long a = 0;   /* result of positioning src at d */
    long long b = 0;   /* result of positioning dst at d */
    long long d = 0;   /* start of the current data region */
    long long e = 0;   /* ftruncate result */
    long long h = 0;   /* start of the hole following d */
    long long p = 0;   /* src position after the copy, or src size at the end */
    long long s = -1;  /* previous value of d, to detect lack of progress */
    long long t = 0;   /* total bytes copied */
    ssize_t c = 0;     /* size of the current data region (h - d) */
    ssize_t i = 0;     /* iteration counter */
    ssize_t r = 0;     /* read result */
    ssize_t w = 0;     /* write result */
    /* errno captured right after the call that produced a, b, d, e, h, p */
    int ea = 0;
    int eb = 0;
    int ed = 0;
    int ee = 0;
    int eh = 0;
    int ep = 0;
    int ok = 0;        /* set only when the copy completed normally */

    do
    {
        /* Safety limit: stop at the 10th iteration, so at most 9 data
           regions are processed. */
        if (++i >= 10)
        {
            printf("LOOP %zi\n", i);
            break;
        }

        /* errno is cleared before every call so that each saved value
           belongs to that call and not to an earlier failure. */
        errno = 0;
        d = lseek(src, d, SEEK_DATA);
        ed = errno;
        errno = 0;
        h = lseek(src, d, SEEK_HOLE);
        eh = errno;
        errno = 0;
        a = lseek(src, d, SEEK_SET);
        ea = errno;
        errno = 0;
        b = lseek(dst, d, SEEK_SET);
        eb = errno;
        c = h - d;

        if ((a == -1) || (b == -1) || (d == -1) || (h == -1))
        {
            int handled = 0;

            /* SEEK_DATA failing with ENXIO is the expected end of the
               copy: size the destination like the source. */
            if ((d == -1) && (ed == ENXIO))
            {
                errno = 0;
                p = lseek(src, 0, SEEK_END);
                ep = errno;
                if (p == -1)
                {
                    printf(
                        "lseek(SEEK_END) failed p %lli %2i %s\n",
                        p, ep, strerror(ep)
                    );
                }
                else
                {
                    errno = 0;
                    e = ftruncate(dst, p);
                    ee = errno;
                    if (e == -1)
                    {
                        printf(
                            "ftruncate(%lli) failed %lli %2i %s\n",
                            p, e, ee, strerror(ee)
                        );
                    }
                    else
                    {
                        handled = 1;
                        ok = 1;
                    }
                }
            }

            if (!handled)
            {
                printf(
                    "lseek failed"
                    " ENXIO %i EBADF %i EINVAL %i EOVERFLOW %i ESPIPE %i\n"
                    " p %lli\n"
                    " d %lli %2i %s\n"
                    " h %lli %2i %s\n"
                    " a %lli %2i %s\n"
                    " b %lli %2i %s\n",
                    ENXIO, EBADF, EINVAL, EOVERFLOW, ESPIPE,
                    p,
                    d, ed, strerror(ed),
                    h, eh, strerror(eh),
                    a, ea, strerror(ea),
                    b, eb, strerror(eb)
                );
            }
            break;
        }

        if (c < 0)
        {
            /* d is unchanged at this point, so another pass would issue
               exactly the same seeks; stop instead of repeating. */
            printf("negative size c %zi = h %lli - d %lli\n", c, h, d);
            break;
        }

        /* Same data offset as in the previous pass: no progress. */
        if (s == d)
        {
            printf("EOF %lli %lli\n", s, d);
            break;
        }
        s = d;

        if ((size_t)c > sizeof(msg))
        {
            printf("msg too small %zu %zi\n", sizeof(msg), c);
            break;
        }

        /* Write only what was actually read, so that a short or failed
           read never pushes stale buffer contents into the destination. */
        r = read(src, msg, (size_t)c);
        w = (r > 0) ? write(dst, msg, (size_t)r) : 0;
        if ((r != c) || (w != c))
        {
            printf("expected %zi got %zi %zi\n", c, r, w);
            break;
        }
        t += c;

        errno = 0;
        p = lseek(src, 0, SEEK_CUR);
        ep = errno;
        if (p == -1)
        {
            printf(
                "lseek(SEEK_CUR) failed p %lli %2i %s\n",
                p, ep, strerror(ep)
            );
            break;
        }
        printf("c %zi p %lli h %lli d %lli\n", c, p, h, d);

        /* Continue the search from where the read left off. */
        d = p;
    } while (1);

    close(src);
    /* Check the result of closing the written file as well. */
    if (close(dst) == -1)
    {
        int ec = errno;
        printf("close(dst) failed %2i %s\n", ec, strerror(ec));
        ok = 0;
    }

    printf("total bytes copied %lli / %lli\n", t, p);
    return ok ? 0 : 1;
}