Bug#667960: [PATCH] combine: let "xor" read a file only once. This is required to correctly work when reading from pipes. Closes: #667960

Carsten Hey Mon, 09 Apr 2012 17:03:17 -0700

I noticed that moreutils includes isutf8.  The same, but with
a different interface, can be archived with "iconv -f UTF8 >/dev/null
filename".

Currently "combine file1 and file2 _" is valid syntax, as is "_ file1
and file2".  If you ever plan to add options, I think this must change
in some way, at least if an option is passed.  Otherwise combine can't
know if "_ -z or xor _" should process the files "-z" and "xor", or the
files "or" and "_".

Useful options are in my opinion -u (unique - if one pipes combine's
output through sort -u anyway one could also use comm instead), -z (zero
terminated entries) and -n (to print the line number instead of the
lines itself - this would solve all those "please reimplement join"
feature requests, but an exhaustive explanation would be too long for
now).  An argument against adding -u is that the perl one-liner perl -ne
'print unless $seen{$_}++' archives the same.

* Joey Hess [2012-04-09 16:14 -0400]:
> Carsten Hey wrote:
> > I'll send an update after being able to measure the memory usage of
> > a simple implementation in C.
>
> I'd rather avoid dealing with custom hash tables in C.

All testing that has been done for the attached C implementation is
comparing the output of all for operations on the two test files
I created with the output of the combine shipped in moreutils, so it is
likely to contain bugs.  For now I agree that the perl variant from my
former mail is the best option mentioned in this bug.

It reads the files into an array of C-strings.  The "hash table" is
built by qsort()ing it and the lookup is done by using bsearch().  If
both, the lines in the original order and the sorted lines are needed,
the array with pointers to char needs to be copied and sorted, which is
quite cheap.

Without a more complex data structure, xor needs to read both files into
memory.  All that is needed for such more complex data structures (which
are also needed for implementing some possible options and for
supporting embedded '\0' characters) is to additionally store some
integers for each entry, for example to store the entries length or to
mark an entry as seen.

With the same arguments I used to measure the memory usage of the
different perl variants it uses 22832 kbytes of memory, despite reading
both files into memory.  The perl implementations needed at least 36576
kbytes.

Regards
Carsten

/*
   Copyright (C) 2006 Joey Hess <j...@kitenet.net>
   Copyright (C) 2012 Carsten Hey <cars...@debian.org>

   This software is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>,
   or, on Debian systems, the file '/usr/share/common-licenses/GPL-2'.
*/

#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 700
#endif
#ifndef _BSD_SOURCE
#define _BSD_SOURCE 1
#endif

#include <err.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>

static void *
xmalloc_error(void)
{
	(void)fflush(stdout);
	errx(EXIT_FAILURE, "Memory allocation failed");
}

static void *
xmalloc(size_t sz)
{
	void *rv = malloc(sz);
	if (!rv)
		xmalloc_error();
	return rv;
}

static void *
xrealloc(void *ptr, size_t sz)
{
	void *rv = realloc(ptr, sz);
	if (!rv)
		xmalloc_error();
	return rv;
}

static void *
xmemdup(void *s, size_t n)
{
	/* Add a trailing '\0' as long as support for embedded '\0'
	 * characters is not complete. */
	void *rv = malloc(n + 1);
	if (!rv)
		xmalloc_error();
	((char *)rv)[n] = '\0';
	return memcpy(rv, s, n);
}

static FILE *
xfopen(const char *filename, const char *mode)
{
	FILE *rv;
	errno = 0;
	rv = fopen(filename, mode);
	if (!rv) {
		int bak = errno;
		(void)fflush(stdout);
		errno = bak;
		err(EXIT_FAILURE, filename);
	}
	return rv;
}

/* Write from stream 'in' to stream 'out' until EOF is reached and add
 * a trailing delim character if appropriate.   Return 0 on success.
 */
static int
file_paste(FILE *in, FILE *out, const char delim)
{
	int c,
	    last = (int)~delim,
	    rv = -1;

	(void)fwide(in,  -1);
	(void)fwide(out, -1);

	flockfile(in);
	flockfile(out);

	while ((c = getc_unlocked(in)) != EOF)
		if ((last = fputc_unlocked(c, out)) == EOF)
			goto DONE;

	if (last != (int)delim)
		if (fputc_unlocked(delim, out) == EOF)
			goto DONE;

	if (ferror_unlocked(in))
		goto DONE;

	rv = 0;

DONE:
	funlockfile(out);
	funlockfile(in);

	return rv;
}

static char **
file_to_array(const char *filename, size_t *n, const char delim)
{
#define ARRAY_APPEND(type, ptr, current_size, malloced_size, val)             \
    do {                                                                      \
        if ((current_size) + 1 >= malloced_size) {                            \
            malloced_size += malloced_size / 2 + 1;                           \
            ptr = (type *)xrealloc(ptr, sizeof(type) * (size_t)malloced_size);\
        }                                                                     \
        (ptr)[(current_size)++] = (val);                                      \
    } while(0)

	FILE *fp;
	char **lines;
	size_t alloc = 128;         /* allocated memory in 'lines' */
	char *line = NULL;          /* buffer used by getdelim() */
	size_t buflen = 0;          /* legth of buffer used by getdelim() */
	ssize_t linelen = 0;        /* number of chars read by getdelim() */

	lines = (char **)xmalloc(sizeof(char *) * alloc);

	fp = xfopen(filename, "rb");

	while ((linelen = getdelim(&line, &buflen, delim, fp)) != -1) {
		if (line[linelen - 1] == delim)
			linelen--;
		ARRAY_APPEND(char *, lines, *n, alloc, (char *)xmemdup(line, linelen));
	}

	if (ferror(fp)
			|| !feof(fp)
			|| fclose(fp)) {
		(void)fflush(stdout);
		errx(EXIT_FAILURE, "%s: unknown error", filename);
	}

	return lines;
}

static int
str_compare(const void *a, const void *b)
{
	return strcmp(*(const char * const *)a, *(const char * const *)b);
}

enum { or, xor, not, and };

static void
combine_or(const char *file1, const char *file2, const char delim)
{
	FILE *fp;

	fp = xfopen(file1, "rb");
	if (file_paste(fp, stdout, delim)
			|| fclose(fp)) {
		(void)fflush(stdout);
		errx(EXIT_FAILURE, "%s: unknown error", file1);
	}

	fp = xfopen(file2, "rb");
	if (file_paste(fp, stdout, delim)
			|| fclose(fp)) {
		(void)fflush(stdout);
		errx(EXIT_FAILURE, "%s: unknown error", file2);
	}
}

static void
combine_and_not(int op, const char *file1, const char *file2, const char delim)
{
	FILE *fp;                   /* file pointer for 'file1' */
	char *line = NULL;          /* buffer used by getdelim() */
	char **lines;               /* lines read from 'file2' */
	size_t n = 0;               /* number of lines read from 'file2' */
	size_t buflen = 0;          /* legth of buffer used by getdelim() */
	ssize_t linelen = 0;        /* number of chars read by getdelim() */

	lines = file_to_array(file2, &n, delim);

	qsort(lines, n, sizeof(lines[0]), str_compare);

	fp = xfopen(file1, "rb");

	if (op == not)
		/* Read file1 and print the lines that are not in file2. */
		while ((linelen = getdelim(&line, &buflen, delim, fp)) != -1) {
			if (line[linelen - 1] == delim)
				line[--linelen] = '\0';
			if (!bsearch(&line, lines, n, sizeof(lines[0]), str_compare))
				printf("%s%c", line, delim);
		}
	else /* op == and */
		/* Read file1 and print the lines that are also in file2. */
		while ((linelen = getdelim(&line, &buflen, delim, fp)) != -1) {
			if (line[linelen - 1] == delim)
				line[--linelen] = '\0';
			if (bsearch(&line, lines, n, sizeof(lines[0]), str_compare))
				printf("%s%c", line, delim);
		}

	if (ferror(fp)
			|| !feof(fp)
			|| fclose(fp)) {
		(void)fflush(stdout);
		errx(EXIT_FAILURE, "%s: unknown error", file1);
	}

	/* Neither line nor lines is freed, exit() will do this for us. */
}

static void
combine_xor(const char *file1, const char *file2, const char delim)
{
	char **lines1,
	     **lines1_sorted,
	     **lines2,
	     **lines2_sorted;
	size_t i,
	       n1 = 0,
	       n2 = 0;

	lines1 = file_to_array(file1, &n1, delim);

	lines1_sorted = (char **)xmalloc(n1 * sizeof(lines1[0]));
	memcpy(lines1_sorted, lines1, n1 * sizeof(lines1[0]));
	qsort(lines1_sorted, n1, sizeof(lines1[0]), str_compare);

	lines2 = file_to_array(file2, &n2, delim);

	lines2_sorted = (char **)xmalloc(n2 * sizeof(lines2[0]));
	memcpy(lines2_sorted, lines2, n2 * sizeof(lines2[0]));
	qsort(lines2_sorted, n2, sizeof(lines2[0]), str_compare);

	for (i = 0; i < n1; i++)
		if (!bsearch(&lines1[i], lines2_sorted, n2,
				sizeof(lines2_sorted[0]), str_compare))
			printf("%s%c", lines1[i], delim);

	for (i = 0; i < n2; i++)
		if (!bsearch(&lines2[i], lines1_sorted, n1,
				sizeof(lines1_sorted[0]), str_compare))
			printf("%s%c", lines2[i], delim);
}

static void
combine(int op, const char *file1, const char *file2, const char delim)
{
	switch (op) {
	case or:
		combine_or(file1, file2, delim);
		break;
	case not: /*FALLTHROUGH*/
	case and:
		combine_and_not(op, file1, file2, delim);
		break;
	case xor:
		combine_xor(file1, file2, delim);
		break;
	}
}

static void
exit_usage(void)
{
	fprintf(stderr, "Usage: combine file1 OP file2\n");

	exit(EXIT_FAILURE);
}

int
main(int argc, char *argv[])
{
	int op = 0;

	if (argc == 5) {
		/* The perl implementation does not check the program name. */
		const char *progname;
		progname = strrchr(*argv,'/') ? strrchr(*argv,'/') + 1 : *argv;
		if (strcmp(progname, "_") == 0)
			if (strcmp(argv[4], "_") == 0)
				argv[--argc] = NULL;
	}

	if (argc != 4)
		exit_usage();

	if (strcmp(argv[2], "or") == 0)
		op = or;
	else if (strcmp(argv[2], "not") == 0)
		op = not;
	else if (strcmp(argv[2], "and") == 0)
		op = and;
	else if (strcmp(argv[2], "xor") == 0)
		op = xor;
	else
		exit_usage();

	combine(op, argv[1], argv[3], '\n');

	exit(EXIT_SUCCESS);
}

Bug#667960: [PATCH] combine: let "xor" read a file only once. This is required to correctly work when reading from pipes. Closes: #667960

Reply via email to