Hi

I have rewritten "tr" to use mmap and the wchar.h functions. It seems
to be quite slow, but as far as I can tell it works reasonably well (at
least when using a UTF-8 locale). Comments, review, and testing are welcome
(I am relatively new to C, so beware)!

If you think adding this version of "tr" to sbase makes sense, I can
prepare a man page that points out all the shortcomings (e.g. no
character classes) of this implementation.


Cheers,

Silvan


--->8---

Add a basic version of tr that is Unicode-aware but does not yet support
character classes.

---
 Makefile |   1 +
 tr.c     | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 tr.c

diff --git a/Makefile b/Makefile
index 81dfaf6..ee84221 100644
--- a/Makefile
+++ b/Makefile
@@ -81,6 +81,7 @@ SRC = \
        tee.c      \
        test.c     \
        touch.c    \
+       tr.c       \
        true.c     \
        tty.c      \
        uname.c    \
diff --git a/tr.c b/tr.c
new file mode 100644
index 0000000..869dbfa
--- /dev/null
+++ b/tr.c
@@ -0,0 +1,142 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <locale.h>
+#include <wchar.h>
+#include "text.h"
+#include "util.h"
+
+static void /* Reports the command synopsis through eprintf(), naming the program via argv0. */
+usage(void)
+{
+       eprintf("usage: %s set1 [set2]\n", argv0);
+}
+
+/*
+ * Expand a backslash escape in place.
+ *
+ * s points at the character that FOLLOWS the backslash; that byte is
+ * overwritten with the control character it denotes. The recognised
+ * letters are the POSIX tr escapes \a \b \f \n \r \t \v and \\.
+ * Any other character, including the terminating NUL, is left unchanged,
+ * so "\x" simply stands for a literal "x". Octal escapes (\ooo) are not
+ * supported.
+ */
+void
+handle_escapes(char *s)
+{
+       switch(*s) {
+       case 'a':
+               *s = '\a';
+               break;
+       case 'b':
+               *s = '\b';
+               break;
+       case 'f':
+               *s = '\f';
+               break;
+       case 'n':
+               *s = '\n';
+               break;
+       case 'r':
+               *s = '\r';
+               break;
+       case 't':
+               *s = '\t';
+               break;
+       case 'v':
+               *s = '\v';
+               break;
+       case '\\':
+               *s = '\\';
+               break;
+       default:
+               /* unknown escape: keep the character as a literal */
+               break;
+       }
+}
+
+/*
+ * Build the translation table from the command-line sets.
+ *
+ * set1     - characters to translate or delete. Backslash escapes are
+ *            expanded in place through handle_escapes(), so the string
+ *            is modified. A trailing lone backslash is taken literally.
+ * set2     - replacement characters, or NULL to request deletion.
+ *            Escapes are expanded in place as for set1. When set2 has
+ *            fewer characters than set1, its last character is reused
+ *            for the remaining characters of set1.
+ * mappings - table indexed by wide-character value with room for
+ *            0x110000 entries; the caller provides it zero-filled. On
+ *            return mappings[c] is non-zero for every c in set1: the
+ *            replacement character, or 1 when set2 is NULL.
+ *
+ * An empty set2, an invalid multibyte sequence in either set, or a
+ * character that does not fit the table is reported with eprintf() and
+ * ends the program.
+ */
+void
+parse_mapping(char *set1, char *set2, wchar_t *mappings)
+{
+       const bool deleting = (set2 == NULL);
+       wchar_t runeleft;
+       wchar_t runeright = 0;
+       int leftbytes;
+       int rightbytes;
+
+       /* an empty set2 offers no replacement to pad with */
+       if(!deleting && *set2 == '\0') {
+               eprintf("%s: set2 must not be empty\n", argv0);
+               exit(EXIT_FAILURE); /* in case eprintf() returns */
+       }
+
+       /* start decoding from the initial shift state */
+       mbtowc(NULL, NULL, 0);
+
+       while(*set1) {
+               if(*set1 == '\\' && set1[1] != '\0')
+                       handle_escapes(++set1);
+
+               /* *set1 is not NUL here, so a valid result is >= 1 */
+               leftbytes = mbtowc(&runeleft, set1, MB_CUR_MAX);
+               if(leftbytes < 0) {
+                       eprintf("%s: invalid multibyte sequence in set1\n", argv0);
+                       exit(EXIT_FAILURE);
+               }
+
+               /* once set2 is exhausted runeright keeps its last value */
+               if(!deleting && *set2) {
+                       if(*set2 == '\\' && set2[1] != '\0')
+                               handle_escapes(++set2);
+                       rightbytes = mbtowc(&runeright, set2, MB_CUR_MAX);
+                       if(rightbytes < 0) {
+                               eprintf("%s: invalid multibyte sequence in set2\n", argv0);
+                               exit(EXIT_FAILURE);
+                       }
+                       set2 += rightbytes;
+               }
+
+               /* the cast also rejects negative values of a signed wchar_t */
+               if((unsigned long)runeleft >= 0x110000UL) {
+                       eprintf("%s: character in set1 out of range\n", argv0);
+                       exit(EXIT_FAILURE);
+               }
+               mappings[runeleft] = deleting ? (wchar_t)1 : runeright;
+
+               set1 += leftbytes;
+       }
+}
+
+/*
+ * Write the NUL-terminated multibyte string "in" to standard output,
+ * omitting every character whose entry in "mappings" is non-zero.
+ *
+ * mappings - table indexed by wide-character value, 0x110000 entries.
+ * in       - text to filter; processing stops at the first NUL byte.
+ *
+ * Characters whose value lies outside the table cannot have been
+ * selected and are written unchanged. An invalid multibyte sequence is
+ * reported with eprintf() and ends the program, because the decoder
+ * cannot tell how many bytes to skip.
+ */
+void
+map_to_null(const wchar_t *mappings, char *in)
+{
+       const char *s;
+       wchar_t runeleft;
+       int leftbytes;
+
+       /* *s is not NUL inside the loop, so a valid result is >= 1 */
+       for(s = in; *s; s += leftbytes) {
+               leftbytes = mbtowc(&runeleft, s, MB_CUR_MAX);
+               if(leftbytes < 0) {
+                       eprintf("%s: invalid multibyte sequence in input\n", argv0);
+                       exit(EXIT_FAILURE); /* in case eprintf() returns */
+               }
+               /* the cast also rejects negative values of a signed wchar_t */
+               if((unsigned long)runeleft >= 0x110000UL || !mappings[runeleft])
+                       putwchar(runeleft);
+       }
+}
+
+/*
+ * Write the NUL-terminated multibyte string "in" to standard output,
+ * replacing every character that has a non-zero entry in "mappings"
+ * with that entry.
+ *
+ * mappings - table indexed by wide-character value, 0x110000 entries;
+ *            a zero entry means "no replacement".
+ * in       - text to translate; processing stops at the first NUL byte.
+ *
+ * Characters whose value lies outside the table are written unchanged.
+ * An invalid multibyte sequence is reported with eprintf() and ends the
+ * program, because the decoder cannot tell how many bytes to skip.
+ */
+void
+map_to_set(const wchar_t *mappings, char *in)
+{
+       const char *s;
+       wchar_t runeleft;
+       wchar_t out;
+       int leftbytes;
+
+       /* *s is not NUL inside the loop, so a valid result is >= 1 */
+       for(s = in; *s; s += leftbytes) {
+               leftbytes = mbtowc(&runeleft, s, MB_CUR_MAX);
+               if(leftbytes < 0) {
+                       eprintf("%s: invalid multibyte sequence in input\n", argv0);
+                       exit(EXIT_FAILURE); /* in case eprintf() returns */
+               }
+               out = runeleft;
+               /* the cast also rejects negative values of a signed wchar_t */
+               if((unsigned long)runeleft < 0x110000UL && mappings[runeleft])
+                       out = mappings[runeleft];
+               putwchar(out);
+       }
+}
+
+/*
+ * tr set1 [set2]
+ *
+ * With two operands, characters of set1 read from standard input are
+ * replaced by the corresponding characters of set2; with one operand,
+ * characters of set1 are deleted. Input is consumed through afgets()
+ * and written to standard output as wide characters.
+ *
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE when the table cannot be
+ * allocated or reading standard input fails.
+ */
+int
+main(int argc, char *argv[])
+{
+       wchar_t *mappings;
+       char *buf = NULL;
+       size_t size = 0;
+       void (*mapfunc) (const wchar_t*, char*);
+
+       setlocale(LC_ALL, "");
+
+       ARGBEGIN {
+       default:
+               usage();
+       } ARGEND;
+
+       /* exactly one or two operands are accepted */
+       if(argc < 1 || argc > 2)
+               usage();
+
+       /*
+        * One slot per possible code point. Anonymous mappings start out
+        * zero-filled, which the map functions read as "no mapping".
+        */
+       mappings = mmap(NULL, 0x110000 * sizeof(*mappings),
+                       PROT_READ|PROT_WRITE,
+                       MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       if(mappings == MAP_FAILED) {
+               eprintf("mmap:");
+               return EXIT_FAILURE;
+       }
+
+       if(argc == 2) {
+               parse_mapping(argv[0], argv[1], mappings);
+               mapfunc = map_to_set;
+       } else {
+               parse_mapping(argv[0], NULL, mappings);
+               mapfunc = map_to_null;
+       }
+
+       while(afgets(&buf, &size, stdin))
+               mapfunc(mappings, buf);
+       free(buf);
+
+       if (ferror(stdin)) {
+               eprintf("<stdin>: read error:");
+               return EXIT_FAILURE;
+       }
+
+       return EXIT_SUCCESS;
+}
-- 
1.8.5.2


Reply via email to