Hi Valentin, thank You very, very much for Your answer.
Your idea is great !!! I am very happy with this solution :) Of course I have to recall that pleasure with C/C++ coding but for now this is the best and fastest way for me. Have a nice day, Best regards :) Artur On Thu, 28 Jun 2007 11:18:00 +0200, Valentin Kozamernik <[EMAIL PROTECTED]> wrote: > On Tue, 26 Jun 2007 at 12:35, Artur Litwinowicz wrote: >> O I think it is not good idea to change the code of OpenBSD by me. >> Years ago I was coding in C++ (at the University but with best mark ;). > Now >> I am working for Oracle Corp. (PL/SQL and etc.) and I am a little out of >> practice ;) with C/C++. >> >> Maybe someone core OpenBSD Developer will agree with me, that the > extended >> COLLATION in OpenBSD will be the strong point in the system > functionality ? >> > > I had the same problem with PostgreSQL on OpenBSD a few years ago and > I've written some C-language extensions (database functions). Be > advised that this is a totally non-standard solution. Here you go: > > The idea was to make simple functions for sorting and upper/lower > conversion, that would be faster than typical unicode table lookups. > Since I only ever needed a single language at a time, I didn't need > all the fancy unicode stuff. > > In "lang.h", there are lower and upper alphabet strings for each > language (currently english and slovenian). You can add your own, of > course. > > If you run "make" (you may need to edit the Makefile first), you'll > get the "hash_en.so", "hash_sl.so", "upper_en.so", "upper_sl.so", > "lower_en.so" and "lower_sl.so" shared libs containing postgres > functions with the same names. > > To load them into database, use > CREATE FUNCTION func_name(TEXT) RETURNS TEXT AS \ > 'path/to/func_file.so', 'func_name' LANGUAGE C IMMUTABLE STRICT > for each of them. > > The upper_XX and lower_XX functions return the upper/lowercase version > of the input string. The hash_XX function replaces the input string > with new string where each letter is replaced with its position in the > alphabet. > > For example, instead of > SELECT ... ORDER BY my_column > you can use > SELECT ... ORDER BY hash_sl(my_column) > and you've got slovenian sort order. > > For performance, create an index on hash_sl(my_column), not my_column. > > This will only work on "unicode" databases. > > Of course, there may be bugs. They are quite likely, actually. I > remember I wrote all this in a hurry. But it has worked OK for at > least three projects now. > > All the files except for "lang.h" follow below. For "lang.h", go to > "http://www.komna.com/tin/lang.h" (it's UTF-8 encoded, so I can't put > it here inline). > > > > # Makefile > # > # Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> > # > # Permission to use, copy, modify, and distribute this software for any > # purpose with or without fee is hereby granted, provided that the above > # copyright notice and this permission notice appear in all copies. > # > # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > # > > INCLUDE_DIR=/usr/local/include/postgresql/server > > build: hash_en.so hash_sl.so lower_en.so lower_sl.so upper_en.so > upper_sl.so > > hash_en.so: hash.c > cc -Wall -Werror -fpic -c -o hash_en.o hash.c -I$(INCLUDE_DIR) -DEN > ld -Bshareable -o hash_en.so hash_en.o > > hash_sl.so: hash.c > cc -Wall -Werror -fpic -c -o hash_sl.o hash.c -I$(INCLUDE_DIR) -DSL > ld -Bshareable -o hash_sl.so hash_sl.o > > lower_en.so: lower.c > cc -Wall -Werror -fpic -c -o lower_en.o lower.c -I$(INCLUDE_DIR) -DEN > ld -Bshareable -o lower_en.so lower_en.o > > lower_sl.so: lower.c > cc -Wall -Werror -fpic -c -o lower_sl.o lower.c -I$(INCLUDE_DIR) -DSL > ld -Bshareable -o lower_sl.so lower_sl.o > > upper_en.so: upper.c > cc -Wall -Werror -fpic -c -o upper_en.o upper.c -I$(INCLUDE_DIR) -DEN > ld -Bshareable -o upper_en.so upper_en.o > > upper_sl.so: upper.c > cc -Wall -Werror -fpic -c -o upper_sl.o upper.c -I$(INCLUDE_DIR) -DSL > ld -Bshareable -o upper_sl.so upper_sl.o > > clean: > -rm *.o *.so > > > > /* > * $Id: utf8.h,v 1.1.1.1 2004/12/14 14:53:28 tin Exp $ > * > * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> > * > * Permission to use, copy, modify, and distribute this software for any > * purpose with or without fee is hereby granted, provided that the above > * copyright notice and this permission notice appear in all copies. > * > * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL > WARRANTIES > * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > */ > > #include <sys/types.h> > > typedef u_int32_t widechar_t; > > #define STR_WCHAR_UNKNOWN ((widechar_t)'?') > > > /* read a wide char from UTF-8 string *ptr and point *ptr to the next one > */ > widechar_t > utf8get(char **ptr) > { > unsigned char uch; > widechar_t res; > > uch = (unsigned char)(**ptr); > (*ptr)++; > > if (!(uch & 0x80)) > return((widechar_t)uch); > > if ((uch & 0xFE) == 0xC0) > return(STR_WCHAR_UNKNOWN); > > if (((uch & 0xFF) == 0xE0) && ((**ptr & 0xE0) == 0x80)) > return(STR_WCHAR_UNKNOWN); > > if (((uch & 0xFF) == 0xF0) && ((**ptr & 0xF0) == 0x80)) > return(STR_WCHAR_UNKNOWN); > > if (((uch & 0xFF) == 0xF8) && ((**ptr & 0xF8) == 0x80)) > return(STR_WCHAR_UNKNOWN); > > if (((uch & 0xFF) == 0xFC) && ((**ptr & 0xFC) == 0x80)) > return(STR_WCHAR_UNKNOWN); > > if ((**ptr & 0xC0) != 0x80) > return(STR_WCHAR_UNKNOWN); > > res = (widechar_t)((unsigned char)*((*ptr)++)) & 0x0000003F; > if(uch & 0x20) { > if((**ptr & 0xC0) != 0x80) { > (*ptr)--; > return(STR_WCHAR_UNKNOWN); > } > res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & > 0x0000003F); > } else if((uch & 0xE0) != 0xC0) { > (*ptr)-=2; > return(STR_WCHAR_UNKNOWN); > } else > return(res | (((widechar_t)uch & 0x0000001F) << 6)); > > if(uch & 0x10) { > res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & > 0x0000003F); > } else if((uch & 0xF0) != 0xE0) { > (*ptr)-=2; > return(STR_WCHAR_UNKNOWN); > } else > return(res | (((widechar_t)uch & 0x0000000F) << 12)); > > if(uch & 0x08) { > if((**ptr & 0xC0) != 0x80) { > (*ptr)-=3; > return(STR_WCHAR_UNKNOWN); > } > res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & > 0x0000003F); > } else if((uch & 0xF8) != 0xF0) { > (*ptr)-=3; > return(STR_WCHAR_UNKNOWN); > } else > return(res | (((widechar_t)uch & 0x00000007) << 18)); > > if(uch & 0x04) > { > if((**ptr & 0xC0) != 0x80) { > (*ptr)-=4; > return(STR_WCHAR_UNKNOWN); > } > res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & > 0x0000003F); > } else if((uch & 0xFC) != 0xF8) { > (*ptr)-=4; > return(STR_WCHAR_UNKNOWN); > } else > return(res | (((widechar_t)uch & 0x00000003) << 24)); > > if ((uch & 0xFE) != 0xFC) { > (*ptr)-=5; > return(STR_WCHAR_UNKNOWN); > } > > return (res | (((widechar_t)uch & 0x00000001) << 30)); > } > > > /* write character ch to *ptr and point *ptr at the end */ > void > utf8put(char **ptr, widechar_t ch) > { > widechar_t bech = ch; > > if (bech < 0x00000080) { > *((*ptr)++) = (char)bech; > return; > } > > if (bech < 0x00000800) { > *((*ptr)++) = > (char)(((bech >> 6) & 0x0000001F) | (0x000000C0)); > *((*ptr)++) = > (char)(((bech) & 0x0000003F) | (0x00000080)); > return; > } > > if (bech < 0x00010000) { > *((*ptr)++) = > (char)(((bech >> 12) & 0x0000000F) | (0x000000E0)); > *((*ptr)++) = > (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = > (char)(((bech) & 0x0000003F) | (0x00000080)); > return; > } > > if (bech < 0x00200000) { > *((*ptr)++) = > (char)(((bech >> 18) & 0x00000007) | (0x000000F0)); > *((*ptr)++) = > (char)(((bech >> 12) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = > (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = > (char)(((bech) & 0x0000003F) | (0x00000080)); > return; > } > > if (bech < 0x04000000) { > *((*ptr)++) = > (char)(((bech >> 24) & 0x00000003) | (0x000000F8)); > *((*ptr)++) = > (char)(((bech >> 18) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = > (char)(((bech >> 12) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = > (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = > (char)(((bech) & 0x0000003F) | (0x00000080)); > return; > } > > *((*ptr)++) = (char)(((bech >> 30) & 0x00000001) | (0x000000FC)); > *((*ptr)++) = (char)(((bech >> 24) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = (char)(((bech >> 18) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = (char)(((bech >> 12) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); > *((*ptr)++) = (char)(((bech) & 0x0000003F) | (0x00000080)); > } > > > /* get the number of UTF-8 characters in a string */ > int > utf8len(char *p, char *e) > { > int n; > > n = 0; > while (*p && p < e) { > n++; > > if (!(*p & 0x80)) { > p++; > continue; > } else if ((*p & 0xE0) == 0xC0) { > p+=2; > continue; > } else if ((*p & 0xF0) == 0xE0) { > p+=3; > continue; > } else if ((*p & 0xF8) == 0xF0) { > p+=4; > continue; > } else if ((*p & 0xFC) == 0xF8) { > p+=5; > continue; > } else if ((*p & 0xFE) == 0xFC) { > p+=6; > continue; > } else > return (n); > } > > return (n); > } > > > /* get the number of bytes the character takes when UTF-8 encoded */ > int > utf8size(widechar_t ch) > { > if (ch < 0x00000080) > return(1); > > if (ch < 0x00000800) > return(2); > > if (ch < 0x00010000) > return(3); > > if (ch < 0x00200000) > return(4); > > if (ch < 0x04000000) > return(5); > > return (6); > } > > > > /* > * $Id: hash.c,v 1.2 2007/02/15 14:16:39 tin Exp $ > * > * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> > * > * Permission to use, copy, modify, and distribute this software for any > * purpose with or without fee is hereby granted, provided that the above > * copyright notice and this permission notice appear in all copies. > * > * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL > WARRANTIES > * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > */ > > #include <postgres.h> > #include <fmgr.h> > #include "lang.h" > #include "utf8.h" > > #ifdef PG_MODULE_MAGIC > PG_MODULE_MAGIC; > #endif > > > PG_FUNCTION_INFO_V1(HASH); > > > Datum > HASH(PG_FUNCTION_ARGS) > { > static widechar_t input[sizeof(ABC_U)]; > static int value[sizeof(ABC_U)]; > static int count = -1; > > text *in, *out; > char *pi, *po, *limit; > widechar_t wc1, wc2; > unsigned int i, size, val; > > if (count < 0) { > count = 0; > val = 0; > pi = ABC_L; > po = ABC_U; > > while ((wc1 = utf8get(&pi))) { > wc2 = utf8get(&po); > if (wc1 == (widechar_t)',') { > val++; > continue; > } > input[count] = wc1; > value[count] = val; > if(wc1 != wc2) > { > count++; > input[count] = wc2; > value[count] = val; > } > count++; > } > } > > in = PG_GETARG_TEXT_P(0); > limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ; > > size = VARHDRSZ > + (2 * utf8len((char *)VARDATA(in), limit)); > out = (text *)palloc(size); > VARATT_SIZEP(out) = size; > > pi = (char *)VARDATA(in); > po = (char *)VARDATA(out); > > while (pi < limit) { > wc1 = utf8get(&pi); > for (i=0; i<count; i++) { > if(input[i] == wc1) > break; > } > > if (i < count) { > po[0] = 65 + (value[i] / 26); > po[1] = 65 + (value[i] % 26); > } else { > po[0] = po[1] = 'Z'; > } > > po += 2; > } > > PG_RETURN_TEXT_P(out); > } > > > > /* > * $Id: lower.c,v 1.2 2007/02/15 14:16:39 tin Exp $ > * > * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> > * > * Permission to use, copy, modify, and distribute this software for any > * purpose with or without fee is hereby granted, provided that the above > * copyright notice and this permission notice appear in all copies. > * > * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL > WARRANTIES > * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > */ > > #include <postgres.h> > #include <fmgr.h> > #include "lang.h" > #include "utf8.h" > > #ifdef PG_MODULE_MAGIC > PG_MODULE_MAGIC; > #endif > > > PG_FUNCTION_INFO_V1(LOWER); > > > Datum > LOWER(PG_FUNCTION_ARGS) > { > static widechar_t input[sizeof(ABC_U)]; > static widechar_t output[sizeof(ABC_U)]; > static int diff[sizeof(ABC_U)]; > static int count = -1; > > text *in, *out; > char *pi, *po, *limit; > widechar_t wc1, wc2; > unsigned int i, size; > > if (count < 0) { > count = 0; > pi = ABC_U; > po = ABC_L; > > while ((wc1 = utf8get(&pi))) { > wc2 = utf8get(&po); > if (wc1 != wc2) { > input[count] = wc1; > output[count] = wc2; > diff[count] = utf8size(wc2) - utf8size(wc1); > count++; > } > } > } > > in = PG_GETARG_TEXT_P(0); > limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ; > > size = VARSIZE(in); > out = (text *)palloc(size); > > pi = (char *)VARDATA(in); > po = (char *)VARDATA(out); > > while (pi < limit) { > wc1 = utf8get(&pi); > > for (i=0; i<count; i++) { > if (input[i] == wc1) > break; > } > > if (i < count) { > if (diff[i]) { > text *new_out; > > new_out = (text *)palloc(size + diff[i]); > memcpy(VARDATA(new_out), VARDATA(out), > size - VARHDRSZ); > > po = VARDATA(new_out) + (po - VARDATA(out)); > > pfree(out); > out = new_out; > } > utf8put(&po, output[i]); > > } else > utf8put(&po, wc1); > } > > VARATT_SIZEP(out) = size; > > PG_RETURN_TEXT_P(out); > } > > > > /* > * $Id: upper.c,v 1.2 2007/02/15 14:16:39 tin Exp $ > * > * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> > * > * Permission to use, copy, modify, and distribute this software for any > * purpose with or without fee is hereby granted, provided that the above > * copyright notice and this permission notice appear in all copies. > * > * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL > WARRANTIES > * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > */ > > #include <postgres.h> > #include <fmgr.h> > #include "lang.h" > #include "utf8.h" > > #ifdef PG_MODULE_MAGIC > PG_MODULE_MAGIC; > #endif > > > PG_FUNCTION_INFO_V1(UPPER); > > > Datum > UPPER(PG_FUNCTION_ARGS) > { > static widechar_t input[sizeof(ABC_U)]; > static widechar_t output[sizeof(ABC_U)]; > static int diff[sizeof(ABC_U)]; > static int count = -1; > > text *in, *out; > char *pi, *po, *limit; > widechar_t wc1, wc2; > unsigned int i, size; > > if (count < 0) { > count = 0; > pi = ABC_L; > po = ABC_U; > > while ((wc1 = utf8get(&pi))) { > wc2 = utf8get(&po); > if (wc1 != wc2) { > input[count] = wc1; > output[count] = wc2; > diff[count] = utf8size(wc2) - utf8size(wc1); > count++; > } > } > } > > in = PG_GETARG_TEXT_P(0); > limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ; > > size = VARSIZE(in); > out = (text *)palloc(size); > > pi = (char *)VARDATA(in); > po = (char *)VARDATA(out); > > while (pi < limit) { > wc1 = utf8get(&pi); > > for (i=0; i<count; i++) { > if (input[i] == wc1) > break; > } > > if (i < count) { > if (diff[i]) { > text *new_out; > > new_out = (text *)palloc(size + diff[i]); > memcpy(VARDATA(new_out), VARDATA(out), > size - VARHDRSZ); > > po = VARDATA(new_out) + (po - VARDATA(out)); > > pfree(out); > out = new_out; > } > utf8put(&po, output[i]); > > } else > utf8put(&po, wc1); > } > > VARATT_SIZEP(out) = size; > > PG_RETURN_TEXT_P(out); > } > > > -- > Tin