Hi,

it's simple - I hope :)

> So all I have to do is replace char* StringMgr::upperUTF8(char* t,
> const unsigned int maxlen)?

Reimplement upperUTF8 and then tell Sword to use an instance of your 
reimplementation to handle Unicode/Latin1 Strings with 
StringMgr::setSystemStringMgr.

> If so, then what is maxlen for? Is it expecting t to be overwritten or
> just a maximum buffer allocated? I assume that sword will dealloc any
> buffer I return.

If maxlen is > 0, only the first maxlen chars are upper-cased. It expects t to 
be overwritten. No buffers are allocated within upperUTF8. I attached the 
BTStringMgr we use in BibleTime.
I advise checking whether a string contains Unicode chars before upper-casing 
it with the Unicode routines. Doing that check first is a lot faster than 
always taking the Unicode path.

I hope that helps. And yes, we need better documentation :)

If you return true in supportsUnicode then LocaleMgr will only load locales 
which are in UTF-8, so you can be sure that all verse keys are in UTF-8.

Let me know if you need help,
Joachim
-- 
<>< Re: deemed!
//
// C++ Implementation: btstringmgr
//
// Description: 
//
//
// Author: The BibleTime team <[EMAIL PROTECTED]>, (C) 2004
//
// Copyright: See COPYING file that comes with this distribution
//
//

#include "btstringmgr.h"

//System includes
#include <ctype.h>
#include <string.h>

/**
 * Overwrites dest with the already upper-cased UTF-8 string upper.
 *
 * Never writes more than maxlen bytes when maxlen > 0, and never more than
 * the existing string plus its terminator when maxlen == 0. If the result
 * does not fit, it is cut at a character boundary so that no multi-byte
 * sequence of upper is split.
 */
static void overwriteWithUpperUtf8(char* dest, const char* upper, const unsigned int maxlen) {
	const size_t destLen = strlen(dest);
	const size_t upperLen = strlen(upper);

	// Number of bytes we are allowed to touch, terminator included.
	const size_t capacity = (maxlen > 0) ? maxlen : destLen + 1;

	if (upperLen < capacity) {
		// The whole result fits; copy it together with its terminator.
		memcpy(dest, upper, upperLen + 1);
		return;
	}

	// The result is too long for the window. If the original text continues
	// past the window, its tail (and terminator) stays in place and we fill
	// the window completely; otherwise we must reserve one byte to terminate.
	const bool tailRemains = (destLen >= capacity);
	size_t count = tailRemains ? capacity : capacity - 1;

	// Step back while the first byte we would drop is a continuation byte
	// (10xxxxxx), i.e. while the cut would land inside a character.
	while (count > 0 && (static_cast<unsigned char>(upper[count]) & 0xC0) == 0x80) {
		--count;
	}

	memcpy(dest, upper, count);
	if (!tailRemains) {
		dest[count] = '\0';
	}
}

/**
 * Upper-cases text in place.
 *
 * If isUtf8() recognises the text as UTF-8, the conversion is done through
 * QString; otherwise each byte is passed through toupper().
 *
 * @param text   NUL-terminated string that is overwritten; no buffer is allocated.
 * @param maxlen If greater than 0, at most this many bytes of text are
 *               rewritten; 0 means the whole string.
 * @return text, the same pointer that was passed in.
 */
char* BTStringMgr::upperUTF8(char* text, const unsigned int maxlen) {
	if (!text) {
		return text;
	}

	if (isUtf8(text)) {
		// The temporary returned by utf8() lives until the end of this statement,
		// which is long enough for the helper to copy from it.
		overwriteWithUpperUtf8(text, static_cast<const char*>(QString::fromUtf8(text).upper().utf8()), maxlen);
		return text;
	}

	// Not UTF-8 (this includes pure 7-bit text): upper-case byte by byte,
	// honouring maxlen just like the UTF-8 path does.
	unsigned int converted = 0;
	for (char* p = text; *p && (maxlen == 0 || converted < maxlen); ++p, ++converted) {
		// toupper() requires a value representable as unsigned char.
		*p = static_cast<char>(toupper(static_cast<unsigned char>(*p)));
	}

	return text;
}

/**
 * Upper-cases a NUL-terminated single-byte string in place using toupper().
 *
 * @param text String to convert; it is overwritten. A null pointer is returned unchanged.
 * @return text, the same pointer that was passed in.
 */
char* BTStringMgr::upperLatin1(char* text) {
	if (!text) {
		return text;
	}

	for (char* p = text; *p; ++p) {
		// Read, convert and store in separate steps: writing through "*text++"
		// while also reading "*text" is unsequenced before C++17. The cast to
		// unsigned char keeps bytes above 0x7F from reaching toupper() as
		// negative values.
		*p = static_cast<char>(toupper(static_cast<unsigned char>(*p)));
	}

	return text;
}

/**
 * Reports whether this string manager can work with Unicode text.
 *
 * @return Always true.
 */
const bool BTStringMgr::supportsUnicode() const {
	const bool unicodeCapable = true;
	return unicodeCapable;
}

/**
 * Heuristically decides whether a NUL-terminated buffer holds UTF-8 text.
 *
 * Returns true only if every multi-byte sequence is well formed, every
 * 7-bit byte is an ordinary text character according to the table below,
 * and at least one complete multi-byte sequence was seen. Pure 7-bit text
 * therefore yields false. A sequence that is cut off by the end of the
 * string is tolerated: the result then depends on the sequences seen before it.
 *
 * @param buf NUL-terminated bytes to inspect; a null pointer yields false.
 * @return true if the buffer looks like UTF-8 with at least one non-ASCII character.
 */
const bool BTStringMgr::isUtf8(const char *buf) {
  if (!buf)
    return false;

#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

  static const unsigned char text_chars[256] = {
        /*                  BEL BS HT LF    FF CR    */
        F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
        F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
        /*            NEL                            */
        X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
        X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
  };

  bool gotone = false;   /* set once a complete multi-byte sequence was seen */

  for (int i = 0; buf[i]; i++) {
    unsigned char c = buf[i];

    if ((c & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
      /*
       * Even if the whole buffer is valid UTF-8 sequences,
       * still reject it if it uses weird control characters.
       */
      if (text_chars[c] != T)
        return false;
      continue;
    }

    if ((c & 0x40) == 0)          /* 10xxxxxx never 1st byte */
      return false;

    /* 11xxxxxx begins UTF-8: the lead byte encodes the number of continuation bytes */
    int following;
    if ((c & 0x20) == 0) {        /* 110xxxxx */
      following = 1;
    } else if ((c & 0x10) == 0) { /* 1110xxxx */
      following = 2;
    } else if ((c & 0x08) == 0) { /* 11110xxx */
      following = 3;
    } else if ((c & 0x04) == 0) { /* 111110xx */
      following = 4;
    } else if ((c & 0x02) == 0) { /* 1111110x */
      following = 5;
    } else {
      return false;
    }

    for (int n = 0; n < following; n++) {
      i++;
      c = buf[i];
      if (!c)
        return gotone;            /* string ended inside a sequence */

      if ((c & 0x80) == 0 || (c & 0x40))
        return false;             /* continuation bytes must be 10xxxxxx */
    }
    gotone = true;
  }

  return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
}

#undef F
#undef T
#undef I
#undef X
//
// C++ Interface: btstringmgr
//
// Description: 
//
//
// Author: The BibleTime team <[EMAIL PROTECTED]>, (C) 2004
//
// Copyright: See COPYING file that comes with this distribution
//
//
#ifndef BTSTRINGMGR_H
#define BTSTRINGMGR_H

//Sword includes
#include <stringmgr.h>

//Qt includes
#include <qstring.h>

using namespace sword;

/**
 * StringMgr subclass that provides the upper-casing routines
 * for UTF-8 and Latin-1 strings.
 */
class BTStringMgr : public StringMgr {
public:
	/** Converts the given UTF-8 string to upper case in place.
	* @param text The NUL-terminated text, encoded in UTF-8, which is turned into an upper case string
	* @param maxlen Optional limit for the conversion; the default 0 means no limit
	* @return The pointer passed in as text
	*/	
	virtual char* upperUTF8(char* text, const unsigned int maxlen = 0);
	
	/** Converts the given Latin-1 string to upper case in place.
	* @param text The NUL-terminated text, encoded in Latin-1, which is turned into an upper case string
	* @return The pointer passed in as text
	*/	
	virtual char* upperLatin1(char* text);

protected:
	/** Tells whether this string manager supports Unicode.
	* @return true if Unicode is supported
	*/
	virtual const bool supportsUnicode() const;
	
	/** CODE TAKEN FROM KDELIBS 3.2
	* This function checks whether a string is utf8 or not.
	*
	* It was taken from kdelibs so we do not depend on KDE 3.2.
	* @param buf The NUL-terminated string to check
	* @return true if the string is considered to be UTF-8
	*/
	const bool isUtf8(const char *buf);
};

#endif

Attachment: pgp0cAaMq14LZ.pgp
Description: PGP signature

_______________________________________________
sword-devel mailing list
[EMAIL PROTECTED]
http://www.crosswire.org/mailman/listinfo/sword-devel

Reply via email to