Changeset: 6e2781eea9c0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6e2781eea9c0
Modified Files:
        gdk/shared_memory.c
        monetdb5/extras/pyapi/Tests/pyapi_types_string.malC
        monetdb5/extras/pyapi/pyapi.c
        monetdb5/extras/pyapi/unicode.c
        monetdb5/extras/pyapi/unicode.h
Branch: pyapi
Log Message:

Now create an NPY_STRING array if the BAT contains only ASCII strings, i.e. no 
non-ASCII (multi-byte UTF-8) characters (rather than always creating an 
NPY_UNICODE array).


diffs (truncated from 476 to 300 lines):

diff --git a/gdk/shared_memory.c b/gdk/shared_memory.c
--- a/gdk/shared_memory.c
+++ b/gdk/shared_memory.c
@@ -256,6 +256,8 @@ int release_process_semaphore(int sem_id
 }
 #else
 //Windows -> Not yet implemented
+#include <stdio.h>
+#include <stdlib.h>
 
 #define NOTIMPLEMENTED() { \
     printf("FATAL ERROR: Shared memory isn't implemented on Windows yet.\n"); \
diff --git a/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC 
b/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC
--- a/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC
+++ b/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC
@@ -5,7 +5,7 @@ bat.append(bstr,"asdf":str);
 bat.append(bstr,"sd asd asd asd asd a":str);
 bat.append(bstr,"test123":str);
 bat.append(bstr,"test":str);
-rstr:bat[:oid,:str] := pyapi.eval(nil:ptr,"return(arg1)",bstr);
+rstr:bat[:oid,:str] := pyapi.eval(nil:ptr,"print(arg1)\nreturn(arg1)",bstr);
 io.print(rstr);
 
 # strings with nil value
diff --git a/monetdb5/extras/pyapi/pyapi.c b/monetdb5/extras/pyapi/pyapi.c
--- a/monetdb5/extras/pyapi/pyapi.c
+++ b/monetdb5/extras/pyapi/pyapi.c
@@ -117,7 +117,6 @@ int PyAPIEnabled(void) {
             || GDKgetenv_isyes(pyapi_enableflag));
 }
 
-
 char* FormatCode(char* code, char **args, size_t argcount, size_t tabwidth);
 
 // TODO: exclude pyapi from mergetable, too
@@ -375,6 +374,7 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
     PyInput *pyinput_values = NULL;
     int seqbase = 0;
 
+    bool numpy_string_array = true;
     bool option_verbose = GDKgetenv_isyes(verbose_enableflag) || 
GDKgetenv_istrue(verbose_enableflag);
     bool option_debug = GDKgetenv_isyes(debug_enableflag) || 
GDKgetenv_istrue(debug_enableflag);
     bool option_zerocopy = !(GDKgetenv_isyes(zerocopy_disableflag) || 
GDKgetenv_istrue(zerocopy_disableflag));
@@ -385,7 +385,7 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
     int sem_id = -1;
     int process_id = 0;
     int memory_size = 0;
-    int process_count;
+    int process_count = 0;
 #endif
 
     size_t count;
@@ -773,6 +773,8 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
                 goto wrapup;
             }
 
+            VERBOSE_MESSAGE("- Loading a BAT of type %s (%d)\n", 
BatType_Format(inp->bat_type), inp->bat_type);
+
 #ifndef WIN32
             if (mapped && process_id && process_count > 1)
             {
@@ -781,15 +783,14 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
                 double chunk = process_id - 1;
                 double totalchunks = process_count;
                 double count = BATcount(b);
-                t_start = ceil((count * chunk) / totalchunks);
-                t_end = floor((count * (chunk + 1)) / totalchunks);
-                if (((int)count) / 2 * 2 == (int)count) t_end--;
+                if (count >= process_count) {
+                    t_start = ceil((count * chunk) / totalchunks);
+                    t_end = floor((count * (chunk + 1)) / totalchunks);
+                    if (((int)count) / 2 * 2 == (int)count) t_end--;
+                    VERBOSE_MESSAGE("---Start: %d, End: %d, Count: %d\n", 
t_start, t_end, t_end - t_start);
+                }
             }
 #endif
-            VERBOSE_MESSAGE("Start: %d, End: %d, Count: %d\n", t_start, t_end, 
t_end - t_start);
-
-            VERBOSE_MESSAGE("- Loading a BAT of type %s (%d)\n", 
BatType_Format(inp->bat_type), inp->bat_type);
-
             switch (inp->bat_type) {
             case TYPE_bte:
                 vararray = BAT_TO_NP(b, bte, NPY_INT8);
@@ -810,60 +811,150 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
                 vararray = BAT_TO_NP(b, dbl, NPY_FLOAT64);
                 break;
             case TYPE_str:
-                li = bat_iterator(b);
+                if (numpy_string_array) {
+                    bool unicode = false;
 
-                //we first loop over all the strings in the BAT to find the 
maximum length of a single string
-                //this is because NUMPY only supports strings with a fixed 
maximum length
-                maxsize = 0;
-                count = inp->count;
-                BATloop(b, p, q)
-                {
-                    const char *t = (const char *) BUNtail(li, p);
-                    const size_t length = strlen(t);//utf8_strlen(t); //get 
the amount of UTF-8 characters in the string
+                    li = bat_iterator(b);
 
-                    if (length > maxsize)
-                        maxsize = length;
+                    //we first loop over all the strings in the BAT to find 
the maximum length of a single string
+                    //this is because NUMPY only supports strings with a fixed 
maximum length
+                    maxsize = 0;
+                    count = inp->count;
+                    j = 0;
+                    BATloop(b, p, q) {
+                        if (j >= t_start) {
+                            bool ascii;
+                            const char *t = (const char *) BUNtail(li, p);
+                            size_t length;
+                            if (strcmp(t, str_nil) == 0) {
+                                length = 1;
+                            } else {
+                                length = utf8_strlen(t, &ascii); //get the 
amount of UTF-8 characters in the string
+                                unicode = !ascii || unicode; //if even one 
string is unicode we have to store the entire array as unicode
+                            }
+                            if (length > maxsize)
+                                maxsize = length;
+                        }
+                        if (j == t_end) break;
+                        j++;
+                    }
+                    if (unicode) {
+                        VERBOSE_MESSAGE("- Unicode string!\n");
+                        //create a NPY_UNICODE array object
+                        vararray = PyArray_New(
+                            &PyArray_Type, 
+                            1, 
+                            (npy_intp[1]) {t_end - t_start},  
+                            NPY_UNICODE, 
+                            NULL, 
+                            NULL, 
+                            maxsize * 4,  //we have to do maxsize*4 because 
NPY_UNICODE is stored as UNICODE-32 (i.e. 4 bytes per character)           
+                            0, 
+                            NULL);
+                        //fill the NPY_UNICODE array object using the 
PyArray_SETITEM function
+                        j = 0;
+                        BATloop(b, p, q)
+                        {
+                            if (j >= t_start) {
+                                char *t = (char *) BUNtail(li, p);
+                                PyObject *obj;
+                                if (strcmp(t, str_nil) == 0) {
+                                     //str_nil isn't a valid UTF-8 character 
(it's 0x80), so we can't decode it as UTF-8 (it will throw an error)
+                                    obj = PyUnicode_FromString("-");
+                                }
+                                else {
+                                    //otherwise we can just decode the string 
as UTF-8
+                                    obj = PyUnicode_FromString(t);
+                                }
+
+                                if (obj == NULL)
+                                {
+                                    PyErr_Print();
+                                    msg = createException(MAL, "pyapi.eval", 
"Failed to decode string as UTF-8.");
+                                    goto wrapup;
+                                }
+                                PyArray_SETITEM((PyArrayObject*)vararray, 
PyArray_GETPTR1((PyArrayObject*)vararray, j), obj);
+                            }
+                            if (j == t_end) break;
+                            j++;
+                        }
+                    } else {
+                        VERBOSE_MESSAGE("- ASCII string!\n");
+                        //create a NPY_STRING array object
+                        vararray = PyArray_New(
+                            &PyArray_Type, 
+                            1, 
+                            (npy_intp[1]) {t_end - t_start},  
+                            NPY_STRING, 
+                            NULL, 
+                            NULL, 
+                            maxsize,
+                            0, 
+                            NULL);
+                        j = 0;
+                        BATloop(b, p, q)
+                        {
+                            if (j >= t_start) {
+                                char *t = (char *) BUNtail(li, p);
+                                PyObject *obj = PyString_FromString(t);
+
+                                if (obj == NULL)
+                                {
+                                    msg = createException(MAL, "pyapi.eval", 
"Failed to create string.");
+                                    goto wrapup;
+                                }
+                                PyArray_SETITEM((PyArrayObject*)vararray, 
PyArray_GETPTR1((PyArrayObject*)vararray, j), obj);
+                            }
+                            if (j == t_end) break;
+                            j++;
+                        }
+                    }
                 }
+                else {
+                    // TODO: This
+                    // NPY_OBJECT array
+                    // vararray = PyArray_New(
+                    //     &PyArray_Type, 
+                    //     1, 
+                    //     (npy_intp[1]) {count},  
+                    //     NPY_OBJECT, 
+                    //     NULL, 
+                    //     NULL, 
+                    //     0, 
+                    //     0, 
+                    //     NULL);
+                    // j = 0;
+                    // BATloop(b, p, q)
+                    // {
+                    //     if (j >= t_start) {
+                    //         char *t = (char *) BUNtail(li, p);
+                    //         PyObject *obj;
+                    //         if (strcmp(t, str_nil) == 0) {
+                    //              //str_nil isn't a valid UTF-8 character 
(it's 0x80), so we can't decode it as UTF-8 (it will throw an error)
+                    //             obj = PyString_FromString("-");
+                    //         }
+                    //         else {
+                    //             //otherwise we can just decode the string 
as UTF-8
+                    //             obj = PyString_FromString(t);
+                    //         }
 
-                //create a NPY_UNICODE array object
-                vararray = PyArray_New(
-                    &PyArray_Type, 
-                    1, 
-                    (npy_intp[1]) {count},  
-                    NPY_UNICODE, 
-                    NULL, 
-                    NULL, 
-                    maxsize * 4,  //we have to do maxsize*4 because 
NPY_UNICODE is stored as UNICODE-32 (i.e. 4 bytes per character)           
-                    0, 
-                    NULL);
-
-                //fill the NPY_UNICODE array object using the PyArray_SETITEM 
function
-                j = 0;
-                BATloop(b, p, q)
-                {
-                    const char *t = (const char *) BUNtail(li, p);
-                    PyObject *obj;
-                    if (strcmp(t, str_nil) == 0) {
-                         //str_nil isn't a valid UTF-8 character (it's 0x80), 
so we can't decode it as UTF-8 (it will throw an error)
-                        obj = PyUnicode_FromString("-");
-                    }
-                    else {
-                        //otherwise we can just decode the string as UTF-8
-                        obj = PyUnicode_FromString(t);
-                    }
-
-                    if (obj == NULL)
-                    {
-                        PyErr_Print();
-                        msg = createException(MAL, "pyapi.eval", "Failed to 
decode string as UTF-8.");
-                        goto wrapup;
-                    }
-                    PyArray_SETITEM((PyArrayObject*)vararray, 
PyArray_GETPTR1((PyArrayObject*)vararray, j), obj);
-                    j++;
+                    //         if (obj == NULL)
+                    //         {
+                    //             PyErr_Print();
+                    //             msg = createException(MAL, "pyapi.eval", 
"Failed to decode string as UTF-8.");
+                    //             goto wrapup;
+                    //         }
+                    //         PyArray_SETITEM((PyArrayObject*)vararray, 
PyArray_GETPTR1((PyArrayObject*)vararray, j), obj);
+                    //     }
+                    //     if (j == t_end) break;
+                    //     j++;
+                    // }
+                    // PyArray_INCREF((PyArrayObject*)vararray);
                 }
                 break;
 #ifdef HAVE_HGE
             case TYPE_hge:
+            {
                 li = bat_iterator(b);
                 count = inp->count;
 
@@ -883,7 +974,6 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
                 fprintf(stderr, "!WARNING: Type \"hge\" (128 bit) is 
unsupported by Numpy. The numbers are instead converted to python objects of 
type \"long\". This is likely very slow.\n");
                 BATloop(b, p, q) {
                     char hex[40];
-                    //we first convert the huge to a string in hex format
                     PyObject *obj;
                     const hge *t = (const hge *) BUNtail(li, p);
                     hge_to_string(hex, 40, *t);
@@ -898,6 +988,7 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
                     j++;
                 }
                 break;
+            }
 #endif
             default:
                 msg = createException(MAL, "pyapi.eval", "unknown argument 
type ");
@@ -947,7 +1038,6 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
         }
     }
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to