Changeset: 6e2781eea9c0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6e2781eea9c0
Modified Files:
	gdk/shared_memory.c
	monetdb5/extras/pyapi/Tests/pyapi_types_string.malC
	monetdb5/extras/pyapi/pyapi.c
	monetdb5/extras/pyapi/unicode.c
	monetdb5/extras/pyapi/unicode.h
Branch: pyapi
Log Message:
Now create a NPY_STRING array if there are no unicode characters in the BAT (rather than always creating a NPY_UNICODE array). diffs (truncated from 476 to 300 lines): diff --git a/gdk/shared_memory.c b/gdk/shared_memory.c --- a/gdk/shared_memory.c +++ b/gdk/shared_memory.c @@ -256,6 +256,8 @@ int release_process_semaphore(int sem_id } #else //Windows -> Not yet implemented +#include <stdio.h> +#include <stdlib.h> #define NOTIMPLEMENTED() { \ printf("FATAL ERROR: Shared memory isn't implemented on Windows yet.\n"); \ diff --git a/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC b/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC --- a/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC +++ b/monetdb5/extras/pyapi/Tests/pyapi_types_string.malC @@ -5,7 +5,7 @@ bat.append(bstr,"asdf":str); bat.append(bstr,"sd asd asd asd asd a":str); bat.append(bstr,"test123":str); bat.append(bstr,"test":str); -rstr:bat[:oid,:str] := pyapi.eval(nil:ptr,"return(arg1)",bstr); +rstr:bat[:oid,:str] := pyapi.eval(nil:ptr,"print(arg1)\nreturn(arg1)",bstr); io.print(rstr); # strings with nil value diff --git a/monetdb5/extras/pyapi/pyapi.c b/monetdb5/extras/pyapi/pyapi.c --- a/monetdb5/extras/pyapi/pyapi.c +++ b/monetdb5/extras/pyapi/pyapi.c @@ -117,7 +117,6 @@ int PyAPIEnabled(void) { || GDKgetenv_isyes(pyapi_enableflag)); } - char* FormatCode(char* code, char **args, size_t argcount, size_t tabwidth); // TODO: exclude pyapi from mergetable, too @@ -375,6 +374,7 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st PyInput *pyinput_values = NULL; int seqbase = 0; + bool numpy_string_array = true; bool option_verbose = GDKgetenv_isyes(verbose_enableflag) || GDKgetenv_istrue(verbose_enableflag); bool option_debug = GDKgetenv_isyes(debug_enableflag) || GDKgetenv_istrue(debug_enableflag); bool option_zerocopy = !(GDKgetenv_isyes(zerocopy_disableflag) || GDKgetenv_istrue(zerocopy_disableflag)); @@ -385,7 +385,7 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st int sem_id = -1; int process_id = 0; int 
memory_size = 0; - int process_count; + int process_count = 0; #endif size_t count; @@ -773,6 +773,8 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st goto wrapup; } + VERBOSE_MESSAGE("- Loading a BAT of type %s (%d)\n", BatType_Format(inp->bat_type), inp->bat_type); + #ifndef WIN32 if (mapped && process_id && process_count > 1) { @@ -781,15 +783,14 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st double chunk = process_id - 1; double totalchunks = process_count; double count = BATcount(b); - t_start = ceil((count * chunk) / totalchunks); - t_end = floor((count * (chunk + 1)) / totalchunks); - if (((int)count) / 2 * 2 == (int)count) t_end--; + if (count >= process_count) { + t_start = ceil((count * chunk) / totalchunks); + t_end = floor((count * (chunk + 1)) / totalchunks); + if (((int)count) / 2 * 2 == (int)count) t_end--; + VERBOSE_MESSAGE("---Start: %d, End: %d, Count: %d\n", t_start, t_end, t_end - t_start); + } } #endif - VERBOSE_MESSAGE("Start: %d, End: %d, Count: %d\n", t_start, t_end, t_end - t_start); - - VERBOSE_MESSAGE("- Loading a BAT of type %s (%d)\n", BatType_Format(inp->bat_type), inp->bat_type); - switch (inp->bat_type) { case TYPE_bte: vararray = BAT_TO_NP(b, bte, NPY_INT8); @@ -810,60 +811,150 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st vararray = BAT_TO_NP(b, dbl, NPY_FLOAT64); break; case TYPE_str: - li = bat_iterator(b); + if (numpy_string_array) { + bool unicode = false; - //we first loop over all the strings in the BAT to find the maximum length of a single string - //this is because NUMPY only supports strings with a fixed maximum length - maxsize = 0; - count = inp->count; - BATloop(b, p, q) - { - const char *t = (const char *) BUNtail(li, p); - const size_t length = strlen(t);//utf8_strlen(t); //get the amount of UTF-8 characters in the string + li = bat_iterator(b); - if (length > maxsize) - maxsize = length; + //we first loop over all the strings in the BAT to find the maximum length of a single string + //this is because NUMPY only supports 
strings with a fixed maximum length + maxsize = 0; + count = inp->count; + j = 0; + BATloop(b, p, q) { + if (j >= t_start) { + bool ascii; + const char *t = (const char *) BUNtail(li, p); + size_t length; + if (strcmp(t, str_nil) == 0) { + length = 1; + } else { + length = utf8_strlen(t, &ascii); //get the amount of UTF-8 characters in the string + unicode = !ascii || unicode; //if even one string is unicode we have to store the entire array as unicode + } + if (length > maxsize) + maxsize = length; + } + if (j == t_end) break; + j++; + } + if (unicode) { + VERBOSE_MESSAGE("- Unicode string!\n"); + //create a NPY_UNICODE array object + vararray = PyArray_New( + &PyArray_Type, + 1, + (npy_intp[1]) {t_end - t_start}, + NPY_UNICODE, + NULL, + NULL, + maxsize * 4, //we have to do maxsize*4 because NPY_UNICODE is stored as UNICODE-32 (i.e. 4 bytes per character) + 0, + NULL); + //fill the NPY_UNICODE array object using the PyArray_SETITEM function + j = 0; + BATloop(b, p, q) + { + if (j >= t_start) { + char *t = (char *) BUNtail(li, p); + PyObject *obj; + if (strcmp(t, str_nil) == 0) { + //str_nil isn't a valid UTF-8 character (it's 0x80), so we can't decode it as UTF-8 (it will throw an error) + obj = PyUnicode_FromString("-"); + } + else { + //otherwise we can just decode the string as UTF-8 + obj = PyUnicode_FromString(t); + } + + if (obj == NULL) + { + PyErr_Print(); + msg = createException(MAL, "pyapi.eval", "Failed to decode string as UTF-8."); + goto wrapup; + } + PyArray_SETITEM((PyArrayObject*)vararray, PyArray_GETPTR1((PyArrayObject*)vararray, j), obj); + } + if (j == t_end) break; + j++; + } + } else { + VERBOSE_MESSAGE("- ASCII string!\n"); + //create a NPY_STRING array object + vararray = PyArray_New( + &PyArray_Type, + 1, + (npy_intp[1]) {t_end - t_start}, + NPY_STRING, + NULL, + NULL, + maxsize, + 0, + NULL); + j = 0; + BATloop(b, p, q) + { + if (j >= t_start) { + char *t = (char *) BUNtail(li, p); + PyObject *obj = PyString_FromString(t); + + if (obj == 
NULL) + { + msg = createException(MAL, "pyapi.eval", "Failed to create string."); + goto wrapup; + } + PyArray_SETITEM((PyArrayObject*)vararray, PyArray_GETPTR1((PyArrayObject*)vararray, j), obj); + } + if (j == t_end) break; + j++; + } + } } + else { + // TODO: This + // NPY_OBJECT array + // vararray = PyArray_New( + // &PyArray_Type, + // 1, + // (npy_intp[1]) {count}, + // NPY_OBJECT, + // NULL, + // NULL, + // 0, + // 0, + // NULL); + // j = 0; + // BATloop(b, p, q) + // { + // if (j >= t_start) { + // char *t = (char *) BUNtail(li, p); + // PyObject *obj; + // if (strcmp(t, str_nil) == 0) { + // //str_nil isn't a valid UTF-8 character (it's 0x80), so we can't decode it as UTF-8 (it will throw an error) + // obj = PyString_FromString("-"); + // } + // else { + // //otherwise we can just decode the string as UTF-8 + // obj = PyString_FromString(t); + // } - //create a NPY_UNICODE array object - vararray = PyArray_New( - &PyArray_Type, - 1, - (npy_intp[1]) {count}, - NPY_UNICODE, - NULL, - NULL, - maxsize * 4, //we have to do maxsize*4 because NPY_UNICODE is stored as UNICODE-32 (i.e. 
4 bytes per character) - 0, - NULL); - - //fill the NPY_UNICODE array object using the PyArray_SETITEM function - j = 0; - BATloop(b, p, q) - { - const char *t = (const char *) BUNtail(li, p); - PyObject *obj; - if (strcmp(t, str_nil) == 0) { - //str_nil isn't a valid UTF-8 character (it's 0x80), so we can't decode it as UTF-8 (it will throw an error) - obj = PyUnicode_FromString("-"); - } - else { - //otherwise we can just decode the string as UTF-8 - obj = PyUnicode_FromString(t); - } - - if (obj == NULL) - { - PyErr_Print(); - msg = createException(MAL, "pyapi.eval", "Failed to decode string as UTF-8."); - goto wrapup; - } - PyArray_SETITEM((PyArrayObject*)vararray, PyArray_GETPTR1((PyArrayObject*)vararray, j), obj); - j++; + // if (obj == NULL) + // { + // PyErr_Print(); + // msg = createException(MAL, "pyapi.eval", "Failed to decode string as UTF-8."); + // goto wrapup; + // } + // PyArray_SETITEM((PyArrayObject*)vararray, PyArray_GETPTR1((PyArrayObject*)vararray, j), obj); + // } + // if (j == t_end) break; + // j++; + // } + // PyArray_INCREF((PyArrayObject*)vararray); } break; #ifdef HAVE_HGE case TYPE_hge: + { li = bat_iterator(b); count = inp->count; @@ -883,7 +974,6 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st fprintf(stderr, "!WARNING: Type \"hge\" (128 bit) is unsupported by Numpy. The numbers are instead converted to python objects of type \"long\". This is likely very slow.\n"); BATloop(b, p, q) { char hex[40]; - //we first convert the huge to a string in hex format PyObject *obj; const hge *t = (const hge *) BUNtail(li, p); hge_to_string(hex, 40, *t); @@ -898,6 +988,7 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st j++; } break; + } #endif default: msg = createException(MAL, "pyapi.eval", "unknown argument type "); @@ -947,7 +1038,6 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st } } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list