Changeset: 5a120a9bf79e for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/5a120a9bf79e
Modified Files:
        sql/backends/monet5/sql_upgrades.c
Branch: sqloptimizer
Log Message:

Merged with default


diffs (truncated from 670 to 300 lines):

diff --git a/CMakeLists.txt b/CMakeLists.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ if(NOT ${CMAKE_INSTALL_PREFIX} STREQUAL 
 #  SET(CMAKE_SKIP_RPATH TRUE)
 endif()
 
-# required for some instalation files
+# required for some installation files
 set(PROGRAM_PERMISSIONS_DEFAULT
   OWNER_WRITE
   OWNER_READ
diff --git a/clients/Tests/MAL-signatures-hge.test 
b/clients/Tests/MAL-signatures-hge.test
--- a/clients/Tests/MAL-signatures-hge.test
+++ b/clients/Tests/MAL-signatures-hge.test
@@ -45798,6 +45798,11 @@ similarity
 command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl] 
 fstrcmp0_impl_bulk;
 Normalized edit distance between two strings
+baturl
+extractURLHost
+command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str] 
+BATextractURLHost;
+Extract host from BAT of URLs
 batuuid
 isaUUID
 command batuuid.isaUUID(X_0:bat[:str]):bat[:bit] 
@@ -64589,6 +64594,11 @@ command txtsim.stringdiff(X_0:str, X_1:s
 stringdiff_impl;
 calculate the soundexed editdistance
 url
+extractURLHost
+command url.extractURLHost(X_0:str, X_1:bit):str 
+extractURLHost;
+Extract host from a URL relaxed version
+url
 getAnchor
 command url.getAnchor(X_0:url):str 
 URLgetAnchor;
@@ -64622,7 +64632,7 @@ url
 getHost
 command url.getHost(X_0:url):str 
 URLgetHost;
-Extract the server name from the URL
+Extract the server name from the URL strict version
 url
 getPort
 command url.getPort(X_0:url):str 
diff --git a/cmake/monetdb-defines.cmake b/cmake/monetdb-defines.cmake
--- a/cmake/monetdb-defines.cmake
+++ b/cmake/monetdb-defines.cmake
@@ -300,16 +300,18 @@ macro(monetdb_configure_misc)
       "PASSWORD_BACKEND invalid, choose one of MD5, SHA1, RIPEMD160, SHA224, 
SHA256, SHA384, SHA512")
   endif()
 
-  # Used for installing testing python module (don't pass a location, else we 
need to strip this again)
-  execute_process(COMMAND "${Python3_EXECUTABLE}" "-c" "import sysconfig; 
print(sysconfig.get_path('purelib', vars={'base': ''})[1:])"
-    RESULT_VARIABLE PY3_LIBDIR_CODE
-    OUTPUT_VARIABLE PYTHON3_SITEDIR
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (PY3_LIBDIR_CODE)
-    message(WARNING
-      "Could not determine MonetDB Python3 site-packages instalation 
directory")
+  if(NOT DEFINED PYTHON3_LIBDIR)
+    # Used for installing testing python module (don't pass a location, else 
we need to strip this again)
+    execute_process(COMMAND "${Python3_EXECUTABLE}" "-c" "import sysconfig; 
print(sysconfig.get_path('purelib', vars={'base': ''})[1:])"
+      RESULT_VARIABLE PY3_LIBDIR_CODE
+      OUTPUT_VARIABLE PYTHON3_SITEDIR
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if (PY3_LIBDIR_CODE)
+      message(WARNING
+        "Could not determine MonetDB Python3 site-packages installation 
directory")
+    endif()
+    set(PYTHON3_LIBDIR "${PYTHON3_SITEDIR}")
   endif()
-  set(PYTHON3_LIBDIR "${PYTHON3_SITEDIR}")
   set(PYTHON "${Python3_EXECUTABLE}")
 
   if(MSVC)
diff --git a/debian/rules b/debian/rules
--- a/debian/rules
+++ b/debian/rules
@@ -13,6 +13,7 @@ DH_VERBOSE=1
 
 override_dh_auto_configure:
        dh_auto_configure -- \
+       -DPYTHON3_LIBDIR=lib/python3/dist-packages \
        -DCMAKE_INSTALL_RUNSTATEDIR=/run \
        -DRELEASE_VERSION=ON \
        -DASSERT=OFF \
diff --git a/monetdb5/modules/atoms/url.c b/monetdb5/modules/atoms/url.c
--- a/monetdb5/modules/atoms/url.c
+++ b/monetdb5/modules/atoms/url.c
@@ -46,6 +46,7 @@
 #include "gdk.h"
 #include <ctype.h>
 #include "mal_exception.h"
+#include "str.h"
 
 typedef str url;
 
@@ -818,6 +819,177 @@ static str URLnoop(url *u, url *val)
        return MAL_SUCCEED;
 }
 
+
+/* Extract the host part of a URL.  This is the relaxed version: no
+ * exception is thrown when the input is NULL, nil, or cannot be parsed
+ * as a URL with a host; the nil string is returned instead.  Only an
+ * allocation failure is reported as an exception.
+ *
+ * retval  receives a newly allocated string (host or nil string)
+ * url     the URL to inspect
+ * no_www  when true, a leading "www." is removed from the host
+ */
+static str
+extractURLHost(str *retval, str *url, bit *no_www)
+{
+       const char *s;
+       const char *h = NULL;
+       const char *p = NULL;
+
+       *retval = NULL;
+       /* all three conditions must hold before *url may be inspected */
+       if (url != NULL && *url != NULL && !strNil(*url) &&
+               (s = skip_scheme(*url)) != NULL &&
+               (s = skip_authority(s, NULL, NULL, &h, &p)) != NULL &&
+               h != NULL)
+       {
+               ssize_t l;
+               const char *pos = s;
+               const char *domain = NULL;
+
+               /* scan backwards from the end of the authority: a host is
+                * only accepted when a '.' is found after its first byte */
+               while (pos > h) {
+                       if (*pos == '.') {
+                               domain = pos;
+                               break;
+                       }
+                       pos--;
+               }
+
+               /* NOTE(review): skip_authority is not visible here; p is
+                * presumably the first byte after the ':' that introduces
+                * the port, so the host ends one byte earlier -- confirm */
+               if (p != NULL)
+                       l = p - h - 1;
+               else
+                       l = s - h;
+
+               if (domain && l > 3) {
+                       if (*no_www && !strncmp(h, "www.", 4)) {
+                               /* shorten the length together with the
+                                * start, otherwise bytes following the
+                                * host end up in the result */
+                               h += 4;
+                               l -= 4;
+                       }
+                       if (l > 0) {
+                               if ((*retval = GDKmalloc(l + 1)) == NULL)
+                                       throw(MAL, "url.extractURLHost",
+                                                 SQLSTATE(HY013) MAL_MALLOC_FAIL);
+                               strcpy_len(*retval, h, l + 1);
+                               /* clean up if not valid UTF-8 */
+                               if (!checkUTF8(*retval)) {
+                                       GDKfree(*retval);
+                                       *retval = NULL;
+                               }
+                       }
+               }
+       }
+
+       /* every path that did not produce a host yields the nil string */
+       if (*retval == NULL && (*retval = GDKstrdup(str_nil)) == NULL)
+               throw(MAL, "url.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+       return MAL_SUCCEED;
+}
+
+
+/* Copy the first l bytes of s into *buf as a NUL-terminated string.
+ * CHECK_STR_BUFFER_LENGTH is asked for l + 1 bytes so that the
+ * terminator fits as well.
+ * NOTE(review): assumes the size argument of strcpy_len counts the
+ * terminating NUL, as suggested by extractURLHost, which passes
+ * length + 1 for a buffer of length + 1 bytes -- confirm against gdk.
+ */
+static inline str
+str_buf_copy(str *buf, size_t *buflen, const char *s, size_t l) {
+       CHECK_STR_BUFFER_LENGTH(buf, buflen, l + 1, "url.str_buf_copy");
+       strcpy_len(*buf, s, l + 1);
+       return MAL_SUCCEED;
+}
+
+
+/* Bulk version of extractURLHost: for every string in the input BAT
+ * append either the extracted host or the nil string to a new BAT.
+ *
+ * res     receives the id of the result BAT on success
+ * bid     id of the input BAT of URL strings
+ * no_www  when true, a leading "www." is removed from each host
+ *
+ * On every error path the input BAT reference, the scratch buffer and
+ * the partially filled result BAT are released before returning.
+ */
+static str
+BATextractURLHost(bat *res, const bat *bid, bit *no_www)
+{
+       BAT *b = NULL;
+       BAT *bn = NULL;
+       BUN p, q;
+       size_t buflen = INITIAL_STR_BUFFER_LENGTH;
+       str buf = NULL;
+       str msg = MAL_SUCCEED;
+       bool nils = false;
+
+       if ((b = BATdescriptor(*bid)) == NULL)
+               throw(MAL, "baturl.extractURLHost",
+                         SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
+       if ((buf = GDKmalloc(buflen)) == NULL) {
+               BBPunfix(b->batCacheid);
+               throw(MAL, "baturl.extractURLHost",
+                         SQLSTATE(HY013) MAL_MALLOC_FAIL);
+       }
+       if ((bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT)) == NULL) {
+               /* release before throw: throw returns from the function */
+               GDKfree(buf);
+               BBPunfix(b->batCacheid);
+               throw(MAL, "baturl.extractURLHost",
+                         SQLSTATE(HY013) MAL_MALLOC_FAIL);
+       }
+
+       BATiter bi = bat_iterator(b);
+       BATloop(b, p, q) {
+               const char *url = (const char *) BUNtail(bi, p);
+               const char *s;
+               /* reset per row so a value from a previous URL is never reused */
+               const char *host = NULL;
+               const char *port = NULL;
+               bool found = false;
+
+               if (!strNil(url) &&
+                       (s = skip_scheme(url)) != NULL &&
+                       (s = skip_authority(s, NULL, NULL, &host, &port)) != NULL &&
+                       host != NULL)
+               {
+                       ssize_t l;
+                       const char *pos = s;
+                       bool domain = false;
+
+                       /* a host is only accepted when a '.' is found
+                        * after its first byte */
+                       while (pos > host) {
+                               if (*pos == '.') {
+                                       domain = true;
+                                       break;
+                               }
+                               pos--;
+                       }
+
+                       /* NOTE(review): port is presumably the first byte
+                        * after the ':' separator -- confirm against
+                        * skip_authority */
+                       if (port != NULL)
+                               l = port - host - 1;
+                       else
+                               l = s - host;
+
+                       if (domain && l > 3) {
+                               if (*no_www && !strncmp(host, "www.", 4)) {
+                                       host += 4;
+                                       l -= 4;
+                               }
+                               if (l > 0) {
+                                       /* stage the host in the scratch buffer */
+                                       msg = str_buf_copy(&buf, &buflen, host, (size_t) l);
+                                       if (msg != MAL_SUCCEED)
+                                               break;
+                                       found = true;
+                               }
+                       }
+               }
+
+               /* nil input and inputs without a usable host both yield nil */
+               if (bunfastapp_nocheckVAR(bn, found ? buf : str_nil) != GDK_SUCCEED) {
+                       msg = createException(MAL, "baturl.extractURLHost",
+                                                                 SQLSTATE(HY013) MAL_MALLOC_FAIL );
+                       break;
+               }
+               if (!found)
+                       nils = true;
+       }
+       bat_iterator_end(&bi);
+
+       GDKfree(buf);
+       BBPunfix(b->batCacheid);
+       if (msg != MAL_SUCCEED) {
+               /* the result BAT is not handed out: drop it */
+               BBPunfix(bn->batCacheid);
+               return msg;
+       }
+
+       BATsetcount(bn, q);
+       bn->tnil = nils;
+       bn->tnonil = !nils;
+       bn->tkey = BATcount(bn) <= 1;
+       bn->tsorted = BATcount(bn) <= 1;
+       bn->trevsorted = BATcount(bn) <= 1;
+       BBPkeepref(*res = bn->batCacheid);
+       return MAL_SUCCEED;
+}
+
+
 #include "mel.h"
 mel_atom url_init_atoms[] = {
  { .name="url", .basetype="str", .fromstr=URLfromString, .tostr=URLtoString, 
},  { .cmp=NULL }
@@ -833,7 +1005,7 @@ mel_func url_init_funcs[] = {
  command("url", "getDomain", URLgetDomain, false, "Extract Internet domain 
from the URL", args(1,2, arg("",str),arg("u",url))),
  command("url", "getExtension", URLgetExtension, false, "Extract the file 
extension of the URL", args(1,2, arg("",str),arg("u",url))),
  command("url", "getFile", URLgetFile, false, "Extract the last file name of 
the URL", args(1,2, arg("",str),arg("u",url))),
- command("url", "getHost", URLgetHost, false, "Extract the server name from 
the URL", args(1,2, arg("",str),arg("u",url))),
+ command("url", "getHost", URLgetHost, false, "Extract the server name from 
the URL strict version", args(1,2, arg("",str),arg("u",url))),
  command("url", "getPort", URLgetPort, false, "Extract the port id from the 
URL", args(1,2, arg("",str),arg("u",url))),
  command("url", "getProtocol", URLgetProtocol, false, "Extract the protocol 
from the URL", args(1,2, arg("",str),arg("u",url))),
  command("url", "getQuery", URLgetQuery, false, "Extract the query string from 
the URL", args(1,2, arg("",str),arg("u",url))),
@@ -842,6 +1014,8 @@ mel_func url_init_funcs[] = {
  command("url", "isaURL", URLisaURL, false, "Check conformity of the URL 
syntax", args(1,2, arg("",bit),arg("u",str))),
  command("url", "new", URLnew4, false, "Construct URL from protocol, host, 
port, and file", args(1,5, 
arg("",url),arg("p",str),arg("h",str),arg("prt",int),arg("f",str))),
  command("url", "new", URLnew3, false, "Construct URL from protocol, host,and 
file", args(1,4, arg("",url),arg("prot",str),arg("host",str),arg("fnme",str))),
+ command("url", "extractURLHost", extractURLHost, false, "Extract host from a 
URL relaxed version", args(1,3, arg("",str),arg("u",str), arg("no_www", bit))),
+ command("baturl", "extractURLHost", BATextractURLHost, false, "Extract host 
from BAT of URLs", args(1,3, batarg("",str), batarg("s",str), arg("no_www", 
bit))),
  { .imp=NULL }
 };
 #include "mal_import.h"
diff --git a/sql/backends/monet5/sql_upgrades.c 
b/sql/backends/monet5/sql_upgrades.c
--- a/sql/backends/monet5/sql_upgrades.c
+++ b/sql/backends/monet5/sql_upgrades.c
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to