Changeset: 5a120a9bf79e for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/5a120a9bf79e
Modified Files:
  sql/backends/monet5/sql_upgrades.c
Branch: sqloptimizer
Log Message:
Merged with default diffs (truncated from 670 to 300 lines): diff --git a/CMakeLists.txt b/CMakeLists.txt --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ if(NOT ${CMAKE_INSTALL_PREFIX} STREQUAL # SET(CMAKE_SKIP_RPATH TRUE) endif() -# required for some instalation files +# required for some installation files set(PROGRAM_PERMISSIONS_DEFAULT OWNER_WRITE OWNER_READ diff --git a/clients/Tests/MAL-signatures-hge.test b/clients/Tests/MAL-signatures-hge.test --- a/clients/Tests/MAL-signatures-hge.test +++ b/clients/Tests/MAL-signatures-hge.test @@ -45798,6 +45798,11 @@ similarity command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl] fstrcmp0_impl_bulk; Normalized edit distance between two strings +baturl +extractURLHost +command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str] +BATextractURLHost; +Extract host from BAT of URLs batuuid isaUUID command batuuid.isaUUID(X_0:bat[:str]):bat[:bit] @@ -64589,6 +64594,11 @@ command txtsim.stringdiff(X_0:str, X_1:s stringdiff_impl; calculate the soundexed editdistance url +extractURLHost +command url.extractURLHost(X_0:str, X_1:bit):str +extractURLHost; +Extract host from a URL relaxed version +url getAnchor command url.getAnchor(X_0:url):str URLgetAnchor; @@ -64622,7 +64632,7 @@ url getHost command url.getHost(X_0:url):str URLgetHost; -Extract the server name from the URL +Extract the server name from the URL strict version url getPort command url.getPort(X_0:url):str diff --git a/cmake/monetdb-defines.cmake b/cmake/monetdb-defines.cmake --- a/cmake/monetdb-defines.cmake +++ b/cmake/monetdb-defines.cmake @@ -300,16 +300,18 @@ macro(monetdb_configure_misc) "PASSWORD_BACKEND invalid, choose one of MD5, SHA1, RIPEMD160, SHA224, SHA256, SHA384, SHA512") endif() - # Used for installing testing python module (don't pass a location, else we need to strip this again) - execute_process(COMMAND "${Python3_EXECUTABLE}" "-c" "import sysconfig; print(sysconfig.get_path('purelib', vars={'base': ''})[1:])" - 
RESULT_VARIABLE PY3_LIBDIR_CODE - OUTPUT_VARIABLE PYTHON3_SITEDIR - OUTPUT_STRIP_TRAILING_WHITESPACE) - if (PY3_LIBDIR_CODE) - message(WARNING - "Could not determine MonetDB Python3 site-packages instalation directory") + if(NOT DEFINED PYTHON3_LIBDIR) + # Used for installing testing python module (don't pass a location, else we need to strip this again) + execute_process(COMMAND "${Python3_EXECUTABLE}" "-c" "import sysconfig; print(sysconfig.get_path('purelib', vars={'base': ''})[1:])" + RESULT_VARIABLE PY3_LIBDIR_CODE + OUTPUT_VARIABLE PYTHON3_SITEDIR + OUTPUT_STRIP_TRAILING_WHITESPACE) + if (PY3_LIBDIR_CODE) + message(WARNING + "Could not determine MonetDB Python3 site-packages installation directory") + endif() + set(PYTHON3_LIBDIR "${PYTHON3_SITEDIR}") endif() - set(PYTHON3_LIBDIR "${PYTHON3_SITEDIR}") set(PYTHON "${Python3_EXECUTABLE}") if(MSVC) diff --git a/debian/rules b/debian/rules --- a/debian/rules +++ b/debian/rules @@ -13,6 +13,7 @@ DH_VERBOSE=1 override_dh_auto_configure: dh_auto_configure -- \ + -DPYTHON3_LIBDIR=lib/python3/dist-packages -DCMAKE_INSTALL_RUNSTATEDIR=/run \ -DRELEASE_VERSION=ON \ -DASSERT=OFF \ diff --git a/monetdb5/modules/atoms/url.c b/monetdb5/modules/atoms/url.c --- a/monetdb5/modules/atoms/url.c +++ b/monetdb5/modules/atoms/url.c @@ -46,6 +46,7 @@ #include "gdk.h" #include <ctype.h> #include "mal_exception.h" +#include "str.h" typedef str url; @@ -818,6 +819,177 @@ static str URLnoop(url *u, url *val) return MAL_SUCCEED; } + +/* Extract host identity from URL. This is a relaxed version, + * where no exceptions is thrown when the input URL is not valid, + * and empty string is returned instead. 
+ * */ +static str +extractURLHost(str *retval, str *url, bit *no_www) +{ + const char *s; + const char *h = NULL; + const char *p = NULL; + + if ((url != NULL || *url != NULL) && !strNil(*url)) { + if ((s = skip_scheme(*url)) != NULL && + (s = skip_authority(s, NULL, NULL, &h, &p)) != NULL && + h != NULL) + { + ssize_t l; + const char *pos = s; + const char *domain = NULL; + while (pos > h) { + if (*pos == '.') { + domain = pos; + break; + } + pos--; + } + + if (p != NULL) { + l = p - h - 1; + } else { + l = s - h; + } + if (domain && l > 3) { + if ((*retval = GDKmalloc(l + 1)) != NULL) { + if (*no_www && strlen(h) > 4 && !strncmp(h, "www.", 4)) { + strcpy_len(*retval, (h + 4), l + 1); + } else { + strcpy_len(*retval, h, l + 1); + } + // clean up if not valid UTF-8 + if (!checkUTF8(*retval)) { + // printf("%s\n", h); + GDKfree(*retval); + *retval = GDKstrdup(str_nil); + } + } else { + throw(MAL, "url.getURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL); + } + } else { + // printf("%s\n", h); + *retval = GDKstrdup(str_nil); + } + + } else { + *retval = GDKstrdup(str_nil); + } + } else { + *retval = GDKstrdup(str_nil); + } + + return MAL_SUCCEED; +} + + +static inline str +str_buf_copy(str *buf, size_t *buflen, const char *s, size_t l) { + CHECK_STR_BUFFER_LENGTH(buf, buflen, l, "url.str_buf_copy"); + strcpy_len(*buf, s, l); + return MAL_SUCCEED; +} + + +// bulk version +static str +BATextractURLHost(bat *res, const bat *bid, bit *no_www) +{ + const char *s; + const char *host = NULL; + const char *port = NULL; + BAT *bn = NULL; + BAT *b = BATdescriptor(*bid); + BUN p, q; + size_t buflen = INITIAL_STR_BUFFER_LENGTH; + str buf = GDKmalloc(buflen); + str msg = MAL_SUCCEED; + bool nils = false; + + if (buf == NULL) + throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL); + + if (b == NULL) + throw(MAL, "baturl.extractURLHost", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + if ((bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT)) == NULL) { + throw(MAL, 
"baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL); + BBPunfix(b->batCacheid); + } + + BATiter bi = bat_iterator(b); + BATloop(b, p, q) { + const char *url = (const char *) BUNtail(bi, p); + if (strNil(url)) { + if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) { + msg = createException(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL ); + break; + } + nils = true; + } else { + if ((s = skip_scheme(url)) != NULL && + (s = skip_authority(s, NULL, NULL, &host, &port)) != NULL && + host != NULL) + { + ssize_t l; + const char *pos = s; + const char *domain = NULL; + while (pos > host) { + if (*pos == '.') { + domain = pos; + break; + } + pos--; + } + + if (port != NULL) { + l = port - host - 1; + } else { + l = s - host; + } + if (domain && l > 3) { + if (*no_www && !strncmp(host, "www.", 4)) { + host += 4; + l -= 4; + } + if (l > 0) { + // if ((msg = str_Sub_String(&buf, &buflen, host, 0, l)) != MAL_SUCCEED) + // break; + if ((msg = str_buf_copy(&buf, &buflen, host, (size_t) l)) != MAL_SUCCEED) + break; + if (bunfastapp_nocheckVAR(bn, buf) != GDK_SUCCEED) { + msg = createException(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL ); + break; + } + continue; + } + } + } + // fall back insert nil str if no valid host + if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) { + msg = createException(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL ); + break; + } + nils = true; + } + } + bat_iterator_end(&bi); + + GDKfree(buf); + if (msg == MAL_SUCCEED) { + BATsetcount(bn, q); + bn->tnil = nils; + bn->tnonil = !nils; + bn->tkey = BATcount(bn) <= 1; + bn->tsorted = BATcount(bn) <= 1; + bn->trevsorted = BATcount(bn) <= 1; + BBPkeepref(*res = bn->batCacheid); + } + BBPunfix(b->batCacheid); + return msg; +} + + #include "mel.h" mel_atom url_init_atoms[] = { { .name="url", .basetype="str", .fromstr=URLfromString, .tostr=URLtoString, }, { .cmp=NULL } @@ -833,7 +1005,7 @@ mel_func url_init_funcs[] = { command("url", "getDomain", 
URLgetDomain, false, "Extract Internet domain from the URL", args(1,2, arg("",str),arg("u",url))), command("url", "getExtension", URLgetExtension, false, "Extract the file extension of the URL", args(1,2, arg("",str),arg("u",url))), command("url", "getFile", URLgetFile, false, "Extract the last file name of the URL", args(1,2, arg("",str),arg("u",url))), - command("url", "getHost", URLgetHost, false, "Extract the server name from the URL", args(1,2, arg("",str),arg("u",url))), + command("url", "getHost", URLgetHost, false, "Extract the server name from the URL strict version", args(1,2, arg("",str),arg("u",url))), command("url", "getPort", URLgetPort, false, "Extract the port id from the URL", args(1,2, arg("",str),arg("u",url))), command("url", "getProtocol", URLgetProtocol, false, "Extract the protocol from the URL", args(1,2, arg("",str),arg("u",url))), command("url", "getQuery", URLgetQuery, false, "Extract the query string from the URL", args(1,2, arg("",str),arg("u",url))), @@ -842,6 +1014,8 @@ mel_func url_init_funcs[] = { command("url", "isaURL", URLisaURL, false, "Check conformity of the URL syntax", args(1,2, arg("",bit),arg("u",str))), command("url", "new", URLnew4, false, "Construct URL from protocol, host, port, and file", args(1,5, arg("",url),arg("p",str),arg("h",str),arg("prt",int),arg("f",str))), command("url", "new", URLnew3, false, "Construct URL from protocol, host,and file", args(1,4, arg("",url),arg("prot",str),arg("host",str),arg("fnme",str))), + command("url", "extractURLHost", extractURLHost, false, "Extract host from a URL relaxed version", args(1,3, arg("",str),arg("u",str), arg("no_www", bit))), + command("baturl", "extractURLHost", BATextractURLHost, false, "Extract host from BAT of URLs", args(1,3, batarg("",str), batarg("s",str), arg("no_www", bit))), { .imp=NULL } }; #include "mal_import.h" diff --git a/sql/backends/monet5/sql_upgrades.c b/sql/backends/monet5/sql_upgrades.c --- a/sql/backends/monet5/sql_upgrades.c +++ 
b/sql/backends/monet5/sql_upgrades.c _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org