Changeset: 4d99b536bd1d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4d99b536bd1d
Added Files:
        monetdb5/modules/mal/tokenizer.c
        monetdb5/modules/mal/tokenizer.h
        monetdb5/modules/mal/tokenizer.mal
Removed Files:
        monetdb5/modules/mal/tokenizer.mx
Modified Files:
        monetdb5/modules/mal/Makefile.ag
Branch: default
Log Message:

De-mx the tokenizer (split tokenizer.mx into tokenizer.c, tokenizer.h and tokenizer.mal).


diffs (truncated from 480 to 300 lines):

diff --git a/monetdb5/modules/mal/Makefile.ag b/monetdb5/modules/mal/Makefile.ag
--- a/monetdb5/modules/mal/Makefile.ag
+++ b/monetdb5/modules/mal/Makefile.ag
@@ -57,7 +57,7 @@ lib_mal = {
                sabaoth.c sabaoth.h \
                tablet.c tablet.h \
                tablet_sql.c \
-               tokenizer.mx \
+               tokenizer.c tokenizer.h \
                trader.c trader.h \
                transaction.c \
                txtsim.c txtsim.h \
@@ -78,9 +78,9 @@ headers_mal = {
                mal_mapi.mal sabaoth.mal remote.mal  \
                txtsim.mal recycle.mal \
                cluster.mx trader.mal \
-               tokenizer.mx zorder.mal sample.mal
+               tokenizer.mal zorder.mal sample.mal
 }
 
-EXTRA_DIST = algebraExtensions.mal attach.mal batExtensions.mal iterator.mal 
constraints.mal groupby.mal histogram.mal mal_init.mal manual.mal mkey.mal 
pcre.mal profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal 
transaction.mal txtsim.mal tablet.mal tablet.h sample.mal mal_mapi.mal mat.mal
+EXTRA_DIST = algebraExtensions.mal attach.mal batExtensions.mal iterator.mal 
constraints.mal groupby.mal histogram.mal mal_init.mal manual.mal mkey.mal 
pcre.mal profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal 
transaction.mal txtsim.mal tablet.mal tablet.h sample.mal mal_mapi.mal mat.mal 
tokenizer.mal
 
 EXTRA_DIST_DIR = Tests
diff --git a/monetdb5/modules/mal/tokenizer.mx 
b/monetdb5/modules/mal/tokenizer.c
rename from monetdb5/modules/mal/tokenizer.mx
rename to monetdb5/modules/mal/tokenizer.c
--- a/monetdb5/modules/mal/tokenizer.mx
+++ b/monetdb5/modules/mal/tokenizer.c
@@ -1,29 +1,25 @@
-@/
-The contents of this file are subject to the MonetDB Public License
-Version 1.1 (the "License"); you may not use this file except in
-compliance with the License. You may obtain a copy of the License at
-http://www.monetdb.org/Legal/MonetDBLicense
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ * 
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * The Original Code is the MonetDB Database System.
+ * 
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2012 MonetDB B.V.
+ * All Rights Reserved.
+*/
 
-Software distributed under the License is distributed on an "AS IS"
-basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
-License for the specific language governing rights and limitations
-under the License.
-
-The Original Code is the MonetDB Database System.
-
-The Initial Developer of the Original Code is CWI.
-Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
-Copyright August 2008-2012 MonetDB B.V.
-All Rights Reserved.
-@
-
-@f tokenizer
-
-@c
 /*
- * @a Lefteris Sidirourgos
- * @v 0.1
- * @* Tokenizer
+ * author Lefteris Sidirourgos
+ * Tokenizer
  * This module implements a vertical fragmented tokenizer for strings. It is 
based
  * on the ideas of the urlbox module by mk.
  *
@@ -49,91 +45,6 @@ All Rights Reserved.
  * administrative issues and security aspects (e.g., opening a tokenizer of
  * a different schema) should be addressed more thoroughly.
  */
-@mal
-module tokenizer
-comment "The tokenizer provides fast access to a large collection of strings
-based on a vertical fragmented representation.";
-
-command open(name:str):void
-address TKNZRopen
-comment "open the named tokenizer store, a new one is created if the specified 
name does not exist";
-
-command close():void
-address TKNZRclose
-comment "close the current tokenizer store";
-
-pattern take(i:oid):str
-address TKNZRtakeOid
-comment "reconstruct and returns the i-th string";
-
-pattern locate(s:str):oid
-address TKNZRlocate
-comment "if the given string is in the store returns its oid, otherwise 
oid_nil";
-
-command append(u:str):oid
-address TKNZRappend
-comment "tokenize a new string and append it to the tokenizer (duplicate 
elimination is performed)";
-
-command depositFile(fnme:str):void
-address TKNZRdepositFile
-comment "batch insertion from a file of strings to tokenize, each string is 
separated by a new line";
-
-command getLevel(i:int):bat[:oid,:str]
-address TKNZRgetLevel
-comment "administrative function that returns the bat on level i";
-
-command getIndex():bat[:void,:oid]
-address TKNZRgetIndex
-comment "administrative function that returns the INDEX bat";
-
-command getCount():bat[:void,:wrd]
-address TKNZRgetCount
-comment "debugging function that returns the size of the bats at each level";
-
-command getCardinality():bat[:void,:wrd]
-address TKNZRgetCardinality
-comment "debugging function that returns the unique tokens at each level";
-
-@h
-/*
- * @-
- * @+ Implementation
- */
-#ifndef _TKNZR_H
-#define _TKNZR_H
-#include "mal.h"
-#include "mal_client.h"
-#include "mal_interpreter.h"
-
-#ifdef WIN32
-#if !defined(LIBMAL) && !defined(LIBATOMS) && !defined(LIBKERNEL) && 
!defined(LIBMAL) && !defined(LIBOPTIMIZER) && !defined(LIBSCHEDULER) && 
!defined(LIBMONETDB5)
-#define tokenizer_export extern __declspec(dllimport)
-#else
-#define tokenizer_export extern __declspec(dllexport)
-#endif
-#else
-#define tokenizer_export extern
-#endif
-
-@= params
-(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci);
-
-@
-@h
-tokenizer_export str TKNZRopen             (int *r, str *name);
-tokenizer_export str TKNZRclose            (int *r);
-tokenizer_export str TKNZRappend          (oid *pos, str *tuple);
-tokenizer_export str TKNZRlocate           @:params@
-tokenizer_export str TKNZRtakeOid          @:params@
-tokenizer_export str TKNZRdepositFile      (int *r, str *fnme);
-tokenizer_export str TKNZRgetLevel         (int *r, int *level);
-tokenizer_export str TKNZRgetIndex         (int *r);
-tokenizer_export str TKNZRgetCount         (int *r);
-tokenizer_export str TKNZRgetCardinality   (int *r);
-
-#endif /* _TKNZR_H */
-
-@c
 #include "monetdb_config.h"
 #include "bat5.h"
 #include "tokenizer.h"
@@ -228,20 +139,14 @@ TKNZRopen(int *ret, str *in)
        return MAL_SUCCEED;
 }
 
-@= init_check
-if (TRANS == NULL) {
-       throw(MAL, "tokenizer", "no tokenizer store open");
-}
-
-@
-@c
 str
 TKNZRclose(int *r)
 {
        int i;
        (void) r;
 
-       @:init_check@
+       if (TRANS == NULL)
+               throw(MAL, "tokenizer", "no tokenizer store open");
 
        TMsubcommit(TRANS);
 
@@ -258,7 +163,7 @@ TKNZRclose(int *r)
 }
 
 /*
- * @- Tokenize operations
+ * Tokenize operations
  * The tokenizer operation assumes a private copy to mark the
  * end of the token separators with a zero byte. Tokens are
  * separated by a single character for simplicity.
@@ -284,7 +189,27 @@ TKNZRtokenize(str in, str *parts, char t
        return depth;
 }
 
-@= insert
+str
+TKNZRappend(oid *pos, str *s)
+{
+       str url;
+       str batname;
+       str parts[MAX_TKNZR_DEPTH];
+       int i, new, r, depth;
+       BAT *b;
+       BUN p;
+       BUN idx = 0;
+       oid prv = 0;
+       oid comp;
+
+       if (TRANS == NULL)
+               throw(MAL, "tokenizer", "no tokenizer store open");
+
+       if ((url = GDKstrdup(*s)) == NULL) {
+               throw(MAL, "tokenizer.append",
+                               OPERATION_FAILED "could not allocate memory");
+       }
+
        depth = TKNZRtokenize(url, parts, '/');
        new = depth;
 
@@ -330,11 +255,7 @@ TKNZRtokenize(str in, str *parts, char t
                }
                tokenDepth = depth;
        }
-
-@
- * @-
- * Find the common prefix first
-@= findcommon
+       /* find the common prefix first */
        p = BUNfnd(BATmirror(tokenBAT[0]), parts[0]);
        if (p != BUN_NONE) {
                prv = (oid) p;
@@ -357,10 +278,17 @@ TKNZRtokenize(str in, str *parts, char t
                i = 0;
        }
 
-@
- * @-
- * Insert the remainder as a new string
-@= insremainder
+       if (i == depth) {
+               comp = COMP(prv, depth);
+               *pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr) &comp);
+               if (*pos != BUN_NONE) {
+                       /* the string is already there */
+                       GDKfree(url);
+                       return MAL_SUCCEED;
+               }
+       }
+
+       /* insert the remainder as a new string */
        for(; i < depth; i++){
                idx = BATcount(tokenBAT[i]);
                if (idx > MAX_h) {
@@ -382,43 +310,6 @@ TKNZRtokenize(str in, str *parts, char t
                prv = (oid) idx;
        }
 
-@
-@c
-str
-TKNZRappend(oid *pos, str *s)
-{
-       str url;
-       str batname;
-       str parts[MAX_TKNZR_DEPTH];
-       int i, new, r, depth;
-       BAT *b;
-       BUN p;
-       BUN idx = 0;
-       oid prv = 0;
-       oid comp;
-
-       @:init_check@
-
-       if ((url = GDKstrdup(*s)) == NULL) {
-               throw(MAL, "tokenizer.append",
-                               OPERATION_FAILED "could not allocate memory");
-       }
-
-       @:insert@
-       @:findcommon@
-
-       if (i == depth) {
-               comp = COMP(prv, depth);
-               *pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr) &comp);
-               if (*pos != BUN_NONE) {
_______________________________________________
Checkin-list mailing list
Checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to