Changeset: 4d99b536bd1d for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4d99b536bd1d Added Files: monetdb5/modules/mal/tokenizer.c monetdb5/modules/mal/tokenizer.h monetdb5/modules/mal/tokenizer.mal Removed Files: monetdb5/modules/mal/tokenizer.mx Modified Files: monetdb5/modules/mal/Makefile.ag Branch: default Log Message:
De-mx the tokenizer. diffs (truncated from 480 to 300 lines): diff --git a/monetdb5/modules/mal/Makefile.ag b/monetdb5/modules/mal/Makefile.ag --- a/monetdb5/modules/mal/Makefile.ag +++ b/monetdb5/modules/mal/Makefile.ag @@ -57,7 +57,7 @@ lib_mal = { sabaoth.c sabaoth.h \ tablet.c tablet.h \ tablet_sql.c \ - tokenizer.mx \ + tokenizer.c tokenizer.h \ trader.c trader.h \ transaction.c \ txtsim.c txtsim.h \ @@ -78,9 +78,9 @@ headers_mal = { mal_mapi.mal sabaoth.mal remote.mal \ txtsim.mal recycle.mal \ cluster.mx trader.mal \ - tokenizer.mx zorder.mal sample.mal + tokenizer.mal zorder.mal sample.mal } -EXTRA_DIST = algebraExtensions.mal attach.mal batExtensions.mal iterator.mal constraints.mal groupby.mal histogram.mal mal_init.mal manual.mal mkey.mal pcre.mal profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal transaction.mal txtsim.mal tablet.mal tablet.h sample.mal mal_mapi.mal mat.mal +EXTRA_DIST = algebraExtensions.mal attach.mal batExtensions.mal iterator.mal constraints.mal groupby.mal histogram.mal mal_init.mal manual.mal mkey.mal pcre.mal profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal transaction.mal txtsim.mal tablet.mal tablet.h sample.mal mal_mapi.mal mat.mal tokenizer.mal EXTRA_DIST_DIR = Tests diff --git a/monetdb5/modules/mal/tokenizer.mx b/monetdb5/modules/mal/tokenizer.c rename from monetdb5/modules/mal/tokenizer.mx rename to monetdb5/modules/mal/tokenizer.c --- a/monetdb5/modules/mal/tokenizer.mx +++ b/monetdb5/modules/mal/tokenizer.c @@ -1,29 +1,25 @@ -@/ -The contents of this file are subject to the MonetDB Public License -Version 1.1 (the "License"); you may not use this file except in -compliance with the License. You may obtain a copy of the License at -http://www.monetdb.org/Legal/MonetDBLicense +/* + * The contents of this file are subject to the MonetDB Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. 
You may obtain a copy of the License at + * http://www.monetdb.org/Legal/MonetDBLicense + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + * License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is the MonetDB Database System. + * + * The Initial Developer of the Original Code is CWI. + * Portions created by CWI are Copyright (C) 1997-July 2008 CWI. + * Copyright August 2008-2012 MonetDB B.V. + * All Rights Reserved. +*/ -Software distributed under the License is distributed on an "AS IS" -basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the -License for the specific language governing rights and limitations -under the License. - -The Original Code is the MonetDB Database System. - -The Initial Developer of the Original Code is CWI. -Portions created by CWI are Copyright (C) 1997-July 2008 CWI. -Copyright August 2008-2012 MonetDB B.V. -All Rights Reserved. -@ - -@f tokenizer - -@c /* - * @a Lefteris Sidirourgos - * @v 0.1 - * @* Tokenizer + * author Lefteris Sidirourgos + * Tokenizer * This module implements a vertical fragmented tokenizer for strings. It is based * on the ideas of the urlbox module by mk. * @@ -49,91 +45,6 @@ All Rights Reserved. * administrative issues and security aspects (e.g., opening a tokenizer of * a different schema) should be addressed more thoroughly. 
*/ -@mal -module tokenizer -comment "The tokenizer provides fast access to a large collection of strings -based on a vertical fragmented representation."; - -command open(name:str):void -address TKNZRopen -comment "open the named tokenizer store, a new one is created if the specified name does not exist"; - -command close():void -address TKNZRclose -comment "close the current tokenizer store"; - -pattern take(i:oid):str -address TKNZRtakeOid -comment "reconstruct and returns the i-th string"; - -pattern locate(s:str):oid -address TKNZRlocate -comment "if the given string is in the store returns its oid, otherwise oid_nil"; - -command append(u:str):oid -address TKNZRappend -comment "tokenize a new string and append it to the tokenizer (duplicate elimination is performed)"; - -command depositFile(fnme:str):void -address TKNZRdepositFile -comment "batch insertion from a file of strings to tokenize, each string is separated by a new line"; - -command getLevel(i:int):bat[:oid,:str] -address TKNZRgetLevel -comment "administrative function that returns the bat on level i"; - -command getIndex():bat[:void,:oid] -address TKNZRgetIndex -comment "administrative function that returns the INDEX bat"; - -command getCount():bat[:void,:wrd] -address TKNZRgetCount -comment "debugging function that returns the size of the bats at each level"; - -command getCardinality():bat[:void,:wrd] -address TKNZRgetCardinality -comment "debugging function that returns the unique tokens at each level"; - -@h -/* - * @- - * @+ Implementation - */ -#ifndef _TKNZR_H -#define _TKNZR_H -#include "mal.h" -#include "mal_client.h" -#include "mal_interpreter.h" - -#ifdef WIN32 -#if !defined(LIBMAL) && !defined(LIBATOMS) && !defined(LIBKERNEL) && !defined(LIBMAL) && !defined(LIBOPTIMIZER) && !defined(LIBSCHEDULER) && !defined(LIBMONETDB5) -#define tokenizer_export extern __declspec(dllimport) -#else -#define tokenizer_export extern __declspec(dllexport) -#endif -#else -#define tokenizer_export extern 
-#endif - -@= params -(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci); - -@ -@h -tokenizer_export str TKNZRopen (int *r, str *name); -tokenizer_export str TKNZRclose (int *r); -tokenizer_export str TKNZRappend (oid *pos, str *tuple); -tokenizer_export str TKNZRlocate @:params@ -tokenizer_export str TKNZRtakeOid @:params@ -tokenizer_export str TKNZRdepositFile (int *r, str *fnme); -tokenizer_export str TKNZRgetLevel (int *r, int *level); -tokenizer_export str TKNZRgetIndex (int *r); -tokenizer_export str TKNZRgetCount (int *r); -tokenizer_export str TKNZRgetCardinality (int *r); - -#endif /* _TKNZR_H */ - -@c #include "monetdb_config.h" #include "bat5.h" #include "tokenizer.h" @@ -228,20 +139,14 @@ TKNZRopen(int *ret, str *in) return MAL_SUCCEED; } -@= init_check -if (TRANS == NULL) { - throw(MAL, "tokenizer", "no tokenizer store open"); -} - -@ -@c str TKNZRclose(int *r) { int i; (void) r; - @:init_check@ + if (TRANS == NULL) + throw(MAL, "tokenizer", "no tokenizer store open"); TMsubcommit(TRANS); @@ -258,7 +163,7 @@ TKNZRclose(int *r) } /* - * @- Tokenize operations + * Tokenize operations * The tokenizer operation assumes a private copy to mark the * end of the token separators with a zero byte. Tokens are * separated by a single character for simplicity. 
@@ -284,7 +189,27 @@ TKNZRtokenize(str in, str *parts, char t return depth; } -@= insert +str +TKNZRappend(oid *pos, str *s) +{ + str url; + str batname; + str parts[MAX_TKNZR_DEPTH]; + int i, new, r, depth; + BAT *b; + BUN p; + BUN idx = 0; + oid prv = 0; + oid comp; + + if (TRANS == NULL) + throw(MAL, "tokenizer", "no tokenizer store open"); + + if ((url = GDKstrdup(*s)) == NULL) { + throw(MAL, "tokenizer.append", + OPERATION_FAILED "could not allocate memory"); + } + depth = TKNZRtokenize(url, parts, '/'); new = depth; @@ -330,11 +255,7 @@ TKNZRtokenize(str in, str *parts, char t } tokenDepth = depth; } - -@ - * @- - * Find the common prefix first -@= findcommon + /* findcommn */ p = BUNfnd(BATmirror(tokenBAT[0]), parts[0]); if (p != BUN_NONE) { prv = (oid) p; @@ -357,10 +278,17 @@ TKNZRtokenize(str in, str *parts, char t i = 0; } -@ - * @- - * Insert the remainder as a new string -@= insremainder + if (i == depth) { + comp = COMP(prv, depth); + *pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr) &comp); + if (*pos != BUN_NONE) { + /* the string is already there */ + GDKfree(url); + return MAL_SUCCEED; + } + } + + /* insremainder */ for(; i < depth; i++){ idx = BATcount(tokenBAT[i]); if (idx > MAX_h) { @@ -382,43 +310,6 @@ TKNZRtokenize(str in, str *parts, char t prv = (oid) idx; } -@ -@c -str -TKNZRappend(oid *pos, str *s) -{ - str url; - str batname; - str parts[MAX_TKNZR_DEPTH]; - int i, new, r, depth; - BAT *b; - BUN p; - BUN idx = 0; - oid prv = 0; - oid comp; - - @:init_check@ - - if ((url = GDKstrdup(*s)) == NULL) { - throw(MAL, "tokenizer.append", - OPERATION_FAILED "could not allocate memory"); - } - - @:insert@ - @:findcommon@ - - if (i == depth) { - comp = COMP(prv, depth); - *pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr) &comp); - if (*pos != BUN_NONE) { _______________________________________________ Checkin-list mailing list Checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list