Changeset: e7b2db39dcd8 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7b2db39dcd8
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Generate the sample tables + instances + candidates for the evaluation


diffs (truncated from 480 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -1709,16 +1709,19 @@ str printmergeCSSet(CSset *freqCSset, in
 
 
 static 
-str printsubsetFromCSset(CSset *freqCSset, int* subsetIdx, int num){
+str printsubsetFromCSset(CSset *freqCSset, BAT* subsetIdxBat, int num, oid* 
mergeCSFreqCSMap){
 
        int     i,j; 
        FILE    *fout; 
        char    filename[100];
        char    tmpStr[20];
        int     ret;
+       int     *tblIdx; 
+       int     freqIdx; 
 
        str     propStr; 
        char*   schema = "rdf";
+       CS      cs; 
 
 
        if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
@@ -1735,7 +1738,9 @@ str printsubsetFromCSset(CSset *freqCSse
        fout = fopen(filename,"wt"); 
 
        for (i = 0; i < num; i++){
-               CS cs = (CS)freqCSset->items[subsetIdx[i]];
+               tblIdx = (int*) Tloc(subsetIdxBat, i); 
+               freqIdx = mergeCSFreqCSMap[*tblIdx];
+               cs = (CS)freqCSset->items[freqIdx];
                assert (cs.parentFreqIdx == -1);
                fprintf(fout, "Table %d (Coverage: %d, NumProp: %d) 
\n",i,cs.coverage, cs.numProp);
                for (j = 0; j < cs.numProp; j++){
@@ -3592,20 +3597,28 @@ void freeCSBats(CSBats *csBats){
 }
 
 static
-void generateTablesForEvaluating(CSset *freqCSset, int numTbl,oid* 
mergeCSFreqCSMap, int curNumMergeCS){
+BAT* generateTablesForEvaluating(CSset *freqCSset, int numTbl,oid* 
mergeCSFreqCSMap, int curNumMergeCS){
        int     *cumDist; 
        int     totalCoverage = 0; 
        int     curCoverage = 0;
        int     randValue = 0;
        int     tmpIdx; 
        int     freqId; 
+       BAT     *outputBat; 
        int     minIdx, maxIdx; 
        int     i;
-       int     *output;
+       BUN     bun = BUN_NONE;
+       int     numLoop; 
 
        cumDist = (int*)malloc(sizeof(int) * curNumMergeCS);
-       output = (int*)malloc(sizeof(int) * numTbl);
-                       
+       outputBat = BATnew(TYPE_void, TYPE_int, numTbl);
+       if (outputBat == NULL){
+               return NULL; 
+       }
+       (void)BATprepareHash(BATmirror(outputBat));
+       if (!(outputBat->T->hash)) 
+               return NULL; 
+
        for (i = 0; i < curNumMergeCS; i++){            
                freqId = mergeCSFreqCSMap[i];
                totalCoverage += freqCSset->items[freqId].coverage; 
@@ -3619,7 +3632,9 @@ void generateTablesForEvaluating(CSset *
        }
 
        srand(123456);
-       for (i = 0; i < numTbl; i++){
+       i = 0; 
+       numLoop = 0;
+       while(i < numTbl){
                //Get the index of freqCS for a random value [0-> totalCoverage 
-1]
                //Using binary search
                randValue = rand() % totalCoverage; 
@@ -3641,15 +3656,28 @@ void generateTablesForEvaluating(CSset *
                }
 
                tmpIdx = maxIdx; 
-               output[i] = mergeCSFreqCSMap[tmpIdx];
+
                //printf("tmpIdx = %d --> FreqCS %d \n",tmpIdx, output[i]);
+               bun = BUNfnd(BATmirror(outputBat),(ptr) &tmpIdx);
+               if (bun == BUN_NONE) {
+                       /*New FreqIdx*/
+                       if (outputBat->T->hash && BATcount(outputBat) > 4 * 
outputBat->T->hash->mask) {
+                               HASHdestroy(outputBat);
+                               BAThash(BATmirror(outputBat), 
2*BATcount(outputBat));
+                       }
+                       outputBat = BUNappend(outputBat, (ptr) &tmpIdx, TRUE);
+                       i++;
+               }
+               numLoop++;
        }
 
        //Print the results
-       printsubsetFromCSset(freqCSset, output, numTbl) ;
+       printf("Get the sample tables after %d loop \n",numLoop );
+       printsubsetFromCSset(freqCSset, outputBat, numTbl,mergeCSFreqCSMap);
 
        free(cumDist); 
-       free(output);
+
+       return outputBat; 
 }
 
 #if STOREFULLCS
@@ -4109,7 +4137,283 @@ str RDFExtractCSPropTypes(int *ret, BAT 
 
        return MAL_SUCCEED; 
 }
-
+static 
+void initSampleData(CSSample *csSample,BAT *candBat,CSset *freqCSset, oid 
*mergeCSFreqCSMap, CSlabel *label){
+       int     i, j, k; 
+       int     numCand = 0; 
+       int     freqId; 
+       int     *tblId; 
+       CS      cs; 
+       int     tmpNumcand; 
+       oid     tmpCandidate; 
+       int     randValue = 0; 
+
+       numCand = BATcount(candBat); 
+       srand(123456); 
+       for (i = 0; i < numCand; i++){
+               tblId = (int*) Tloc(candBat, i); 
+               freqId = mergeCSFreqCSMap[*tblId];
+               cs = freqCSset->items[freqId];
+               csSample[i].freqIdx = freqId;
+               tmpNumcand = (NUM_SAMPLE_CANDIDATE > 
label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE;
+               csSample[i].candidateCount = tmpNumcand;
+               csSample[i].candidates = (oid*)malloc(sizeof(oid) * 
tmpNumcand); 
+               for (k = 0; k < tmpNumcand; k++){
+                       csSample[i].candidates[k] = 
label[freqId].candidates[k]; 
+               }
+               //Randomly exchange the value, change the position k with a 
random pos
+               for (k = 0; k < tmpNumcand; k++){
+                       randValue = rand() % tmpNumcand;
+                       tmpCandidate = csSample[i].candidates[k];
+                       csSample[i].candidates[k] = 
csSample[i].candidates[randValue];
+                       csSample[i].candidates[randValue] = tmpCandidate;
+               }
+
+               csSample[i].numProp = cs.numProp;
+               csSample[i].lstProp = (oid*)malloc(sizeof(oid) * cs.numProp); 
+               memcpy(csSample[i].lstProp, cs.lstProp, cs.numProp * 
sizeof(oid));
+               csSample[i].lstSubjOid = (oid*)malloc(sizeof(oid) * 
NUM_SAMPLE_INSTANCE);
+               for (k = 0; k < NUM_SAMPLE_INSTANCE; k++)
+                       csSample[i].lstSubjOid[k] = BUN_NONE; 
+
+               csSample[i].lstObj = (oid**)malloc(sizeof(oid*) * cs.numProp); 
+               for (j = 0; j < cs.numProp; j++){
+                       csSample[i].lstObj[j] = (oid*)malloc(sizeof(oid) * 
NUM_SAMPLE_INSTANCE);
+                       for (k = 0; k < NUM_SAMPLE_INSTANCE; k++)
+                               csSample[i].lstObj[j][k] = BUN_NONE; 
+               }
+               csSample[i].numInstances = 0;
+
+       }
+}
+static 
+void freeSampleData(CSSample *csSample, int numCand){
+       int i, j; 
+       for (i = 0; i < numCand; i++){
+               free(csSample[i].lstProp);
+               free(csSample[i].candidates); 
+               free(csSample[i].lstSubjOid);
+               for (j = 0; j < csSample[i].numProp; j++){
+                       free(csSample[i].lstObj[j]);
+               }
+               free(csSample[i].lstObj);
+       }
+
+       free(csSample);
+}
+
+static 
+void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int 
sampleIdx, CSSample *csSample){
+       int i,j; 
+       int curPos; 
+       
+       j = 0;
+       curPos= csSample[sampleIdx].numInstances;
+       csSample[sampleIdx].lstSubjOid[curPos] = subj;
+       for (i = 0; i < numP; i++){
+               //printf("  P: " BUNFMT " Type: %d ", buffP[i], buffTypes[i]);
+               while (csSample[sampleIdx].lstProp[j] != buffP[i]){
+                       j++;
+               }       
+               assert(j < csSample[sampleIdx].numProp);
+               //j is position of the property buffP[i] in csPropTypes[tblId]
+               csSample[sampleIdx].lstObj[j][curPos] = buffO[i]; 
+       }
+       csSample[sampleIdx].numInstances++;
+}
+
+static 
+void getObjStr(BAT *mapbat, BATiter mapi, oid objOid, str *objStr, char 
*retObjType){
+       BUN bun; 
+
+       char objType = getObjType(objOid); 
+
+       if (objType == URI || objType == BLANKNODE){
+               objOid = objOid - ((oid)objType << (sizeof(BUN)*8 - 4));
+               takeOid(objOid, objStr); 
+       }
+       else{
+               objOid = objOid - (objType*2 + 1) *  RDF_MIN_LITERAL;   /* Get 
the real objOid from Map or Tokenizer */ 
+               bun = BUNfirst(mapbat);
+               *objStr = (str) BUNtail(mapi, bun + objOid); 
+       }
+
+       *retObjType = objType; 
+
+
+
+
+}
+static 
+str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num){
+
+       int     i,j, k; 
+       FILE    *fout; 
+       char    filename[100];
+       char    tmpStr[20];
+       int     ret;
+
+       str     propStr; 
+       str     subjStr; 
+       char*   schema = "rdf";
+       CSSample        sample; 
+       CS              freqCS; 
+       char    objType = 0; 
+       str     objStr;         
+       oid     objOid = BUN_NONE; 
+       BATiter mapi;
+       str     canStr; 
+
+       mapi = bat_iterator(mbat);
+
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               throw(RDF, "rdf.rdfschema",
+                               "could not open the tokenizer\n");
+       }
+       
+
+       strcpy(filename, "sampleData");
+       sprintf(tmpStr, "%d", num);
+       strcat(filename, tmpStr);
+       strcat(filename, ".txt");
+
+       fout = fopen(filename,"wt"); 
+
+       for (i = 0; i < num; i++){
+               sample = csSample[i];
+               freqCS = freqCSset->items[sample.freqIdx];
+               fprintf(fout,"Sample table %d \n Candidates: ", i);
+               for (j = 0; j < (int)sample.candidateCount; j++){
+                       //fprintf(fout,"  "  BUNFMT,sample.candidates[j]);
+                       if (sample.candidates[j] != BUN_NONE){
+                               takeOid(sample.candidates[j], &canStr); 
+                               fprintf(fout,"%s, ",  canStr);
+                               GDKfree(canStr); 
+                       
+                       }
+               }
+               fprintf(fout, "\n");
+               //List of columns
+               fprintf(fout,"Subject, ");
+               for (j = 0; j < sample.numProp; j++){
+                       takeOid(sample.lstProp[j], &propStr);   
+                       fprintf(fout,"%s, ", propStr);
+                       GDKfree(propStr);
+               }
+               fprintf(fout, "\n");
+               
+               //List of support
+               fprintf(fout,"NONE, ");
+               for (j = 0; j < sample.numProp; j++){
+                       fprintf(fout,"%d, ", freqCS.lstPropSupport[j]);
+               }
+               fprintf(fout, "\n");
+
+               //All the instances 
+               for (k = 0; k < sample.numInstances; k++){
+                       takeOid(sample.lstSubjOid[k], &subjStr); 
+                       fprintf(fout,"%s, ", subjStr);
+                       GDKfree(subjStr); 
+                       
+                       for (j = 0; j < sample.numProp; j++){
+                               objOid = sample.lstObj[j][k];
+                               if (objOid == BUN_NONE)
+                                       fprintf(fout,"NULL, ");
+                               else{
+                                       getObjStr(mbat, mapi, objOid, &objStr, 
&objType);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to