On Tue, 2012-02-14 at 09:45 +0100, G.H.M.Valkenhoef, van wrote:
> > 
> Yes, I found that java code (the HelpIndexer I refer to). I'll work on
> a patch to replace the XInvocations of the Java code with calls to my
> code.

I can try and knock together a skeleton of a conversion of that Java
component to a C++ component for you to integrate the clucene stuff
into.

> > Presumably just editing l10ntools/source/help/makefile.mk and adding
> > another target or so in there will do the trick. I can hook this up
> > and see how it goes.
> Great, send me a patch if you get it going, then I can work on some of
> the other stuff.

Attached is your code back again but added into the l10ntools
makefile.mk and other build-foo. And a patch to helpcontent2 (different
repository) to use it to build the helpcontent and just use normal zip
to zip them up.

I hacked the helplinker to let the "shared" ones through without error
and cut out the cjk.analyzer for now as I don't happen to have that
compiled up here.


> I've got an update on this: I managed to create all the indexes and
> doing a few searches on both the Java-generated and the C++-generated
> indexes seems to give identical results (at least if I pipe the
> results through sort).

sounds great, as does the other news that building clucene only takes a
short time.

C.
>From 83dc0151cde6b765f4235456ae1e58813d3746bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caol...@redhat.com>
Date: Tue, 14 Feb 2012 11:53:27 +0000
Subject: [PATCH] add clucene helpindexer program

---
 l10ntools/prj/build.lst               |    2 +-
 l10ntools/prj/d.lst                   |    6 +-
 l10ntools/source/help/helpindexer.cxx |  247 +++++++++++++++++++++++++++++++++
 l10ntools/source/help/makefile.mk     |   30 ++---
 4 files changed, 263 insertions(+), 22 deletions(-)
 create mode 100644 l10ntools/source/help/helpindexer.cxx

diff --git a/l10ntools/prj/build.lst b/l10ntools/prj/build.lst
index 3cce7a3..c714256 100644
--- a/l10ntools/prj/build.lst
+++ b/l10ntools/prj/build.lst
@@ -1,4 +1,4 @@
-tr	l10ntools	:	tools LIBXSLT:libxslt BERKELEYDB:berkeleydb LUCENE:lucene NULL
+tr	l10ntools	:	tools LIBXSLT:libxslt BERKELEYDB:berkeleydb NULL
 tr	l10ntools						usr1	-	all	tr_mkout NULL
 tr	l10ntools\inc					nmake	-	all	tr_inc NULL
 tr	l10ntools\source					nmake	-	all	tr_src tr_inc NULL
diff --git a/l10ntools/prj/d.lst b/l10ntools/prj/d.lst
index eded848..174bb6c 100644
--- a/l10ntools/prj/d.lst
+++ b/l10ntools/prj/d.lst
@@ -26,12 +26,14 @@ mkdir: %_DEST%\bin\help\com\sun\star\help
 ..\%__SRC%\bin\txtconv %_DEST%\bin\txtconv
 ..\%__SRC%\bin\ulfconv %_DEST%\bin\ulfconv
 ..\%__SRC%\class\FCFGMerge.jar %_DEST%\bin\FCFGMerge.jar
-..\%__SRC%\class\HelpIndexerTool.jar %_DEST%\bin\HelpIndexerTool.jar
-..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker
 ..\%__SRC%\bin\HelpCompiler %_DEST%\bin\HelpCompiler
 ..\%__SRC%\bin\HelpCompiler.exe %_DEST%\bin\HelpCompiler.exe
+..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker
 ..\%__SRC%\bin\HelpLinker.exe %_DEST%\bin\HelpLinker.exe
 ..\%__SRC%\bin\HelpLinker* %_DEST%\bin
+..\%__SRC%\bin\HelpIndexer %_DEST%\bin\HelpIndexer
+..\%__SRC%\bin\HelpIndexer.exe %_DEST%\bin\HelpIndexer.exe
+..\%__SRC%\bin\HelpIndexer* %_DEST%\bin
 
 ..\scripts\localize %_DEST%\bin\localize
 ..\scripts\fast_merge.pl %_DEST%\bin\fast_merge.pl
diff --git a/l10ntools/source/help/helpindexer.cxx b/l10ntools/source/help/helpindexer.cxx
new file mode 100644
index 0000000..c327119
--- /dev/null
+++ b/l10ntools/source/help/helpindexer.cxx
@@ -0,0 +1,247 @@
+#include <CLucene/StdHeader.h>
+#include <CLucene.h>
+#ifdef TODO
+#include <CLucene/analysis/LanguageBasedAnalyzer.h>
+#endif
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <errno.h>
+#include <string.h>
+
+#include <string>
+#include <iostream>
+#include <algorithm>
+#include <set>
+
+// I assume that TCHAR is defined as wchar_t throughout
+
+using namespace lucene::document;
+
+class HelpIndexer {
+	private:
+		std::string d_lang;
+		std::string d_module;
+		std::string d_captionDir;
+		std::string d_contentDir;
+		std::string d_indexDir;
+		std::string d_error;
+		std::set<std::string> d_files;
+
+	public:
+
+	/**
+	 * @param lang Help files language.
+	 * @param module The module of the helpfiles.
+	 * @param captionDir The directory to scan for caption files.
+	 * @param contentDir The directory to scan for content files.
+	 * @param indexDir The directory to write the index to.
+	 */
+	HelpIndexer(std::string const &lang, std::string const &module,
+		std::string const &captionDir, std::string const &contentDir,
+		std::string const &indexDir);
+
+	/**
+	 * Run the indexer.
+	 * @return true if index successfully generated.
+	 */
+	bool indexDocuments();
+
+	/**
+	 * Get the error string (empty if no error occurred).
+	 */
+	std::string const & getErrorMessage();
+
+	private:
+
+	/**
+	 * Scan the caption & contents directories for help files.
+	 */
+	bool scanForFiles();
+
+	/**
+	 * Scan for files in the given directory.
+	 */
+	bool scanForFiles(std::string const &path);
+
+	/**
+	 * Fill the Document with information on the given help file.
+	 */
+	bool helpDocument(std::string const & fileName, Document *doc);
+
+	/**
+	 * Create a reader for the given file, and create an "empty" reader in case the file doesn't exist.
+	 */
+	lucene::util::Reader *helpFileReader(std::string const & path);
+
+	std::wstring string2wstring(std::string const &source);
+};
+
+HelpIndexer::HelpIndexer(std::string const &lang, std::string const &module,
+	std::string const &captionDir, std::string const &contentDir, std::string const &indexDir) :
+d_lang(lang), d_module(module), d_captionDir(captionDir), d_contentDir(contentDir), d_indexDir(indexDir), d_error(""), d_files() {}
+
+bool HelpIndexer::indexDocuments() {
+	if (!scanForFiles()) {
+		return false;
+	}
+
+#ifdef TODO
+	// Construct the analyzer appropriate for the given language
+	lucene::analysis::Analyzer *analyzer = (
+		d_lang.compare("ja") == 0 ?
+		(lucene::analysis::Analyzer*)new lucene::analysis::LanguageBasedAnalyzer(L"cjk") :
+		(lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());
+#else
+	lucene::analysis::Analyzer *analyzer = (
+		(lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());
+#endif
+
+	lucene::index::IndexWriter writer(d_indexDir.c_str(), analyzer, true);
+
+	// Index the identified help files
+	Document doc;
+	for (std::set<std::string>::iterator i = d_files.begin(); i != d_files.end(); ++i) {
+		doc.clear();
+		if (!helpDocument(*i, &doc)) {
+			delete analyzer;
+			return false;
+		}
+		writer.addDocument(&doc);
+	}
+
+	// Optimize the index
+	writer.optimize();
+
+	delete analyzer;
+	return true;
+}
+
+std::string const & HelpIndexer::getErrorMessage() {
+	return d_error;
+}
+
+bool HelpIndexer::scanForFiles() {
+	if (!scanForFiles(d_contentDir)) {
+		return false;
+	}
+	if (!scanForFiles(d_captionDir)) {
+		return false;
+	}
+	return true;
+}
+
+// Add the name of every regular file found directly inside `path` to d_files; sets d_error if `path` cannot be opened.
+bool HelpIndexer::scanForFiles(std::string const & path) {
+	DIR *dir = opendir(path.c_str());
+	if (dir == 0) {
+		d_error = "Error reading directory " + path + ": " + strerror(errno);
+		return true; // NOTE(review): d_error is set but success is still returned, so an unreadable directory is skipped -- confirm this is intended
+	}
+
+	struct dirent *ent;
+	struct stat info;
+	while ((ent = readdir(dir)) != 0) {
+		if (stat((path + "/" + ent->d_name).c_str(), &info) == 0 && S_ISREG(info.st_mode)) {
+			d_files.insert(ent->d_name); // only the bare name is stored; d_files is a set, so repeated names are kept once
+		}
+	}
+	closedir(dir);
+
+	return true;
+}
+
+bool HelpIndexer::helpDocument(std::string const & fileName, Document *doc) {
+	// Add the help path as an indexed, untokenized field.
+	std::wstring path(L"#HLP#" + string2wstring(d_module) + L"/" + string2wstring(fileName));
+	doc->add(*new Field(_T("path"), path.c_str(), Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+
+	// Add the caption as a field.
+	std::string captionPath = d_captionDir + "/" + fileName;
+	doc->add(*new Field(_T("caption"), helpFileReader(captionPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
+	// FIXME: does the Document take responsibility for the FileReader or should I free it somewhere?
+
+	// Add the content as a field.
+	std::string contentPath = d_contentDir + "/" + fileName;
+	doc->add(*new Field(_T("content"), helpFileReader(contentPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
+	// FIXME: does the Document take responsibility for the FileReader or should I free it somewhere?
+
+	return true;
+}
+
+lucene::util::Reader *HelpIndexer::helpFileReader(std::string const & path) {
+	if (access(path.c_str(), R_OK) == 0) {
+		return new lucene::util::FileReader(path.c_str(), "UTF-8");
+	} else {
+		return new lucene::util::StringReader(L"");
+	}
+}
+
+std::wstring HelpIndexer::string2wstring(std::string const &source) {
+	std::wstring target(source.length(), L' '); // one wide character per input byte; no UTF-8 decoding is performed
+	for (std::string::size_type i = 0; i < source.length(); ++i) target[i] = static_cast<unsigned char>(source[i]); // via unsigned char: bytes >= 0x80 must not sign-extend
+	return target;
+}
+
+int main(int argc, char **argv) {
+	const std::string pLang("-lang");
+	const std::string pModule("-mod");
+	const std::string pOutDir("-zipdir");
+	const std::string pSrcDir("-srcdir");
+
+	std::string lang;
+	std::string module;
+	std::string srcDir;
+	std::string outDir;
+
+	bool error = false;
+	for (int i = 1; i < argc; ++i) {
+		if (pLang.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				lang = argv[++i];
+			} else {
+				error = true;
+			}
+		} else if (pModule.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				module = argv[++i];
+			} else {
+				error = true;
+			}
+		} else if (pOutDir.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				outDir = argv[++i];
+			} else {
+				error = true;
+			}
+		} else if (pSrcDir.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				srcDir = argv[++i];
+			} else {
+				error = true;
+			}
+		} else {
+			error = true;
+		}
+	}
+
+	if (error) {
+		std::cerr << "Error parsing command-line arguments" << std::endl;
+	}
+
+	if (error || lang.empty() || module.empty() || srcDir.empty() || outDir.empty()) {
+		std::cerr << "Usage: HelpIndexer -lang ISOLangCode -mod HelpModule -srcdir SourceDir -zipdir OutputDir" << std::endl;
+		return 1;
+	}
+
+	std::string captionDir(srcDir + "/caption");
+	std::string contentDir(srcDir + "/content");
+	std::string indexDir(outDir + "/" + module + ".idxl");
+	HelpIndexer indexer(lang, module, captionDir, contentDir, indexDir);
+	if (!indexer.indexDocuments()) {
+		std::cerr << indexer.getErrorMessage() << std::endl;
+		return 2;
+	}
+	return 0;
+}
diff --git a/l10ntools/source/help/makefile.mk b/l10ntools/source/help/makefile.mk
index bab01b8..e22c6a3 100644
--- a/l10ntools/source/help/makefile.mk
+++ b/l10ntools/source/help/makefile.mk
@@ -60,8 +60,10 @@ SLOFILES=\
 EXCEPTIONSFILES=\
         $(OBJ)$/HelpLinker.obj \
         $(OBJ)$/HelpCompiler.obj \
+        $(OBJ)$/helpindexer.obj \
         $(SLO)$/HelpLinker.obj \
         $(SLO)$/HelpCompiler.obj
+
 .IF "$(OS)" == "MACOSX" && "$(CPU)" == "P" && "$(COM)" == "GCC"
 # There appears to be a GCC 4.0.1 optimization error causing _file:good() to
 # report true right before the call to writeOut at HelpLinker.cxx:1.12 l. 954
@@ -72,6 +74,9 @@ NOOPTFILES=\
         $(SLO)$/HelpLinker.obj
 .ENDIF
 
+PKGCONFIG_MODULES=libclucene-core
+.INCLUDE : pkg_config.mk
+
 APP1TARGET= $(TARGET)
 APP1OBJS=\
       $(OBJ)$/HelpLinker.obj \
@@ -79,6 +84,12 @@ APP1OBJS=\
 APP1RPATH = NONE
 APP1STDLIBS+=$(SALLIB) $(BERKELEYLIB) $(XSLTLIB) $(EXPATASCII3RDLIB)
 
+APP2TARGET=HelpIndexer
+APP2OBJS=\
+      $(OBJ)$/helpindexer.obj
+APP2RPATH = NONE
+APP2STDLIBS+=$(SALLIB) $(PKGCONFIG_LIBS)
+
 SHL1TARGET	=$(LIBBASENAME)$(DLLPOSTFIX)
 SHL1LIBS=	$(SLB)$/$(TARGET).lib
 .IF "$(COM)" == "MSC"
@@ -93,26 +104,7 @@ SHL1USE_EXPORTS	=ordinal
 DEF1NAME	=$(SHL1TARGET) 
 DEFLIB1NAME	=$(TARGET)
 
-JAVAFILES = \
-    HelpIndexerTool.java			        \
-    HelpFileDocument.java
-
-
-JAVACLASSFILES = \
-    $(CLASSDIR)$/$(PACKAGE)$/HelpIndexerTool.class			        \
-    $(CLASSDIR)$/$(PACKAGE)$/HelpFileDocument.class
 
-.IF "$(SYSTEM_LUCENE)" == "YES"
-EXTRAJARFILES += $(LUCENE_CORE_JAR) $(LUCENE_ANALYZERS_JAR)
-.ELSE
-JARFILES += lucene-core-2.3.jar lucene-analyzers-2.3.jar
-.ENDIF
-JAVAFILES = $(subst,$(CLASSDIR)$/$(PACKAGE)$/, $(subst,.class,.java $(JAVACLASSFILES)))
-
-JARCLASSDIRS	   = $(PACKAGE)/*
-JARTARGET	       = HelpIndexerTool.jar
-JARCOMPRESS        = TRUE 
- 
 # --- Targets ------------------------------------------------------
 
 .INCLUDE :  target.mk
-- 
1.7.7.6

>From b0fee7a4c8c4aa177ef47988721108c7f466f0b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caol...@redhat.com>
Date: Tue, 14 Feb 2012 11:52:29 +0000
Subject: [PATCH] use clucene indexer

---
 helpcontent2/settings.pmk    |   12 ------------
 helpcontent2/util/target.pmk |   21 +++------------------
 2 files changed, 3 insertions(+), 30 deletions(-)

diff --git a/helpcontent2/settings.pmk b/helpcontent2/settings.pmk
index 185438e..3716281 100755
--- a/helpcontent2/settings.pmk
+++ b/helpcontent2/settings.pmk
@@ -1,17 +1,5 @@
 .INCLUDE : $(LOCAL_COMMON_OUT)/inc$/aux_langs.mk
 .INCLUDE : $(LOCAL_COMMON_OUT)/inc$/help_exist.mk
 
-my_cp:=$(CLASSPATH)$(PATH_SEPERATOR)$(SOLARBINDIR)$/jaxp.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/juh.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/parser.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/xt.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/unoil.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/ridl.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/jurt.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/xmlsearch.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/LuceneHelpWrapper.jar$(PATH_SEPERATOR)$(SOLARBINDIR)$/HelpIndexerTool.jar$
-
-.IF "$(SYSTEM_LUCENE)" == "YES"
-my_cp!:=$(my_cp)$(PATH_SEPERATOR)$(LUCENE_CORE_JAR)$(PATH_SEPERATOR)$(LUCENE_ANALYZERS_JAR)
-.ELSE
-my_cp!:=$(my_cp)$(PATH_SEPERATOR)$(SOLARBINDIR)/lucene-core-2.3.jar$(PATH_SEPERATOR)$(SOLARBINDIR)/lucene-analyzers-2.3.jar
-.ENDIF
- 
-.IF "$(SYSTEM_DB)" != "YES"
-JAVA_LIBRARY_PATH= -Djava.library.path=$(SOLARSHAREDBIN)
-.ENDIF 
-
 aux_alllangiso_all:=$(foreach,i,$(alllangiso) $(foreach,j,$(aux_langdirs) $(eq,$i,$j  $i $(NULL))))
 aux_alllangiso:=$(foreach,i,$(aux_alllangiso_all) $(foreach,j,$(help_exist) $(eq,$i,$j  $i $(NULL))))
diff --git a/helpcontent2/util/target.pmk b/helpcontent2/util/target.pmk
index 40f6e5d..7dd7e5b 100755
--- a/helpcontent2/util/target.pmk
+++ b/helpcontent2/util/target.pmk
@@ -30,25 +30,10 @@ LINKALLADDEDDEPS=$(foreach,i,$(aux_alllangiso) $(subst,LANGUAGE,$i $(LINKADDEDDP
 
 ALLTAR : $(LINKALLTARGETS)
 
-.IF "$(SYSTEM_DB)" != "YES"
-JAVA_LIBRARY_PATH= -Djava.library.path=$(SOLARSHAREDBIN)
-.ENDIF
-
 XSL_DIR*:=$(SOLARBINDIR)
 
 $(LINKALLTARGETS) : $(foreach,i,$(LINKLINKFILES) $(COMMONMISC)$/$$(@:b:s/_/./:e:s/.//)/$i) $(subst,LANGUAGE,$$(@:b:s/_/./:e:s/.//) $(LINKADDEDDEPS)) $(COMMONMISC)$/xhp_changed.flag
     $(HELPLINKER) @$(mktmp -mod $(LINKNAME) -src $(COMMONMISC) -sty $(XSL_DIR)/embed.xsl -zipdir $(MISC)$/ziptmp$(@:b) -idxcaption $(XSL_DIR)/idxcaption.xsl -idxcontent $(XSL_DIR)/idxcontent.xsl -lang {$(subst,$(LINKNAME)_, $(@:b))} $(subst,LANGUAGE,{$(subst,$(LINKNAME)_, $(@:b))} $(LINKADDEDFILES)) $(foreach,i,$(LINKLINKFILES) $(COMMONMISC)$/{$(subst,$(LINKNAME)_, $(@:b))}/$i) -o $@.$(INPATH))
-.IF "$(SOLAR_JAVA)" == "TRUE"
-.IF "$(CHECK_LUCENCE_INDEXER_OUTPUT)" == ""
-    $(JAVAI) $(JAVAIFLAGS) $(JAVA_LIBRARY_PATH) -cp "$(my_cp)" com.sun.star.help.HelpIndexerTool -lang $(@:b:s/_/./:e:s/.//) -mod $(LINKNAME) -zipdir $(MISC)$/ziptmp$(@:b) -o $@.$(INPATH)
-.ELSE
-    $(JAVAI) $(JAVAIFLAGS) $(JAVA_LIBRARY_PATH) -cp "$(my_cp)" com.sun.star.help.HelpIndexerTool -lang $(@:b:s/_/./:e:s/.//) -mod $(LINKNAME) -zipdir $(MISC)$/ziptmp$(@:b) -o $@.$(INPATH) -checkcfsandsegname _0 _3
-.ENDIF
-   $(RENAME) $@.$(INPATH) $@
-.ELSE
-    -$(RM) $(MISC)$/ziptmp$(@:b)$/content/*.*
-    -$(RM) $(MISC)$/ziptmp$(@:b)$/caption/*.*
-    zip -j -D $@.$(INPATH) $(MISC)$/ziptmp$(@:b)$/*
-    $(RENAME) $@.$(INPATH) $@
-    -$(RM) $(MISC)$/ziptmp$(@:b)$/*.*
-.ENDIF
+    $(HELPINDEXER) -lang $(@:b:s/_/./:e:s/.//) -mod $(LINKNAME) -srcdir $(MISC)$/ziptmp$(@:b) -zipdir $(MISC)$/ziptmp$(@:b)
+    cd $(MISC)$/ziptmp$(@:b) && zip -rX --filesync zipfile.zip $(LINKNAME).*
+    $(RENAME) $(MISC)$/ziptmp$(@:b)$/zipfile.zip $@
-- 
1.7.7.6

_______________________________________________
LibreOffice mailing list
LibreOffice@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/libreoffice

Reply via email to