Thanks for replying.
When I run the command, it generates a file with a "txt" extension. The
text in this file has spaces interspersed in odd spots. Here is the output
from a file I ran the command on...
Marc h 29, 2005
Hello t here m y good friend.
HELLO
Legal Soft w are is GOOD.
I would have expected this...
GoOD
March 29, 2005
Hello there my good friend.
HELLO
Legal Software is GOOD.
GoOD
----- Original Message -----
From: "Ben Litchfield" <[EMAIL PROTECTED]>
To: <java-user@lucene.apache.org>
Sent: Wednesday, May 25, 2005 4:38 PM
Subject: Re: Lucene - PDFBox
>
> Can you run the following command line application on the PDF to verify
> that the extracted text is correct
>
> java org.pdfbox.ExtractText <pdf-file>
>
> Ben
>
>
>
> On Wed, 25 May 2005, Thomas X Hoban wrote:
>
>>
>>
>> First, I am new to Lucene.
>>
>> Is there anyone out there who has had trouble getting hits when
>> running phrase queries against an index that contains content from
>> PDF files?
>> For PDF documents, I create the document using
>> LucenePDFDocument.getDocument(file) and then add it to the index. For
>> non-pdf documents, I create the document using
>> FileDocument.Document(file).
>>
>> For instance, I add documents with the following text:
>>
>> pdf1.pdf -- "Dave has good taste"
>> pdf2.pdf -- "Tom has good taste"
>> word1.doc -- "Liz has bad taste"
>> word2.doc -- "Troy has bad taste"
>>
>> When I search content for the following strings:
>>
>> has good taste
>> get expected results with hits on pdf1.pdf, pdf2.pdf, word1.doc
>> and word2.doc
>>
>> "has good taste"
>> get unexpected result: 0 hits
>>
>> "has bad taste"
>> get expected results with hits on word1.doc and word2.doc
>>
>> It seems that searching for individual words works fine for both PDF
>> and
>> non-pdf files. However, searching on a phrase (enclosed in quotes)
>> works
>> on non-pdf files but not on files parsed with the LucenePDFDocument
>> class.
>>
>> Can anyone offer advice?
>>
>> Below is code for index creation. It is the demo IndexFiles class
>> provided with Lucene along with some changes...
>>
>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>> import org.apache.lucene.index.IndexWriter;
>> import org.apache.lucene.document.Field;
>> import org.apache.lucene.document.Document;
>>
>> import java.io.File;
>> import java.io.FileNotFoundException;
>> import java.io.IOException;
>> import java.util.Date;
>>
>> //import javax.activation.MimetypesFileTypeMap;
>>
>> import org.pdfbox.searchengine.lucene.LucenePDFDocument;
>>
>>
>> class IndexFiles {
>> public static void main(String[] args) throws IOException {
>> String usage = "java " + IndexFiles.class + " <root_directory>";
>> if (args.length == 0) {
>> System.err.println("Usage: " + usage);
>> System.exit(1);
>> }
>>
>> Date start = new Date();
>> try {
>> IndexWriter writer = new IndexWriter("index", new
>> StandardAnalyzer(), true);
>> indexDocs(writer, new File(args[0]));
>>
>> writer.optimize();
>> writer.close();
>>
>> Date end = new Date();
>>
>> System.out.print(end.getTime() - start.getTime());
>> System.out.println(" total milliseconds");
>>
>> } catch (IOException e) {
>> System.out.println(" caught a " + e.getClass() +
>> "\n with message: " + e.getMessage());
>> }
>> }
>>
>> public static void indexDocs(IndexWriter writer, File file)
>> throws IOException {
>> // do not try to index files that cannot be read
>>
>> if (file.canRead()) {
>> if (file.isDirectory()) {
>> String[] files = file.list();
>> // an IO error could occur
>> if (files != null) {
>> for (int i = 0; i < files.length; i++) {
>> indexDocs(writer, new File(file, files[i]));
>> }
>> }
>> } else {
>> System.out.println("adding " + file);
>> try {
>>
>> Document doc = null;
>> if (file.getName().indexOf(".pdf") >= 0)
>> //
>> writer.addDocument(LucenePDFDocument.getDocument(file));
>> doc = LucenePDFDocument.getDocument(file);
>> else
>> doc = FileDocument.Document(file);
>>
>> Field field = null;
>> if (file.getPath().indexOf("case1") >=0)
>> field = new Field("caseid", "1", false, true, false);
>> else if (file.getPath().indexOf("case2") >=0)
>> field = new Field("caseid", "2", false, true, false);
>> else if (file.getPath().indexOf("case3") >=0)
>> field = new Field("caseid", "3", false, true, false);
>> else
>> field = new Field("caseid", "0", false, true, false);
>>
>> doc.add(field);
>>
>> writer.addDocument(doc);
>> }
>> // at least on windows, some temporary files raise this
>> exception
>> with an "access denied" message
>> // checking if the file can be read doesn't help
>> catch (FileNotFoundException fnfe) {
>> ;
>> }
>> }
>> }
>> }
>> }
>>
>>
>> Here is the SearchFiles class with some minor modifications...
>>
>> import java.io.IOException;
>> import java.io.BufferedReader;
>> import java.io.InputStreamReader;
>> import java.util.StringTokenizer;
>>
>> import org.apache.lucene.analysis.Analyzer;
>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>> import org.apache.lucene.document.Document;
>> import org.apache.lucene.search.Searcher;
>> import org.apache.lucene.search.IndexSearcher;
>> import org.apache.lucene.search.Query;
>> import org.apache.lucene.search.BooleanQuery;
>> import org.apache.lucene.search.PhraseQuery;
>> import org.apache.lucene.search.Hits;
>> import org.apache.lucene.index.Term;
>> import org.apache.lucene.queryParser.QueryParser;
>> import org.apache.lucene.queryParser.ParseException;
>>
>> class SearchFiles {
>>
>> private static Query getCaseQuery(String line, Analyzer analyzer)
>> throws ParseException {
>> BooleanQuery bq = new BooleanQuery();
>> StringTokenizer st = new StringTokenizer(line);
>> Query query = QueryParser.parse(line, "contents", analyzer);
>> String caseId = null;
>> while (st.hasMoreTokens()) {
>> caseId = st.nextToken();
>> System.out.println("build case query for " + caseId);
>>
>> query = QueryParser.parse(caseId, "caseid", analyzer);
>> bq.add(query, false, false);
>> }
>>
>> return bq;
>> }
>> public static void main(String[] args) {
>> try {
>> Searcher searcher = new IndexSearcher("index");
>> Analyzer analyzer = new StandardAnalyzer();
>>
>> BufferedReader in = new BufferedReader(new
>> InputStreamReader(System.in));
>> while (true) {
>> System.out.print("Query: ");
>> String line = in.readLine();
>> System.out.print("Cases: ");
>> String caseLine = in.readLine();
>> Query caseQuery = getCaseQuery(caseLine, analyzer);
>>
>> if (line.length() == -1)
>> break;
>>
>>
>> Query query = QueryParser.parse(line, "contents", analyzer);
>> // PhraseQuery query = new PhraseQuery();
>> // query.add(new Term("contents",line));
>> System.out.println("Searching for: " +
>> query.toString("contents"));
>> /*
>> BooleanQuery wholeQuery = new BooleanQuery();
>> wholeQuery.add(caseQuery, true, false);
>> wholeQuery.add(query, true, false);
>> Hits hits = searcher.search(wholeQuery);
>> */
>> Hits hits = searcher.search(query);
>> System.out.println(hits.length() + " total matching
>> documents");
>>
>> final int HITS_PER_PAGE = 10;
>> for (int start = 0; start < hits.length(); start +=
>> HITS_PER_PAGE) {
>> int end = Math.min(hits.length(), start + HITS_PER_PAGE);
>> for (int i = start; i < end; i++) {
>> Document doc = hits.doc(i);
>> String path = doc.get("path");
>> if (path != null) {
>> System.out.println(i + ". " + path);
>> } else {
>> String url = doc.get("url");
>> if (url != null) {
>> System.out.println(i + ". " + url);
>> System.out.println(" - " + doc.get("title"));
>> } else {
>> System.out.println(i + ". " + "No path nor URL for
>> this
>> document");
>> }
>> }
>> }
>>
>> if (hits.length() > end) {
>> System.out.print("more (y/n) ? ");
>> line = in.readLine();
>> if (line.length() == 0 || line.charAt(0) == 'n')
>> break;
>> }
>> }
>> }
>> searcher.close();
>>
>> } catch (Exception e) {
>> System.out.println(" caught a " + e.getClass() +
>> "\n with message: " + e.getMessage());
>> }
>> }
>> }
>>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [EMAIL PROTECTED]
> For additional commands, e-mail: [EMAIL PROTECTED]
>
>
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]