Can you run the following command line application on the PDF to verify
that the extracted text is correct?
java org.pdfbox.ExtractText <pdf-file>
Ben
On Wed, 25 May 2005, Thomas X Hoban wrote:
>
>
> First, I am new to Lucene.
>
> Is there anyone out there who has had trouble getting hits when running
> phrase queries against an index that contains content from PDF files.
> For PDF documents, I create the document using
> LucenePDFDocument.getDocument(file) and then add it to the index. For
> non-pdf documents, I create the document using
> FileDocument.Document(file).
>
> For instance, I add documents with the following text:
>
> pdf1.pdf -- "Dave has good taste"
> pdf2.pdf -- "Tom has good taste"
> word1.doc -- "Liz has bad taste"
> word2.doc -- "Troy has bad taste"
>
> When I search content for the following strings:
>
> has good taste
> get expected results with hits on pdf1.pdf, pdf2.pdf, word1.doc
> and word2.doc
>
> "has good taste"
> get unexpected result: 0 hits
>
> "has bad taste"
> get expected results with hits on word1.doc and word2.doc
>
> It seems that searching for individual words works fine for both PDF and
> non-pdf files. However, searching on a phrase (enclosed in quotes)
> works on non-pdf files but not on files parsed with the
> LucenePDFDocument class.
>
> Can anyone offer advice?
>
> Below is code for index creation. It is the demo IndexFiles class
> provided with Lucene along with some changes...
>
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.Document;
>
> import java.io.File;
> import java.io.FileNotFoundException;
> import java.io.IOException;
> import java.util.Date;
>
> //import javax.activation.MimetypesFileTypeMap;
>
> import org.pdfbox.searchengine.lucene.LucenePDFDocument;
>
>
> class IndexFiles {
> public static void main(String[] args) throws IOException {
> String usage = "java " + IndexFiles.class + " <root_directory>";
> if (args.length == 0) {
> System.err.println("Usage: " + usage);
> System.exit(1);
> }
>
> Date start = new Date();
> try {
> IndexWriter writer = new IndexWriter("index", new
> StandardAnalyzer(), true);
> indexDocs(writer, new File(args[0]));
>
> writer.optimize();
> writer.close();
>
> Date end = new Date();
>
> System.out.print(end.getTime() - start.getTime());
> System.out.println(" total milliseconds");
>
> } catch (IOException e) {
> System.out.println(" caught a " + e.getClass() +
> "\n with message: " + e.getMessage());
> }
> }
>
> public static void indexDocs(IndexWriter writer, File file)
> throws IOException {
> // do not try to index files that cannot be read
>
> if (file.canRead()) {
> if (file.isDirectory()) {
> String[] files = file.list();
> // an IO error could occur
> if (files != null) {
> for (int i = 0; i < files.length; i++) {
> indexDocs(writer, new File(file, files[i]));
> }
> }
> } else {
> System.out.println("adding " + file);
> try {
>
> Document doc = null;
> if (file.getName().indexOf(".pdf") >= 0)
> //
> writer.addDocument(LucenePDFDocument.getDocument(file));
> doc = LucenePDFDocument.getDocument(file);
> else
> doc = FileDocument.Document(file);
>
> Field field = null;
> if (file.getPath().indexOf("case1") >=0)
> field = new Field("caseid", "1", false, true, false);
> else if (file.getPath().indexOf("case2") >=0)
> field = new Field("caseid", "2", false, true, false);
> else if (file.getPath().indexOf("case3") >=0)
> field = new Field("caseid", "3", false, true, false);
> else
> field = new Field("caseid", "0", false, true, false);
>
> doc.add(field);
>
> writer.addDocument(doc);
> }
> // at least on windows, some temporary files raise this
> exception with an "access denied" message
> // checking if the file can be read doesn't help
> catch (FileNotFoundException fnfe) {
> ;
> }
> }
> }
> }
> }
>
>
> Here is the SearchFiles class with some minor modifications...
>
> import java.io.IOException;
> import java.io.BufferedReader;
> import java.io.InputStreamReader;
> import java.util.StringTokenizer;
>
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.search.Searcher;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.BooleanQuery;
> import org.apache.lucene.search.PhraseQuery;
> import org.apache.lucene.search.Hits;
> import org.apache.lucene.index.Term;
> import org.apache.lucene.queryParser.QueryParser;
> import org.apache.lucene.queryParser.ParseException;
>
> class SearchFiles {
>
> private static Query getCaseQuery(String line, Analyzer analyzer)
> throws ParseException {
> BooleanQuery bq = new BooleanQuery();
> StringTokenizer st = new StringTokenizer(line);
> Query query = QueryParser.parse(line, "contents", analyzer);
> String caseId = null;
> while (st.hasMoreTokens()) {
> caseId = st.nextToken();
> System.out.println("build case query for " + caseId);
>
> query = QueryParser.parse(caseId, "caseid", analyzer);
> bq.add(query, false, false);
> }
>
> return bq;
> }
> public static void main(String[] args) {
> try {
> Searcher searcher = new IndexSearcher("index");
> Analyzer analyzer = new StandardAnalyzer();
>
> BufferedReader in = new BufferedReader(new
> InputStreamReader(System.in));
> while (true) {
> System.out.print("Query: ");
> String line = in.readLine();
> System.out.print("Cases: ");
> String caseLine = in.readLine();
> Query caseQuery = getCaseQuery(caseLine, analyzer);
>
> if (line.length() == -1)
> break;
>
>
> Query query = QueryParser.parse(line, "contents", analyzer);
> // PhraseQuery query = new PhraseQuery();
> // query.add(new Term("contents",line));
> System.out.println("Searching for: " +
> query.toString("contents"));
> /*
> BooleanQuery wholeQuery = new BooleanQuery();
> wholeQuery.add(caseQuery, true, false);
> wholeQuery.add(query, true, false);
> Hits hits = searcher.search(wholeQuery);
> */
> Hits hits = searcher.search(query);
> System.out.println(hits.length() + " total matching documents");
>
> final int HITS_PER_PAGE = 10;
> for (int start = 0; start < hits.length(); start +=
> HITS_PER_PAGE) {
> int end = Math.min(hits.length(), start + HITS_PER_PAGE);
> for (int i = start; i < end; i++) {
> Document doc = hits.doc(i);
> String path = doc.get("path");
> if (path != null) {
> System.out.println(i + ". " + path);
> } else {
> String url = doc.get("url");
> if (url != null) {
> System.out.println(i + ". " + url);
> System.out.println(" - " + doc.get("title"));
> } else {
> System.out.println(i + ". " + "No path nor URL for this
> document");
> }
> }
> }
>
> if (hits.length() > end) {
> System.out.print("more (y/n) ? ");
> line = in.readLine();
> if (line.length() == 0 || line.charAt(0) == 'n')
> break;
> }
> }
> }
> searcher.close();
>
> } catch (Exception e) {
> System.out.println(" caught a " + e.getClass() +
> "\n with message: " + e.getMessage());
> }
> }
> }
>
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]