Also, which analyzer are you using when indexing your documents? On 5/25/05, Ben Litchfield <[EMAIL PROTECTED]> wrote: > > Can you run the following command line application on the PDF to verify > that the extracted text is correct > > java org.pdfbox.ExtractText <pdf-file> > > Ben > > > > On Wed, 25 May 2005, Thomas X Hoban wrote: > > > > > > > First, I am new to Lucene. > > > > Is there anyone out there who has had trouble getting hits when running > > phrase queries against an index that contains content from PDF files. For > > PDF documents, I create the document using > > LucenePDFDocument.getDocument(file) and then add it to the index. For > > non-pdf documents, I create the document using FileDocument.Document(file). > > > > For instance, I add documents with the following text: > > > > pdf1.pdf -- "Dave has good taste" > > pdf2.pdf -- "Tom has good taste" > > word1.doc -- "Liz has bad taste" > > word2.doc -- "Troy has bad taste" > > > > When I search content for the following strings: > > > > has good taste > > get expected results with hits on pdf1.doc, pdf2.doc, word1.doc and > > word2.doc > > > > "has good taste" > > get unexpected result: 0 hits > > > > "has bad taste" > > get expected results with hits on word1.doc and word2.doc > > > > It seems that searching for individual words works fine for both PDF and > > non-pdf files. However, searching on a phrase (enclosed in quotes) works > > on non-pdf files but not on files parsed with the LucenePDFDocument class. > > > > Can anyone offer advice? > > > > Below is code for index creation. It is the demo IndexFiles class provided > > with Lucene along with some changes... 
> > > > import org.apache.lucene.analysis.standard.StandardAnalyzer; > > import org.apache.lucene.index.IndexWriter; > > import org.apache.lucene.document.Field; > > import org.apache.lucene.document.Document; > > > > import java.io.File; > > import java.io.FileNotFoundException; > > import java.io.IOException; > > import java.util.Date; > > > > //import javax.activation.MimetypesFileTypeMap; > > > > import org.pdfbox.searchengine.lucene.LucenePDFDocument; > > > > > > class IndexFiles { > > public static void main(String[] args) throws IOException { > > String usage = "java " + IndexFiles.class + " <root_directory>"; > > if (args.length == 0) { > > System.err.println("Usage: " + usage); > > System.exit(1); > > } > > > > Date start = new Date(); > > try { > > IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), > > true); > > indexDocs(writer, new File(args[0])); > > > > writer.optimize(); > > writer.close(); > > > > Date end = new Date(); > > > > System.out.print(end.getTime() - start.getTime()); > > System.out.println(" total milliseconds"); > > > > } catch (IOException e) { > > System.out.println(" caught a " + e.getClass() + > > "\n with message: " + e.getMessage()); > > } > > } > > > > public static void indexDocs(IndexWriter writer, File file) > > throws IOException { > > // do not try to index files that cannot be read > > > > if (file.canRead()) { > > if (file.isDirectory()) { > > String[] files = file.list(); > > // an IO error could occur > > if (files != null) { > > for (int i = 0; i < files.length; i++) { > > indexDocs(writer, new File(file, files[i])); > > } > > } > > } else { > > System.out.println("adding " + file); > > try { > > > > Document doc = null; > > if (file.getName().indexOf(".pdf") >= 0) > > // writer.addDocument(LucenePDFDocument.getDocument(file)); > > doc = LucenePDFDocument.getDocument(file); > > else > > doc = FileDocument.Document(file); > > > > Field field = null; > > if (file.getPath().indexOf("case1") >=0) > > 
field = new Field("caseid", "1", false, true, false); > > else if (file.getPath().indexOf("case2") >=0) > > field = new Field("caseid", "2", false, true, false); > > else if (file.getPath().indexOf("case3") >=0) > > field = new Field("caseid", "3", false, true, false); > > else > > field = new Field("caseid", "0", false, true, false); > > > > doc.add(field); > > > > writer.addDocument(doc); > > } > > // at least on windows, some temporary files raise this exception > > with an "access denied" message > > // checking if the file can be read doesn't help > > catch (FileNotFoundException fnfe) { > > ; > > } > > } > > } > > } > > } > > > > > > Here is the SearchFiles class with some minor modifications... > > > > import java.io.IOException; > > import java.io.BufferedReader; > > import java.io.InputStreamReader; > > import java.util.StringTokenizer; > > > > import org.apache.lucene.analysis.Analyzer; > > import org.apache.lucene.analysis.standard.StandardAnalyzer; > > import org.apache.lucene.document.Document; > > import org.apache.lucene.search.Searcher; > > import org.apache.lucene.search.IndexSearcher; > > import org.apache.lucene.search.Query; > > import org.apache.lucene.search.BooleanQuery; > > import org.apache.lucene.search.PhraseQuery; > > import org.apache.lucene.search.Hits; > > import org.apache.lucene.index.Term; > > import org.apache.lucene.queryParser.QueryParser; > > import org.apache.lucene.queryParser.ParseException; > > > > class SearchFiles { > > > > private static Query getCaseQuery(String line, Analyzer analyzer) > > throws ParseException { > > BooleanQuery bq = new BooleanQuery(); > > StringTokenizer st = new StringTokenizer(line); > > Query query = QueryParser.parse(line, "contents", analyzer); > > String caseId = null; > > while (st.hasMoreTokens()) { > > caseId = st.nextToken(); > > System.out.println("build case query for " + caseId); > > > > query = QueryParser.parse(caseId, "caseid", analyzer); > > bq.add(query, false, false); > > } > > > 
> return bq; > > } > > public static void main(String[] args) { > > try { > > Searcher searcher = new IndexSearcher("index"); > > Analyzer analyzer = new StandardAnalyzer(); > > > > BufferedReader in = new BufferedReader(new > > InputStreamReader(System.in)); > > while (true) { > > System.out.print("Query: "); > > String line = in.readLine(); > > System.out.print("Cases: "); > > String caseLine = in.readLine(); > > Query caseQuery = getCaseQuery(caseLine, analyzer); > > > > if (line.length() == -1) > > break; > > > > > > Query query = QueryParser.parse(line, "contents", analyzer); > > // PhraseQuery query = new PhraseQuery(); > > // query.add(new Term("contents",line)); > > System.out.println("Searching for: " + query.toString("contents")); > > /* > > BooleanQuery wholeQuery = new BooleanQuery(); > > wholeQuery.add(caseQuery, true, false); > > wholeQuery.add(query, true, false); > > Hits hits = searcher.search(wholeQuery); > > */ > > Hits hits = searcher.search(query); > > System.out.println(hits.length() + " total matching documents"); > > > > final int HITS_PER_PAGE = 10; > > for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) { > > int end = Math.min(hits.length(), start + HITS_PER_PAGE); > > for (int i = start; i < end; i++) { > > Document doc = hits.doc(i); > > String path = doc.get("path"); > > if (path != null) { > > System.out.println(i + ". " + path); > > } else { > > String url = doc.get("url"); > > if (url != null) { > > System.out.println(i + ". " + url); > > System.out.println(" - " + doc.get("title")); > > } else { > > System.out.println(i + ". " + "No path nor URL for this > > document"); > > } > > } > > } > > > > if (hits.length() > end) { > > System.out.print("more (y/n) ? 
"); > > line = in.readLine(); > > if (line.length() == 0 || line.charAt(0) == 'n') > > break; > > } > > } > > } > > searcher.close(); > > > > } catch (Exception e) { > > System.out.println(" caught a " + e.getClass() + > > "\n with message: " + e.getMessage()); > > } > > } > > } > > > > --------------------------------------------------------------------- > To unsubscribe, e-mail: [EMAIL PROTECTED] > For additional commands, e-mail: [EMAIL PROTECTED] > >
-- ___________________________________________________ Chris Fraschetti e [EMAIL PROTECTED] --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]