Re: Lucene - PDFBox

Ben Litchfield Wed, 25 May 2005 14:38:33 -0700

Can you run the following command line application on the PDF to verify
that the extracted text is correct?


java org.pdfbox.ExtractText <pdf-file>

Ben



On Wed, 25 May 2005, Thomas X Hoban wrote:

>
>
> First, I am new to Lucene.
>
> Is there anyone out there who has had trouble getting hits when running 
> phrase queries against an index that contains content from PDF files?  For 
> PDF documents, I create the document using 
> LucenePDFDocument.getDocument(file) and then add it to the index.  For 
> non-pdf documents, I create the document using FileDocument.Document(file).
>
> For instance, I add documents with the following text:
>
> pdf1.pdf -- "Dave has good taste"
> pdf2.pdf -- "Tom has good taste"
> word1.doc -- "Liz has bad taste"
> word2.doc -- "Troy has bad taste"
>
> When I search content for the following strings:
>
>     has good taste
>       get expected results with hits on pdf1.pdf, pdf2.pdf, word1.doc and 
> word2.doc
>
>     "has good taste"
>        get unexpected result: 0 hits
>
>     "has bad taste"
>        get expected results with hits on word1.doc and word2.doc
>
> It seems that searching for individual words works fine for both PDF and 
> non-pdf files.  However, searching on a phrase (enclosed in quotes) works on 
> non-pdf files but not on files parsed with the LucenePDFDocument class.
>
> Can anyone offer advice?
>
> Below is code for index creation.  It is the demo IndexFiles class provided 
> with Lucene along with some changes...
>
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.Document;
>
> import java.io.File;
> import java.io.FileNotFoundException;
> import java.io.IOException;
> import java.util.Date;
>
> //import javax.activation.MimetypesFileTypeMap;
>
> import org.pdfbox.searchengine.lucene.LucenePDFDocument;
>
>
> class IndexFiles { // Command-line indexer: walks a directory tree and adds every readable file to a Lucene index.
>   public static void main(String[] args) throws IOException { // args[0] is the root directory to index; prints usage and exits with status 1 when no argument is given
>     String usage = "java " + IndexFiles.class + " <root_directory>";
>     if (args.length == 0) {
>       System.err.println("Usage: " + usage);
>       System.exit(1);
>     }
>
>     Date start = new Date(); // start timestamp for the elapsed-milliseconds report printed below
>     try {
>       IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), 
> true); // the index location is hard-coded to the name index; NOTE(review): the third argument is presumably the create flag - confirm against the IndexWriter API
>       indexDocs(writer, new File(args[0]));
>
>       writer.optimize();
>       writer.close(); // NOTE(review): not reached if indexDocs or optimize throws, since there is no finally block
>
>       Date end = new Date();
>
>       System.out.print(end.getTime() - start.getTime());
>       System.out.println(" total milliseconds");
>
>     } catch (IOException e) { // failures are reported on stdout with exception class and message only
>       System.out.println(" caught a " + e.getClass() +
>        "\n with message: " + e.getMessage());
>     }
>   }
>
>   public static void indexDocs(IndexWriter writer, File file) // Recursively adds file (or, for a directory, everything beneath it) to writer.
>     throws IOException {
>     // do not try to index files that cannot be read
>
>     if (file.canRead()) {
>       if (file.isDirectory()) {
>         String[] files = file.list();
>         // an IO error could occur
>         if (files != null) {
>           for (int i = 0; i < files.length; i++) {
>             indexDocs(writer, new File(file, files[i])); // recurse into each directory entry
>           }
>         }
>       } else {
>         System.out.println("adding " + file);
>         try {
>
>           Document doc = null;
>           if (file.getName().indexOf(".pdf") >= 0) // matches .pdf anywhere in the name and is case-sensitive, so this is not a strict extension check
>               // writer.addDocument(LucenePDFDocument.getDocument(file));
>               doc = LucenePDFDocument.getDocument(file);
>           else
>               doc = FileDocument.Document(file); // every other file goes through the FileDocument helper
>
>           Field field = null; // caseid tag derived from substrings of the file path; falls back to 0 when none match
>           if (file.getPath().indexOf("case1") >=0)
>               field = new Field("caseid", "1", false, true, false); // NOTE(review): the three booleans are presumably store, index, tokenize - confirm against the Field constructor
>           else if (file.getPath().indexOf("case2") >=0)
>               field = new Field("caseid", "2", false, true, false);
>           else if (file.getPath().indexOf("case3") >=0)
>               field = new Field("caseid", "3", false, true, false);
>           else
>               field = new Field("caseid", "0", false, true, false);
>
>           doc.add(field);
>
>           writer.addDocument(doc);
>         }
>         // at least on windows, some temporary files raise this exception 
> with an "access denied" message
>         // checking if the file can be read doesn't help
>         catch (FileNotFoundException fnfe) {
>           ; // intentionally empty: the file is skipped without any message
>         }
>       }
>     }
>   }
> }
>
>
> Here is the SearchFiles class with some minor modifications...
>
> import java.io.IOException;
> import java.io.BufferedReader;
> import java.io.InputStreamReader;
> import java.util.StringTokenizer;
>
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.search.Searcher;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.BooleanQuery;
> import org.apache.lucene.search.PhraseQuery;
> import org.apache.lucene.search.Hits;
> import org.apache.lucene.index.Term;
> import org.apache.lucene.queryParser.QueryParser;
> import org.apache.lucene.queryParser.ParseException;
>
> class SearchFiles { // Interactive console search: reads a query and a case list from stdin and prints matching documents page by page.
>
>   private static Query getCaseQuery(String line, Analyzer analyzer) // Builds a BooleanQuery with one caseid clause per whitespace-separated token of line.
>   throws ParseException {
>       BooleanQuery bq = new BooleanQuery();
>       StringTokenizer st = new StringTokenizer(line);
>       Query query = QueryParser.parse(line, "contents", analyzer); // NOTE(review): this result is overwritten in the loop before it is used; the call only matters in that it can throw ParseException
>       String caseId = null;
>       while (st.hasMoreTokens()) {
>           caseId = st.nextToken();
>           System.out.println("build case query for " + caseId);
>
>           query = QueryParser.parse(caseId, "caseid", analyzer); // each token is parsed as a query against the caseid field
>           bq.add(query, false, false); // NOTE(review): presumably required=false and prohibited=false, i.e. an optional clause - confirm against the BooleanQuery API
>       }
>
>       return bq;
>   }
>   public static void main(String[] args) { // Prompt loop; args is not read
>     try {
>       Searcher searcher = new IndexSearcher("index"); // opens the index at the hard-coded location named index
>       Analyzer analyzer = new StandardAnalyzer();
>
>       BufferedReader in = new BufferedReader(new 
> InputStreamReader(System.in));
>       while (true) {
>         System.out.print("Query: ");
>         String line = in.readLine(); // NOTE(review): readLine returns null at end of input, which is not checked here
>         System.out.print("Cases: ");
>         String caseLine = in.readLine();
>         Query caseQuery = getCaseQuery(caseLine, analyzer); // NOTE(review): caseQuery is only referenced by the commented-out block below, so it does not affect the search
>
>         if (line.length() == -1) // NOTE(review): String.length() is never negative, so this exit condition can never be true
>           break;
>
>
>         Query query = QueryParser.parse(line, "contents", analyzer); // user input is parsed against the contents field
>         // PhraseQuery query = new PhraseQuery();
>         // query.add(new Term("contents",line));
>         System.out.println("Searching for: " + query.toString("contents"));
>         /*
>         BooleanQuery wholeQuery = new BooleanQuery();
>         wholeQuery.add(caseQuery, true, false);
>         wholeQuery.add(query,     true, false);
>         Hits hits = searcher.search(wholeQuery);
>         */
>         Hits hits = searcher.search(query); // the search runs on the contents query alone
>         System.out.println(hits.length() + " total matching documents");
>
>         final int HITS_PER_PAGE = 10; // number of results printed before the more prompt
>         for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
>           int end = Math.min(hits.length(), start + HITS_PER_PAGE);
>           for (int i = start; i < end; i++) {
>             Document doc = hits.doc(i);
>             String path = doc.get("path"); // prefer the path field; fall back to url plus title, then to a placeholder line
>             if (path != null) {
>               System.out.println(i + ". " + path);
>             } else {
>               String url = doc.get("url");
>               if (url != null) {
>                 System.out.println(i + ". " + url);
>                 System.out.println("   - " + doc.get("title"));
>               } else {
>                 System.out.println(i + ". " + "No path nor URL for this 
> document");
>               }
>             }
>           }
>
>           if (hits.length() > end) { // more results remain, so ask whether to show the next page
>             System.out.print("more (y/n) ? ");
>             line = in.readLine();
>             if (line.length() == 0 || line.charAt(0) == 'n')
>               break; // leaves only the paging loop; the outer loop then prompts for a new query
>           }
>         }
>       }
>       searcher.close(); // NOTE(review): reached only if the while loop exits through its break, whose condition is never true
>
>     } catch (Exception e) { // any failure ends the program after printing the exception class and message
>       System.out.println(" caught a " + e.getClass() +
>                          "\n with message: " + e.getMessage());
>     }
>   }
> }
>

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Re: Lucene - PDFBox

Reply via email to