Can you run the following command line application on the PDF to verify
that the extracted text is correct?
java org.pdfbox.ExtractText <pdf-file>
Ben
On Wed, 25 May 2005, Thomas X Hoban wrote:
>
>
> First, I am new to Lucene.
>
> Is there anyone out there who has had trouble getting hits when running
> phrase queries against an index that contains content from PDF files.
> For PDF documents, I create the document using
> LucenePDFDocument.getDocument(file) and then add it to the index. For
> non-pdf documents, I create the document using
> FileDocument.Document(file).
>
> For instance, I add documents with the following text:
>
> pdf1.pdf -- "Dave has good taste"
> pdf2.pdf -- "Tom has good taste"
> word1.doc -- "Liz has bad taste"
> word2.doc -- "Troy has bad taste"
>
> When I search content for the following strings:
>
> has good taste
> get expected results with hits on pdf1.pdf, pdf2.pdf, word1.doc
> and word2.doc
>
> "has good taste"
> get unexpected result: 0 hits
>
> "has bad taste"
> get expected results with hits on word1.doc and word2.doc
>
> It seems that searching for individual words works fine for both PDF and
> non-pdf files. However, searching on a phrase (enclosed in quotes)
> works on non-pdf files but not on files parsed with the
> LucenePDFDocument class.
>
> Can anyone offer advice?
>
> Below is code for index creation. It is the demo IndexFiles class
> provided with Lucene along with some changes...
>
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.Document;
>
> import java.io.File;
> import java.io.FileNotFoundException;
> import java.io.IOException;
> import java.util.Date;
>
> //import javax.activation.MimetypesFileTypeMap;
>
> import org.pdfbox.searchengine.lucene.LucenePDFDocument;
>
>
> class IndexFiles {
> public static void main(String[] args) throws IOException {
> String usage = "java " + IndexFiles.class + " <root_directory>";
> if (args.length == 0) {
> System.err.println("Usage: " + usage);
> System.exit(1);
> }
>
> Date start = new Date();
> try {
> IndexWriter writer = new IndexWriter("index", new
> StandardAnalyzer(), true);
> indexDocs(writer, new File(args[0]));
>
> writer.optimize();
> writer.close();
>
> Date end = new Date();
>
> System.out.print(end.getTime() - start.getTime());
> System.out.println(" total milliseconds");
>
> } catch (IOException e) {
> System.out.println(" caught a " + e.getClass() +
> "\n with message: " + e.getMessage());
> }
> }
>
> public static void indexDocs(IndexWriter writer, File file)
> throws IOException {
> // do not try to index files that cannot be read
>
> if (file.canRead()) {
> if (file.isDirectory()) {
> String[] files = file.list();
> // an IO error could occur
> if (files != null) {
> for (int i = 0; i < files.length; i++) {
> indexDocs(writer, new File(file, files[i]));
> }
> }
> } else {
> System.out.println("adding " + file);
> try {
>
> Document doc = null;
> if (file.getName().indexOf(".pdf") >= 0)
> //
> writer.addDocument(LucenePDFDocument.getDocument(file));
> doc = LucenePDFDocument.getDocument(file);
> else
> doc = FileDocument.Document(file);
>
> Field field = null;
> if (file.getPath().indexOf("case1") >=0)
> field = new Field("caseid", "1", false, true, false);
> else if (file.getPath().indexOf("case2") >=0)
> field = new Field("caseid", "2", false, true, false);
> else if (file.getPath().indexOf("case3") >=0)
> field = new Field("caseid", "3", false, true, false);
> else
> field = new Field("caseid", "0", false, true, false);
>
> doc.add(field);
>
> writer.addDocument(doc);
> }
> // at least on windows, some temporary files raise this
> exception with an "access denied" message
> // checking if the file can be read doesn't help
> catch (FileNotFoundException fnfe) {
> ;
> }
> }
> }
> }
> }
>
>
> Here is the SearchFiles class with some minor modifications...
>
> import java.io.IOException;
> import java.io.BufferedReader;
> import java.io.InputStreamReader;
> import java.util.StringTokenizer;
>
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.search.Searcher;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.BooleanQuery;
> import org.apache.lucene.search.PhraseQuery;
> import org.apache.lucene.search.Hits;
> import org.apache.lucene.index.Term;
> import org.apache.lucene.queryParser.QueryParser;
> import org.apache.lucene.queryParser.ParseException;
>
> class SearchFiles {
>
> private static Query getCaseQuery(String line, Analyzer analyzer)
> throws ParseException {
> BooleanQuery bq = new BooleanQuery();
> StringTokenizer st = new StringTokenizer(line);
> Query query = QueryParser.parse(line, "contents", analyzer);
> String caseId = null;
> while (st.hasMoreTokens()) {
> caseId = st.nextToken();
> System.out.println("build case query for " + caseId);
>
> query = QueryParser.parse(caseId, "caseid", analyzer);
> bq.add(query, false, false);
> }
>
> return bq;
> }
> public static void main(String[] args) {
> try {
> Searcher searcher = new IndexSearcher("index");
> Analyzer analyzer = new StandardAnalyzer();
>
> BufferedReader in = new BufferedReader(new
> InputStreamReader(System.in));
> while (true) {
> System.out.print("Query: ");
> String line = in.readLine();
> System.out.print("Cases: ");
> String caseLine = in.readLine();
> Query caseQuery = getCaseQuery(caseLine, analyzer);
>
> if (line.length() == -1)
> break;
>
>
> Query query = QueryParser.parse(line, "contents", analyzer);
> // PhraseQuery query = new PhraseQuery();
> // query.add(new Term("contents",line));
> System.out.println("Searching for: " +
> query.toString("contents"));
> /*
> BooleanQuery wholeQuery = new BooleanQuery();
> wholeQuery.add(caseQuery, true, false);
> wholeQuery.add(query, true, false);
> Hits hits = searcher.search(wholeQuery);
> */
> Hits hits = searcher.search(query);
> System.out.println(hits.length() + " total matching documents");
>
> final int HITS_PER_PAGE = 10;
> for (int start = 0; start < hits.length(); start +=
> HITS_PER_PAGE) {
> int end = Math.min(hits.length(), start + HITS_PER_PAGE);
> for (int i = start; i < end; i++) {
> Document doc = hits.doc(i);
> String path = doc.get("path");
> if (path != null) {
> System.out.println(i + ". " + path);
> } else {
> String url = doc.get("url");
> if (url != null) {
> System.out.println(i + ". " + url);
> System.out.println(" - " + doc.get("title"));
> } else {
> System.out.println(i + ". " + "No path nor URL for this
> document");
> }
> }
> }
>
> if (hits.length() > end) {
> System.out.print("more (y/n) ? ");
> line = in.readLine();
> if (line.length() == 0 || line.charAt(0) == 'n')
> break;
> }
> }
> }
> searcher.close();
>
> } catch (Exception e) {
> System.out.println(" caught a " + e.getClass() +
> "\n with message: " + e.getMessage());
> }
> }
> }
>
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]