Can you run the following command-line application on the PDF to verify that the extracted text is correct?
java org.pdfbox.ExtractText <pdf-file> Ben On Wed, 25 May 2005, Thomas X Hoban wrote: > > > First, I am new to Lucene. > > Is there anyone out there who has had trouble getting hits when running > phrase queries against an index that contains content from PDF files? For > PDF documents, I create the document using > LucenePDFDocument.getDocument(file) and then add it to the index. For > non-pdf documents, I create the document using FileDocument.Document(file). > > For instance, I add documents with the following text: > > pdf1.pdf -- "Dave has good taste" > pdf2.pdf -- "Tom has good taste" > word1.doc -- "Liz has bad taste" > word2.doc -- "Troy has bad taste" > > When I search content for the following strings: > > has good taste > get expected results with hits on pdf1.doc, pdf2.doc, word1.doc and > word2.doc > > "has good taste" > get unexpected result: 0 hits > > "has bad taste" > get expected results with hits on word1.doc and word2.doc > > It seems that searching for individual words works fine for both PDF and > non-pdf files. However, searching on a phrase (enclosed in quotes) works on > non-pdf files but not on files parsed with the LucenePDFDocument class. > > Can anyone offer advice? > > Below is code for index creation. It is the demo IndexFiles class provided > with Lucene along with some changes... 
> > import org.apache.lucene.analysis.standard.StandardAnalyzer; > import org.apache.lucene.index.IndexWriter; > import org.apache.lucene.document.Field; > import org.apache.lucene.document.Document; > > import java.io.File; > import java.io.FileNotFoundException; > import java.io.IOException; > import java.util.Date; > > //import javax.activation.MimetypesFileTypeMap; > > import org.pdfbox.searchengine.lucene.LucenePDFDocument; > > > class IndexFiles { > public static void main(String[] args) throws IOException { > String usage = "java " + IndexFiles.class + " <root_directory>"; > if (args.length == 0) { > System.err.println("Usage: " + usage); > System.exit(1); > } > > Date start = new Date(); > try { > IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), > true); > indexDocs(writer, new File(args[0])); > > writer.optimize(); > writer.close(); > > Date end = new Date(); > > System.out.print(end.getTime() - start.getTime()); > System.out.println(" total milliseconds"); > > } catch (IOException e) { > System.out.println(" caught a " + e.getClass() + > "\n with message: " + e.getMessage()); > } > } > > public static void indexDocs(IndexWriter writer, File file) > throws IOException { > // do not try to index files that cannot be read > > if (file.canRead()) { > if (file.isDirectory()) { > String[] files = file.list(); > // an IO error could occur > if (files != null) { > for (int i = 0; i < files.length; i++) { > indexDocs(writer, new File(file, files[i])); > } > } > } else { > System.out.println("adding " + file); > try { > > Document doc = null; > if (file.getName().indexOf(".pdf") >= 0) > // writer.addDocument(LucenePDFDocument.getDocument(file)); > doc = LucenePDFDocument.getDocument(file); > else > doc = FileDocument.Document(file); > > Field field = null; > if (file.getPath().indexOf("case1") >=0) > field = new Field("caseid", "1", false, true, false); > else if (file.getPath().indexOf("case2") >=0) > field = new Field("caseid", "2", false, 
true, false); > else if (file.getPath().indexOf("case3") >=0) > field = new Field("caseid", "3", false, true, false); > else > field = new Field("caseid", "0", false, true, false); > > doc.add(field); > > writer.addDocument(doc); > } > // at least on windows, some temporary files raise this exception > with an "access denied" message > // checking if the file can be read doesn't help > catch (FileNotFoundException fnfe) { > ; > } > } > } > } > } > > > Here is the SearchFiles class with some minor modifications... > > import java.io.IOException; > import java.io.BufferedReader; > import java.io.InputStreamReader; > import java.util.StringTokenizer; > > import org.apache.lucene.analysis.Analyzer; > import org.apache.lucene.analysis.standard.StandardAnalyzer; > import org.apache.lucene.document.Document; > import org.apache.lucene.search.Searcher; > import org.apache.lucene.search.IndexSearcher; > import org.apache.lucene.search.Query; > import org.apache.lucene.search.BooleanQuery; > import org.apache.lucene.search.PhraseQuery; > import org.apache.lucene.search.Hits; > import org.apache.lucene.index.Term; > import org.apache.lucene.queryParser.QueryParser; > import org.apache.lucene.queryParser.ParseException; > > class SearchFiles { > > private static Query getCaseQuery(String line, Analyzer analyzer) > throws ParseException { > BooleanQuery bq = new BooleanQuery(); > StringTokenizer st = new StringTokenizer(line); > Query query = QueryParser.parse(line, "contents", analyzer); > String caseId = null; > while (st.hasMoreTokens()) { > caseId = st.nextToken(); > System.out.println("build case query for " + caseId); > > query = QueryParser.parse(caseId, "caseid", analyzer); > bq.add(query, false, false); > } > > return bq; > } > public static void main(String[] args) { > try { > Searcher searcher = new IndexSearcher("index"); > Analyzer analyzer = new StandardAnalyzer(); > > BufferedReader in = new BufferedReader(new > InputStreamReader(System.in)); > while (true) { > 
System.out.print("Query: "); > String line = in.readLine(); > System.out.print("Cases: "); > String caseLine = in.readLine(); > Query caseQuery = getCaseQuery(caseLine, analyzer); > > if (line.length() == -1) > break; > > > Query query = QueryParser.parse(line, "contents", analyzer); > // PhraseQuery query = new PhraseQuery(); > // query.add(new Term("contents",line)); > System.out.println("Searching for: " + query.toString("contents")); > /* > BooleanQuery wholeQuery = new BooleanQuery(); > wholeQuery.add(caseQuery, true, false); > wholeQuery.add(query, true, false); > Hits hits = searcher.search(wholeQuery); > */ > Hits hits = searcher.search(query); > System.out.println(hits.length() + " total matching documents"); > > final int HITS_PER_PAGE = 10; > for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) { > int end = Math.min(hits.length(), start + HITS_PER_PAGE); > for (int i = start; i < end; i++) { > Document doc = hits.doc(i); > String path = doc.get("path"); > if (path != null) { > System.out.println(i + ". " + path); > } else { > String url = doc.get("url"); > if (url != null) { > System.out.println(i + ". " + url); > System.out.println(" - " + doc.get("title")); > } else { > System.out.println(i + ". " + "No path nor URL for this > document"); > } > } > } > > if (hits.length() > end) { > System.out.print("more (y/n) ? "); > line = in.readLine(); > if (line.length() == 0 || line.charAt(0) == 'n') > break; > } > } > } > searcher.close(); > > } catch (Exception e) { > System.out.println(" caught a " + e.getClass() + > "\n with message: " + e.getMessage()); > } > } > } > --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]