Also, which analyzer are you using when indexing your documents? On 5/25/05, Ben Litchfield <[EMAIL PROTECTED]> wrote: > > Can you run the following command line application on the PDF to verify > that the extracted text is correct > > java org.pdfbox.ExtractText <pdf-file> > > Ben > > > > On Wed, 25 May 2005, Thomas X Hoban wrote: > > > > > > > First, I am new to Lucene. > > > > Is there anyone out there who has had trouble getting hits when running > > phrase queries against an index that contains content from PDF files. For > > PDF documents, I create the document using > > LucenePDFDocument.getDocument(file) and then add it to the index. For > > non-pdf documents, I create the document using FileDocument.Document(file). > > > > For instance, I add documents with the following text: > > > > pdf1.pdf -- "Dave has good taste" > > pdf2.pdf -- "Tom has good taste" > > word1.doc -- "Liz has bad taste" > > word2.doc -- "Troy has bad taste" > > > > When I search content for the following strings: > > > > has good taste > > get expected results with hits on pdf1.doc, pdf2.doc, word1.doc and > > word2.doc > > > > "has good taste" > > get unexpected result: 0 hits > > > > "has bad taste" > > get expected results with hits on word1.doc and word2.doc > > > > It seems that searching for individual words works fine for both PDF and > > non-pdf files. However, searching on a phrase (enclosed in quotes) works > > on non-pdf files but not on files parsed with the LucenePDFDocument class. > > > > Can anyone offer advice? > > > > Below is code for index creation. It is the demo IndexFiles class provided > > with Lucene along with some changes... 
> > > > import org.apache.lucene.analysis.standard.StandardAnalyzer; > > import org.apache.lucene.index.IndexWriter; > > import org.apache.lucene.document.Field; > > import org.apache.lucene.document.Document; > > > > import java.io.File; > > import java.io.FileNotFoundException; > > import java.io.IOException; > > import java.util.Date; > > > > //import javax.activation.MimetypesFileTypeMap; > > > > import org.pdfbox.searchengine.lucene.LucenePDFDocument; > > > > > > class IndexFiles { > > public static void main(String[] args) throws IOException { > > String usage = "java " + IndexFiles.class + " <root_directory>"; > > if (args.length == 0) { > > System.err.println("Usage: " + usage); > > System.exit(1); > > } > > > > Date start = new Date(); > > try { > > IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), > > true); > > indexDocs(writer, new File(args[0])); > > > > writer.optimize(); > > writer.close(); > > > > Date end = new Date(); > > > > System.out.print(end.getTime() - start.getTime()); > > System.out.println(" total milliseconds"); > > > > } catch (IOException e) { > > System.out.println(" caught a " + e.getClass() + > > "\n with message: " + e.getMessage()); > > } > > } > > > > public static void indexDocs(IndexWriter writer, File file) > > throws IOException { > > // do not try to index files that cannot be read > > > > if (file.canRead()) { > > if (file.isDirectory()) { > > String[] files = file.list(); > > // an IO error could occur > > if (files != null) { > > for (int i = 0; i < files.length; i++) { > > indexDocs(writer, new File(file, files[i])); > > } > > } > > } else { > > System.out.println("adding " + file); > > try { > > > > Document doc = null; > > if (file.getName().indexOf(".pdf") >= 0) > > // writer.addDocument(LucenePDFDocument.getDocument(file)); > > doc = LucenePDFDocument.getDocument(file); > > else > > doc = FileDocument.Document(file); > > > > Field field = null; > > if (file.getPath().indexOf("case1") >=0) > > 
field = new Field("caseid", "1", false, true, false); > > else if (file.getPath().indexOf("case2") >=0) > > field = new Field("caseid", "2", false, true, false); > > else if (file.getPath().indexOf("case3") >=0) > > field = new Field("caseid", "3", false, true, false); > > else > > field = new Field("caseid", "0", false, true, false); > > > > doc.add(field); > > > > writer.addDocument(doc); > > } > > // at least on windows, some temporary files raise this exception > > with an "access denied" message > > // checking if the file can be read doesn't help > > catch (FileNotFoundException fnfe) { > > ; > > } > > } > > } > > } > > } > > > > > > Here is the SearchFiles class with some minor modifications... > > > > import java.io.IOException; > > import java.io.BufferedReader; > > import java.io.InputStreamReader; > > import java.util.StringTokenizer; > > > > import org.apache.lucene.analysis.Analyzer; > > import org.apache.lucene.analysis.standard.StandardAnalyzer; > > import org.apache.lucene.document.Document; > > import org.apache.lucene.search.Searcher; > > import org.apache.lucene.search.IndexSearcher; > > import org.apache.lucene.search.Query; > > import org.apache.lucene.search.BooleanQuery; > > import org.apache.lucene.search.PhraseQuery; > > import org.apache.lucene.search.Hits; > > import org.apache.lucene.index.Term; > > import org.apache.lucene.queryParser.QueryParser; > > import org.apache.lucene.queryParser.ParseException; > > > > class SearchFiles { > > > > private static Query getCaseQuery(String line, Analyzer analyzer) > > throws ParseException { > > BooleanQuery bq = new BooleanQuery(); > > StringTokenizer st = new StringTokenizer(line); > > Query query = QueryParser.parse(line, "contents", analyzer); > > String caseId = null; > > while (st.hasMoreTokens()) { > > caseId = st.nextToken(); > > System.out.println("build case query for " + caseId); > > > > query = QueryParser.parse(caseId, "caseid", analyzer); > > bq.add(query, false, false); > > } > > > 
> return bq; > > } > > public static void main(String[] args) { > > try { > > Searcher searcher = new IndexSearcher("index"); > > Analyzer analyzer = new StandardAnalyzer(); > > > > BufferedReader in = new BufferedReader(new > > InputStreamReader(System.in)); > > while (true) { > > System.out.print("Query: "); > > String line = in.readLine(); > > System.out.print("Cases: "); > > String caseLine = in.readLine(); > > Query caseQuery = getCaseQuery(caseLine, analyzer); > > > > if (line.length() == -1) > > break; > > > > > > Query query = QueryParser.parse(line, "contents", analyzer); > > // PhraseQuery query = new PhraseQuery(); > > // query.add(new Term("contents",line)); > > System.out.println("Searching for: " + query.toString("contents")); > > /* > > BooleanQuery wholeQuery = new BooleanQuery(); > > wholeQuery.add(caseQuery, true, false); > > wholeQuery.add(query, true, false); > > Hits hits = searcher.search(wholeQuery); > > */ > > Hits hits = searcher.search(query); > > System.out.println(hits.length() + " total matching documents"); > > > > final int HITS_PER_PAGE = 10; > > for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) { > > int end = Math.min(hits.length(), start + HITS_PER_PAGE); > > for (int i = start; i < end; i++) { > > Document doc = hits.doc(i); > > String path = doc.get("path"); > > if (path != null) { > > System.out.println(i + ". " + path); > > } else { > > String url = doc.get("url"); > > if (url != null) { > > System.out.println(i + ". " + url); > > System.out.println(" - " + doc.get("title")); > > } else { > > System.out.println(i + ". " + "No path nor URL for this > > document"); > > } > > } > > } > > > > if (hits.length() > end) { > > System.out.print("more (y/n) ? 
"); > > line = in.readLine(); > > if (line.length() == 0 || line.charAt(0) == 'n') > > break; > > } > > } > > } > > searcher.close(); > > > > } catch (Exception e) { > > System.out.println(" caught a " + e.getClass() + > > "\n with message: " + e.getMessage()); > > } > > } > > } > > > > --------------------------------------------------------------------- > To unsubscribe, e-mail: [EMAIL PROTECTED] > For additional commands, e-mail: [EMAIL PROTECTED] > >
-- ___________________________________________________ Chris Fraschetti e [EMAIL PROTECTED] --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]