Hi everyone, I'm very new here. How can I raise the image quality of a PDF for OCR? What can I do about this? (An example file is attached.) I try to keep the quality at 300 dpi and load the rendered page into the Pix class. Here is my sample code. It outputs 0.52% when reading; I would like to raise it to 80-85%.
// Collects the text of every page of "chet6.pdf" and writes it to "result.txt".
// Pages that already carry searchable text are copied as-is; all other pages are
// rendered to a 300 dpi PNG and recognized with Tesseract ("rus+eng", LSTM only).
var documentText = new StringBuilder();

using (var pdf = new PdfDocument("chet6.pdf"))
using (var engine = new TesseractEngine(@"tessdata", "rus+eng", EngineMode.LstmOnly))
{
    for (int i = 0; i < pdf.PageCount; ++i)
    {
        // Separate the text of consecutive pages with a blank line.
        if (documentText.Length > 0)
        {
            documentText.Append("\r\n\r\n");
        }

        PdfPage page = pdf.Pages[i];
        string searchableText = page.GetText();

        //foreach (PdfImage image in page.GetImages())
        //{
        //    // simple hack to replace the right-bottom image only
        //    if (image.Height == 512)
        //        image.ReplaceWith("1px.png");
        //}

        // Simple check if the page contains searchable text.
        // We do not need to perform OCR in that case.
        // IsNullOrWhiteSpace also covers a null result, which the previous
        // searchableText.Trim() call would have turned into a NullReferenceException.
        if (!string.IsNullOrWhiteSpace(searchableText))
        {
            documentText.Append(searchableText);
            continue;
        }

        // Save PDF page as high-resolution image
        PdfDrawOptions options = PdfDrawOptions.Create();
        options.BackgroundColor = new PdfRgbColor(255, 255, 255);
        options.HorizontalResolution = 300;
        options.VerticalResolution = 300;

        string pageImage = $"page_{i}.png";
        try
        {
            page.Save(pageImage, options);

            //page.Rotation = PdfRotation.None;
            //page.Save(pageImage, options);

            // Perform OCR
            using (Pix img = Pix.LoadFromFile(pageImage))
            //using (Page recognizedPage = engine.Process(img, PageSegMode.SingleBlock))
            using (Page recognizedPage = engine.Process(img))
            {
                // NOTE(review): GetMeanConfidence() presumably returns a fraction in [0, 1],
                // so a printed 0.52 would mean 52%, not 0.52% - confirm against the wrapper docs.
                Console.WriteLine($"Mean confidence for page #{i}: {recognizedPage.GetMeanConfidence()}");

                string recognizedText = recognizedPage.GetText();
                documentText.Append(recognizedText);
            }
        }
        finally
        {
            // Remove the temporary page image even when rendering or OCR throws.
            File.Delete(pageImage);
        }
    }
}

using (var writer = new StreamWriter("result.txt"))
{
    writer.Write(documentText.ToString());
}
} // NOTE(review): unmatched in the post - presumably closes the enclosing method, whose header was not included.

-- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/44e20de8-6ca3-4a0a-a797-f37aa672b09en%40googlegroups.com.
chet6.pdf
Description: Adobe PDF document