Hello! I'm trying to reinstall some code I made with artificial 
intelligence that uses tesseract. I managed to get everything working 
(intelligent scanning of minutes so that they are later renamed and moved 
to folders) but now I have changed offices and there is another machine. 
When trying to install the libraries I get that windows does not recognize 
the installation of tesseract. I downloaded it from 
https://github.com/UB-Mannheim/tesseract/wiki and tried various versions. 
When I run tesseract -v or where tesseract on the command line, it tells me 
"tesseract" is not recognized as an internal or external command, program 
or executable batch file. I tried editing the environment variable, and that 
doesn't work either (I selected the installation path C:\Program 
Files\Tesseract-OCR and checked that the files are there). I also tried to 
open tesseract outside of Python: it opens a window (like a black console) 
that closes automatically. Finally, I tried running it from PyCharm (where I 
have the code), and it gives me this error: 
































*C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\venv\Scripts\python.exe 
C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\main.py Introduce la 
ruta de la carpeta que contiene los archivos PDF: 
C:\Users\UNTREF\Desktop\prueba_excelProcesando archivo: 1-49.pdfTraceback 
(most recent call last):  File 
"C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\venv\Lib\site-packages\pytesseract\pytesseract.py",
 
line 255, in run_tesseract    proc = subprocess.Popen(cmd_args, 
**subprocess_args())          
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^  File 
"C:\Users\UNTREF\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", 
line 1024, in __init__    self._execute_child(args, executable, preexec_fn, 
close_fds,  File 
"C:\Users\UNTREF\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", 
line 1509, in _execute_child    hp, ht, pid, tid = 
_winapi.CreateProcess(executable, args,                      
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^FileNotFoundError: [WinError 2] El 
sistema no puede encontrar el archivo especificadoDuring handling of the 
above exception, another exception occurred:Traceback (most recent call 
last):  File 
"C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\main.py", line 33, in 
<module>    text = pytesseract.image_to_string(page, lang='spa', 
config='--psm 4 --oem 1')          
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^  
File 
"C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\venv\Lib\site-packages\pytesseract\pytesseract.py",
 
line 423, in image_to_string    return {           ^  File 
"C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\venv\Lib\site-packages\pytesseract\pytesseract.py",
 
line 426, in <lambda>    Output.STRING: lambda: run_and_get_output(*args),  
                         ^^^^^^^^^^^^^^^^^^^^^^^^^  File 
"C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\venv\Lib\site-packages\pytesseract\pytesseract.py",
 
line 288, in run_and_get_output    run_tesseract(**kwargs)  File 
"C:\Users\UNTREF\PycharmProjects\obtencion_de_valores\venv\Lib\site-packages\pytesseract\pytesseract.py",
 
line 260, in run_tesseract    raise 
TesseractNotFoundError()pytesseract.pytesseract.TesseractNotFoundError: 
tesseract is not installed or it's not in your PATH. See README file for 
more information.Process finished with exit code 1*

Here is the Python code, although I don't think it is the problem:
import os
import re
import pytesseract
from pdf2image import convert_from_path

# Default location used by the Windows installer mentioned in the message
# (C:\Program Files\Tesseract-OCR). It is only a fallback for when the
# executable cannot be found through PATH.
DEFAULT_TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Search patterns for the fields expected in the OCR text of the first page.
libro_pattern = r'LIBRO:\s+(\d+)'
folio_pattern = r'FOLIO:\s+(\d+)'
no_pattern = r'No\s+(\d+)'
materia_pattern = r'MATERIA\s+:\s+(.*)\n'
docente_pattern = r'DOCENTE\s+:\s+(.*)\n'
fecha_pattern = r'FECHA\s+:\s+(\d{2}/\d{2}/\d{4})\s+'

# (label, pattern) pairs, in the order the values are reported.
FIELD_PATTERNS = (
    ("LIBRO", libro_pattern),
    ("FOLIO", folio_pattern),
    ("No", no_pattern),
    ("MATERIA", materia_pattern),
    ("DOCENTE", docente_pattern),
    ("FECHA", fecha_pattern),
)


def configure_tesseract(candidate=DEFAULT_TESSERACT_EXE):
    """Point pytesseract at *candidate* when tesseract is not on PATH.

    pytesseract launches the ``tesseract`` executable as a subprocess and
    raises ``TesseractNotFoundError`` when it cannot be located. If the
    executable is not reachable through PATH but exists at *candidate*,
    ``pytesseract.pytesseract.tesseract_cmd`` is set to that full path.

    Args:
        candidate: Full path to a tesseract executable to fall back to.

    Returns:
        True if ``tesseract_cmd`` was overridden, False otherwise.
    """
    import shutil  # local import: only needed for this PATH lookup

    if shutil.which("tesseract") is None and os.path.isfile(candidate):
        pytesseract.pytesseract.tesseract_cmd = candidate
        return True
    return False


def extract_fields(text):
    """Return the fields found in *text* as an ordered ``{label: value}`` dict.

    Each pattern in ``FIELD_PATTERNS`` is searched once; labels whose pattern
    does not match are simply absent from the result.

    Args:
        text: OCR output to search.

    Returns:
        dict mapping label to the first captured group, in report order.
    """
    found = {}
    for label, pattern in FIELD_PATTERNS:
        match = re.search(pattern, text)
        if match:
            found[label] = match.group(1)
    return found


def process_folder(pdf_folder):
    """OCR every PDF in *pdf_folder* and print the fields of its first page.

    Args:
        pdf_folder: Path of the folder that contains the PDF files.
    """
    for filename in os.listdir(pdf_folder):
        # Case-insensitive so that files named "*.PDF" are not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        print(f"Procesando archivo: {filename}")

        pdf_path = os.path.join(pdf_folder, filename)

        # Convert each page of the PDF to an image.
        pages = convert_from_path(pdf_path)

        # OCR text of each page, in page order.
        results = [
            pytesseract.image_to_string(page, lang='spa', config='--psm 4 --oem 1')
            for page in pages
        ]

        # Only the first page is searched; guard against a PDF that yields
        # no pages so that results[0] cannot raise IndexError.
        if not results:
            print(f"El archivo {filename} no contiene páginas; se omite.\n")
            continue

        for label, value in extract_fields(results[0]).items():
            print(f"{label}: {value}")

        print(f"Archivo {filename} procesado correctamente.\n")


def main():
    """Prompt for the PDF folder and process every PDF inside it."""
    configure_tesseract()
    pdf_folder = input(
        "Introduce la ruta de la carpeta que contiene los archivos PDF: "
    )
    process_folder(pdf_folder)


if __name__ == "__main__":
    main()

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/823aafc1-062a-4e67-87f4-b5912ec6ffa8n%40googlegroups.com.

Reply via email to