import pytesseract
from PIL import Image
import fitz
from pathlib import Path
import cv2
import numpy as np
import logging
from scripts.dictionary import old_rus_dict

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Location of the Tesseract executable that pytesseract launches. The path is
# hard-coded, so this assumes a system-wide install under /usr/bin
# (NOTE(review): confirm for each deployment target).
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

def preprocess_for_old_russian(img):
    """Turn a page image into a cleaned-up black-and-white image for OCR.

    Steps, in order:
      1. Convert to grayscale when the input has a channel axis (treated as
         BGR); an input without a channel axis is used as-is.
      2. Boost local contrast with CLAHE (clip limit 4.0, 8x8 tile grid).
      3. Binarize with an Otsu-selected global threshold (max value 255).
      4. Apply a morphological opening followed by a closing, both with a
         2x2 all-ones kernel.

    Args:
        img: Image array; either 2-D, or 3-D in BGR channel order.

    Returns:
        The binarized, morphologically filtered image.
    """
    has_channel_axis = len(img.shape) == 3
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if has_channel_axis else img

    equalizer = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8, 8))
    equalized = equalizer.apply(grayscale)

    # With THRESH_OTSU the threshold argument (0) is ignored and computed
    # from the histogram; the first return value is that computed threshold.
    _, mask = cv2.threshold(
        equalized, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )

    structuring_element = np.ones((2, 2), np.uint8)
    # Opening first, then closing, each with the same 2x2 kernel.
    for operation in (cv2.MORPH_OPEN, cv2.MORPH_CLOSE):
        mask = cv2.morphologyEx(mask, operation, structuring_element)

    return mask

def extract_text_with_ocr(pdf_path: Path, is_old_russian=False):
    """Rasterize every page of a PDF, OCR it, and return the combined text.

    Each page is rendered at 3x zoom, passed through
    ``preprocess_for_old_russian`` (this happens in both modes), and
    recognized with Tesseract. Page texts are joined with blank lines and the
    final result is stripped of leading/trailing whitespace.

    Args:
        pdf_path: Path of the PDF to process.
        is_old_russian: When true, recognition uses the ``rus_best`` language
            with ``--psm 6`` and the ``|`` character blacklisted, and the
            result is post-processed with ``old_rus_dict.correct_text``.
            Otherwise the ``rus`` language with ``--psm 3`` is used and no
            dictionary correction is applied.

    Returns:
        The recognized text, or an empty string if any step fails. Failures
        are logged rather than raised.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)

            mode = "СТАРОРУССКИЙ (best)" if is_old_russian else "Современный"
            logger.info("   OCR [%s]: %s страниц", mode, total_pages)

            if is_old_russian:
                lang = 'rus_best'
                config = '--psm 6 -c tessedit_char_blacklist=|'
            else:
                lang = 'rus'
                config = '--psm 3'

            # The render matrix is the same for every page, so build it once.
            zoom = fitz.Matrix(3.0, 3.0)

            # Collect per-page chunks and join once instead of growing a
            # string with += on every iteration.
            chunks = []
            for i in range(total_pages):
                page = doc.load_page(i)
                pix = page.get_pixmap(matrix=zoom)
                img = Image.frombytes(
                    "RGB", [pix.width, pix.height], pix.samples
                )

                opencv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
                processed = preprocess_for_old_russian(opencv_img)

                text = pytesseract.image_to_string(
                    processed, lang=lang, config=config
                )
                chunks.append(text + "\n\n")

                done = i + 1
                # Progress is reported every 10 pages and on the last page.
                if done % 10 == 0 or done == total_pages:
                    logger.info("     Обработано: %s/%s", done, total_pages)
        finally:
            # Release the document even when a page fails mid-way.
            doc.close()

        full_text = "".join(chunks)

        # Apply the dictionary-based correction.
        if is_old_russian:
            full_text = old_rus_dict.correct_text(full_text)

        return full_text.strip()

    except Exception as e:
        # Best-effort contract: log and return an empty string. getattr keeps
        # the handler itself from raising when pdf_path is not a Path object
        # (e.g. a plain string).
        logger.exception(
            "Ошибка OCR %s: %s", getattr(pdf_path, "name", pdf_path), e
        )
        return ""
