#!/opt/alt/python311/bin/python3.11
import os
import subprocess
import time
import shutil
from pathlib import Path
import mysql.connector
import re
from PIL import Image, ImageEnhance, ImageFilter

# ================== SETTINGS ==================
DPI_LEVELS = [220, 200, 180, 160, 150]  # rasterization DPIs, tried highest-first
PAGES_PER_BATCH = 15                    # pages per pdftoppm invocation
CONVERSION_TIMEOUT = 250                # seconds allowed per pdftoppm batch
TESSERACT_TIMEOUT = 230                 # seconds allowed per tesseract page
MIN_TEXT_PER_PAGE = 25                  # min cleaned chars for a page to be stored

# Point tesseract at its language-data directory.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract/tessdata/'

BASE_DIR = Path("/home/dayhanbiz/public_html/biblioteka/admin")
LOG_DIR = BASE_DIR / "logs"
LOG_FILE = LOG_DIR / "ocr_log.txt"
TEMP_DIR = Path("/home/dayhanbiz/ocr_temp")       # scratch space for page images/text
FAILED_DIR = Path("/home/dayhanbiz/failed_pdfs")  # PDFs copied here when OCR fails

# NOTE(review): mkdir(exist_ok=True) does not create missing parents;
# these paths are assumed to live under already-existing directories.
LOG_DIR.mkdir(exist_ok=True)
TEMP_DIR.mkdir(exist_ok=True)
FAILED_DIR.mkdir(exist_ok=True)

LANGUAGES = "rus"  # tesseract language pack(s), passed via -l

def log(message):
    """Print *message* with a timestamp and append it to LOG_FILE.

    Console output always happens; file-write failures are ignored
    (best effort) so logging can never crash the OCR loop.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{timestamp}] {message}"
    print(line)
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed; disk/permission errors stay best-effort.
        pass

log("=== OCR v5.6 — Только Хранилище + word_index ===")

# SECURITY NOTE(review): database credentials are hard-coded in the source;
# move them to environment variables or a config file outside the web root.
db = mysql.connector.connect(
    host='localhost',
    user='dayhanbiz_biblioteka_index',
    password='HN_+@(ngb25642r1',
    database='dayhanbiz_biblioteka_index',
    autocommit=False  # commits are issued explicitly after each page insert
)
cursor = db.cursor(dictionary=True)  # rows come back as dicts (book['id'], ...)

def clean_word(word):
    """Normalize an OCR token for the word index.

    Lower-cases and strips the token, then returns it only if it is
    3-35 characters consisting solely of Cyrillic/Latin letters,
    hyphens, or apostrophes; otherwise returns None.
    """
    # Replaced a leftover PHP-ism: the original guarded
    # `trim(mb_strtolower(word, 'UTF-8'))` behind `'mb_strtolower' in dir()`,
    # which is always False inside this function (and would raise NameError
    # if it were ever True). The plain Python fallback is the real behavior.
    word = word.lower().strip()
    if len(word) < 3 or len(word) > 35:
        return None
    # Letters (Cyrillic incl. ё, or Latin), hyphens and apostrophes only.
    # This pattern also rejects digits, so the original's separate
    # `re.search(r'[0-9]', ...)` check was unreachable and is dropped.
    if not re.match(r"^[а-яёa-z\-']+$", word, re.IGNORECASE):
        return None
    return word

def clean_russian_text(text):
    """Extract Cyrillic words of 3+ letters from *text*, space-joined."""
    tokens = (tok for tok in re.findall(r'[А-Яа-яЁё]+', text) if len(tok) >= 3)
    return ' '.join(tokens)

def preprocess_image(img_path):
    """Enhance a scanned page image for OCR.

    Pipeline: grayscale -> contrast x2.0 -> brightness x1.2 -> median
    denoise -> 1-bit threshold at 140, saved as "<stem>_proc.jpg" beside
    the original.  Returns the processed path, or the original *img_path*
    unchanged if any step fails (best effort).
    """
    try:
        img = Image.open(img_path).convert('L')
        img = ImageEnhance.Contrast(img).enhance(2.0)
        img = ImageEnhance.Brightness(img).enhance(1.2)
        img = img.filter(ImageFilter.MedianFilter())
        # Hard binarization: everything below 140 becomes black.
        img = img.point(lambda x: 0 if x < 140 else 255, '1')
        processed_path = img_path.with_name(img_path.stem + "_proc.jpg")
        img.save(processed_path, quality=95)
        return processed_path
    except Exception:
        # Narrowed from a bare `except:` (which also caught SystemExit /
        # KeyboardInterrupt); fall back to the raw image rather than
        # aborting the page.
        return img_path

# ===== Main worker loop =====
# Forever: pick one PDF from "Хранилище" with no OCR'd pages yet, rasterize
# it in batches with pdftoppm, OCR each page with tesseract, store the page
# text plus a per-word index, then clean up and look for the next file.
while True:
    log("🔍 === ПОИСК СЛЕДУЮЩЕГО ФАЙЛА ===")

    try:
        # One unprocessed PDF: no rows in `pages` with is_pdf_page = 1 yet.
        cursor.execute("""
            SELECT id, full_path, title
            FROM books
            WHERE type = 'pdf'
              AND full_path LIKE '%/Хранилище/%'
              AND id NOT IN (SELECT DISTINCT book_id FROM pages WHERE is_pdf_page = 1)
            LIMIT 1
        """)
        book = cursor.fetchone()
    except Exception as e:
        log(f"❌ Ошибка SQL: {e}")
        time.sleep(10)
        continue

    if not book:
        # Nothing left to process; poll again in a minute.
        log("✅ Все файлы из Хранилища обработаны. Ожидание 60 секунд...")
        time.sleep(60)
        continue

    book_id = book['id']
    pdf_path = book['full_path']
    title = book['title']

    log(f"▶ НАЧИНАЕМ: {title} (ID: {book_id})")

    success = False

    # Try progressively lower DPIs; each retry restarts the whole document.
    # NOTE(review): per-page errors are caught further down, so this loop
    # only falls through to a lower DPI when pdfinfo/pdftoppm itself raises —
    # `success` is set even if individual pages were skipped. Confirm that
    # is the intended retry semantics.
    for dpi in DPI_LEVELS:
        log(f"   Попытка DPI = {dpi}")
        try:
            # Page count parsed from pdfinfo's "Pages: N" line.
            result = subprocess.run(['pdfinfo', pdf_path], capture_output=True, text=True, timeout=30)
            pages_line = [line for line in result.stdout.splitlines() if "Pages:" in line]
            total_pages = int(pages_line[0].split(":")[1].strip()) if pages_line else 0
            log(f"   Страниц: {total_pages}")

            for start_page in range(1, total_pages + 1, PAGES_PER_BATCH):
                end_page = min(start_page + PAGES_PER_BATCH - 1, total_pages)
                log(f"     Пакет {start_page}-{end_page}...")

                # Rasterize this batch to JPEGs named <pattern>-<page>.jpg.
                output_pattern = str(TEMP_DIR / f"page_{book_id}_{start_page}")
                subprocess.run([
                    'pdftoppm', '-jpeg', '-r', str(dpi), '-f', str(start_page), '-l', str(end_page), pdf_path, output_pattern
                ], check=True, timeout=CONVERSION_TIMEOUT)

                jpg_files = sorted(TEMP_DIR.glob(f"page_{book_id}_{start_page}-*.jpg"))

                for jpg_file in jpg_files:
                    # pdftoppm zero-pads the page suffix; fall back to any
                    # digit run so both "-007.jpg" and "-7.jpg" parse.
                    match = re.search(r'-(\d{3})\.jpg$', jpg_file.name) or re.search(r'-(\d+)\.jpg$', jpg_file.name)
                    page_num = int(match.group(1)) if match else 0

                    log(f"       Страница {page_num}...")

                    processed_img = preprocess_image(jpg_file)
                    txt_base = TEMP_DIR / f"ocr_{book_id}_{page_num}"

                    try:
                        # tesseract writes its output to <txt_base>.txt.
                        subprocess.run([
                            'tesseract', str(processed_img), str(txt_base), '-l', LANGUAGES, '--psm', '6'
                        ], capture_output=True, timeout=TESSERACT_TIMEOUT)

                        txt_file = txt_base.with_suffix('.txt')
                        raw_text = txt_file.read_text(encoding='utf-8', errors='ignore').strip() if txt_file.exists() else ""
                        cleaned_text = clean_russian_text(raw_text)

                        log(f"       Страница {page_num}: {len(cleaned_text)} символов")

                        if len(cleaned_text) >= MIN_TEXT_PER_PAGE:
                            snippet = cleaned_text[:750]
                            cursor.execute("""
                                INSERT INTO pages (book_id, page_or_file, page_number, is_pdf_page, snippet, full_content)
                                VALUES (%s, %s, %s, 1, %s, %s)
                            """, (book_id, f"page_{page_num}", page_num, snippet, cleaned_text))
                            db.commit()

                            # === Word-index block ===
                            # NOTE(review): `page_num` is passed as word_index.page_id;
                            # if page_id is meant to reference pages.id, this should be
                            # cursor.lastrowid — confirm against the schema.
                            # NOTE(review): a fresh cursor `wi` is created per word and
                            # never closed, and these inserts are only committed by the
                            # NEXT page's db.commit() (autocommit is off).
                            words = re.split(r'[\s\.,;:!?()«»]+', raw_text)
                            word_count = 0
                            for w in words:
                                if clean_w := clean_word(w):
                                    # NOTE(review): raw_text.find(w) is repeated and can
                                    # return -1, skewing the context window — verify.
                                    context = raw_text[max(0, raw_text.find(w)-60):raw_text.find(w)+100]
                                    wi = db.cursor()
                                    wi.execute("""
                                        INSERT INTO word_index (word, book_id, page_id, context) 
                                        VALUES (%s, %s, %s, %s)
                                    """, (clean_w, book_id, page_num, context[:160]))
                                    word_count += 1
                            log(f"       → Добавлено {word_count} слов в word_index")

                            log(f"       ✓ УСПЕШНО СОХРАНЕНО В БАЗУ")
                        else:
                            log(f"       ⚠️ Пропущено (мало текста)")

                    except Exception as e:
                        # Per-page failure: roll back uncommitted inserts and
                        # continue with the next page of this batch.
                        log(f"       ❌ Ошибка Tesseract/INSERT: {e}")
                        db.rollback()

            success = True
            break

        except Exception as e:
            log(f"   ❌ Ошибка DPI={dpi}: {e}")
            continue

    if success:
        log(f"✅ УСПЕШНО ЗАВЕРШЁН: {title}")
    else:
        # Keep a copy of the failed PDF for manual inspection.
        log(f"❌ Не удалось: {title}")
        shutil.copy2(pdf_path, FAILED_DIR / os.path.basename(pdf_path))

    # Cleanup: remove this book's temp page images (incl. "_proc" variants)
    # and tesseract text output.
    for pattern in [f"page_{book_id}_*", f"ocr_{book_id}_*"]:
        for f in TEMP_DIR.glob(pattern):
            f.unlink(missing_ok=True)

    log("⏳ Пауза 3 секунды и возврат к поиску...")
    time.sleep(3)