#!/opt/alt/python311/bin/python3.11
import os
import subprocess
import time
import shutil
from pathlib import Path
import mysql.connector
import re
from PIL import Image, ImageEnhance, ImageFilter

# ================== SETTINGS ==================
DPI_LEVELS = [220, 200, 180, 160, 150]  # rasterization DPIs, tried highest-first until a pass succeeds
PAGES_PER_BATCH = 10        # pages per pdftoppm invocation (bounds memory/temp-disk use)
CONVERSION_TIMEOUT = 300    # seconds allowed for one pdftoppm batch
TESSERACT_TIMEOUT = 280     # seconds allowed for one tesseract page
MIN_TEXT_PER_PAGE = 25      # min cleaned chars for a page to count as non-blank

# Working directories; the BASE_DIR path itself must already exist.
BASE_DIR = Path("/home/dayhanbiz/public_html/biblioteka/тестирование скриптов/Сканирование PDF для базы данных/Скрипты")
LOG_DIR = BASE_DIR / "Logs"
TEMP_DIR = BASE_DIR / "Temporary process files"
FAILED_DIR = BASE_DIR / "Failed"

LOG_DIR.mkdir(exist_ok=True)
TEMP_DIR.mkdir(exist_ok=True)
FAILED_DIR.mkdir(exist_ok=True)

LOG_FILE = LOG_DIR / "ocr_main.log"

LANGUAGES = "rus"  # tesseract "-l" argument (language pack)

def log(message):
    """Print *message* with a timestamp and best-effort append it to LOG_FILE.

    Logging must never take down the worker loop, so file-level failures
    (full disk, permissions, stale NFS handle) are deliberately swallowed;
    the console line is still printed.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{timestamp}] {message}"
    print(line)
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently eaten here.
        pass

# Startup banner (user-facing text kept verbatim).
log("=== OCR v6.4 — Исправленная обработка pdfinfo ===")

# NOTE(review): database credentials are hard-coded in source — move them to
# environment variables or a config file outside the web root.
# autocommit is off: page/word inserts become visible only on db.commit().
db = mysql.connector.connect(
    host='localhost',
    user='dayhanbiz_biblioteka_index',
    password='HN_+@(ngb25642r1',
    database='dayhanbiz_biblioteka_index',
    autocommit=False
)
# dictionary=True makes fetchone() return rows as {column: value} dicts.
cursor = db.cursor(dictionary=True)

def get_page_count(pdf_path):
    """Return the page count of *pdf_path* via the `pdfinfo` CLI, or 0 on failure.

    Parses the "Pages:" line of `pdfinfo` stdout. Returns 0 when the binary is
    missing, the call times out, no page line is found, or the value does not
    parse — callers treat 0 as "skip this document".
    """
    try:
        result = subprocess.run(
            ['pdfinfo', str(pdf_path)],
            capture_output=True, text=True, timeout=30,
        )
        for line in result.stdout.splitlines():
            # Anchor on the line start: more precise than a substring test.
            if line.startswith("Pages:"):
                return int(line.split(":", 1)[1].strip())
        return 0
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: pdfinfo not installed / not executable.
        # SubprocessError: covers TimeoutExpired.
        # ValueError: malformed page number.
        # Narrowed from a bare `except:` so Ctrl-C still propagates.
        return 0

# (remaining helper functions clean_word, clean_russian_text, preprocess_image — unchanged)
def clean_word(word):
    """Normalize a token for the word index.

    Lowercases and trims the token, then accepts it only if it is 3-35
    characters long and consists purely of Cyrillic/Latin letters, hyphens,
    or apostrophes. Returns the normalized word, or None when rejected.
    """
    candidate = word.lower().strip()
    if not (3 <= len(candidate) <= 35):
        return None
    return candidate if re.match(r'^[а-яёa-z\-\']+$', candidate) else None

def clean_russian_text(text):
    """Extract Cyrillic runs of 3+ letters from *text*, joined by single spaces."""
    kept = (run for run in re.findall(r'[А-Яа-яЁё]+', text) if len(run) >= 3)
    return ' '.join(kept)

def preprocess_image(img_path):
    """Binarize a scanned page image for OCR; return the processed file's Path.

    Pipeline: grayscale -> contrast x2.0 -> brightness x1.3 -> median
    denoise -> hard threshold at 140. On any processing error the original
    path is returned so OCR can still run on the raw image.
    """
    try:
        img = Image.open(img_path).convert('L')
        img = ImageEnhance.Contrast(img).enhance(2.0)
        img = ImageEnhance.Brightness(img).enhance(1.3)
        img = img.filter(ImageFilter.MedianFilter())
        # Threshold to pure black/white, then convert back to 8-bit
        # grayscale: JPEG support for 1-bit images varies by Pillow
        # version, and a failed save here was previously swallowed
        # silently, skipping preprocessing entirely. Pixels stay 0/255.
        img = img.point(lambda x: 0 if x < 140 else 255, '1').convert('L')
        proc_path = img_path.with_name(img_path.stem + "_proc.jpg")
        img.save(proc_path, quality=95)
        return proc_path
    except Exception:
        # Best-effort fallback (narrowed from a bare `except:` so
        # KeyboardInterrupt still propagates).
        return img_path

# ===================== MAIN LOOP =====================
# Worker daemon: claim one unprocessed PDF, rasterize it in batches, OCR each
# page, store page text and a per-word index, then clean up temp files.
while True:
    log("🔍 Поиск следующего PDF...")

    # A book counts as "done" once it has at least one row in `pages`.
    # NOTE(review): a book that fails below is never marked processed, so this
    # query re-selects the same failing book forever (busy retry loop) —
    # confirm whether a status flag should be written on failure.
    cursor.execute("""
        SELECT id, full_path, title 
        FROM books 
        WHERE type = 'pdf' 
          AND id NOT IN (SELECT DISTINCT book_id FROM pages WHERE is_pdf_page = 1)
        LIMIT 1
    """)
    book = cursor.fetchone()

    if not book:
        # Nothing left to do; poll again in a minute.
        log("✅ Все PDF обработаны. Ожидание 60 сек...")
        time.sleep(60)
        continue

    book_id = book['id']
    pdf_path = Path(book['full_path'])
    title = book['title']

    log(f"▶ Начинаем: {title} (ID: {book_id})")

    total_pages = get_page_count(pdf_path)
    if total_pages == 0:
        # Unreadable/corrupt PDF: archive a copy for manual inspection.
        log(f"❌ Не удалось получить количество страниц: {title}")
        shutil.copy2(pdf_path, FAILED_DIR / pdf_path.name)
        continue

    log(f"   Страниц в документе: {total_pages}")

    # Try progressively lower DPI levels until one full pass succeeds.
    # NOTE(review): a retry at lower DPI restarts from page 1, so pages already
    # committed by the failed pass get inserted again — verify a uniqueness
    # constraint exists on (book_id, page_number).
    success = False
    for dpi in DPI_LEVELS:
        log(f"   Попытка DPI = {dpi}")
        try:
            # Rasterize in small batches to bound memory and temp-disk use.
            for start in range(1, total_pages + 1, PAGES_PER_BATCH):
                end = min(start + PAGES_PER_BATCH - 1, total_pages)
                log(f"     Страницы {start}-{end}")

                prefix = TEMP_DIR / f"page_{book_id}_{start}"
                subprocess.run([
                    'pdftoppm', '-jpeg', '-r', str(dpi), '-f', str(start), '-l', str(end),
                    str(pdf_path), str(prefix)
                ], check=True, timeout=CONVERSION_TIMEOUT)

                # pdftoppm names its outputs "<prefix>-<page>.jpg".
                for jpg in sorted(TEMP_DIR.glob(f"page_{book_id}_{start}-*.jpg")):
                    page_num = int(jpg.stem.split('-')[-1])

                    processed = preprocess_image(jpg)
                    txt_base = TEMP_DIR / f"ocr_{book_id}_{page_num}"

                    # tesseract appends ".txt" to the output base itself.
                    subprocess.run([
                        'tesseract', str(processed), str(txt_base), '-l', LANGUAGES, '--psm', '6'
                    ], timeout=TESSERACT_TIMEOUT)

                    txt_file = txt_base.with_suffix('.txt')
                    raw_text = txt_file.read_text(encoding='utf-8', errors='ignore').strip() if txt_file.exists() else ""
                    cleaned = clean_russian_text(raw_text)

                    # Pages with too little recognized text are treated as blank/noise.
                    if len(cleaned) >= MIN_TEXT_PER_PAGE:
                        snippet = cleaned[:750]
                        cursor.execute("""
                            INSERT INTO pages (book_id, page_or_file, page_number, is_pdf_page, snippet, full_content)
                            VALUES (%s, %s, %s, 1, %s, %s)
                        """, (book_id, f"page_{page_num}", page_num, snippet, cleaned))
                        db.commit()

                        # Build the per-word search index for this page.
                        # NOTE(review): page_num is stored into word_index.page_id —
                        # if page_id should reference pages.id, use cursor.lastrowid.
                        # NOTE(review): raw_text.find(w) locates only the FIRST
                        # occurrence, so repeated words all share the same context.
                        # NOTE(review): these inserts are only committed by a LATER
                        # db.commit() (next page), so the final page's word rows can
                        # be lost on crash; `wi` cursors are also never closed.
                        for w in re.split(r'[\s\.,;:!?()«»\-]+', raw_text):
                            if clean_w := clean_word(w):
                                context = raw_text[max(0, raw_text.find(w)-60):raw_text.find(w)+120]
                                wi = db.cursor()
                                wi.execute("""
                                    INSERT INTO word_index (word, book_id, page_id, context)
                                    VALUES (%s, %s, %s, %s)
                                """, (clean_w, book_id, page_num, context[:180]))

            success = True
            break

        except Exception as e:
            # Any failure in this pass (conversion, OCR, DB) falls through to
            # the next, lower DPI level.
            log(f"   Ошибка DPI={dpi}: {e}")

    if success:
        log(f"✅ УСПЕШНО: {title}")
    else:
        # All DPI levels failed: archive a copy for manual inspection.
        log(f"❌ Не удалось обработать: {title}")
        try:
            shutil.copy2(pdf_path, FAILED_DIR / pdf_path.name)
        except Exception as e:
            log(f"   Не удалось скопировать в Failed: {e}")

    # Cleanup: remove this book's temporary page images and OCR text files.
    for f in TEMP_DIR.glob(f"*_{book_id}_*"):
        f.unlink(missing_ok=True)

    time.sleep(3)
