#!/opt/alt/python311/bin/python3.11
import os
import sys
import subprocess
import time
from pathlib import Path
import mysql.connector
from pdf2image import convert_from_path

# Resolution (dots per inch) used when rasterising PDF pages for OCR.
DPI = 220

# Log location: <BASE_DIR>/logs/ocr_log.txt
BASE_DIR = Path("/home/dayhanbiz/public_html/biblioteka/admin")
LOG_DIR = BASE_DIR.joinpath("logs")
LOG_FILE = LOG_DIR.joinpath("ocr_log.txt")

# Scratch directory for per-page PNG images and tesseract text output.
TEMP_DIR = Path("/home/dayhanbiz/ocr_temp")

# Create the working directories up front; their parents must already exist.
for _required_dir in (LOG_DIR, TEMP_DIR):
    _required_dir.mkdir(exist_ok=True)

def log(message):
    """Print a timestamped message and append it to LOG_FILE.

    The line is formatted as ``[YYYY-MM-DD HH:MM:SS] message`` using local
    time. Writing to the log file is best-effort: filesystem errors are
    ignored so that a logging problem never stops the OCR run.

    Args:
        message: Text to log.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{timestamp}] {message}"
    print(line)
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Only I/O failures are swallowed; a bare ``except`` would also hide
        # KeyboardInterrupt/SystemExit and genuine programming errors.
        pass

# Startup banner.
log("=== НОВЫЙ OCR — Постраничное сохранение PDF ===")
log(f"DPI = {DPI} | Автоматический режим")

# SECURITY NOTE(review): database credentials are embedded in this file.
# Each setting can be overridden through an environment variable so the
# secret can be moved out of the source; the literals below remain only as
# backward-compatible defaults and should be removed (and the password
# rotated) once the environment is configured.
db = mysql.connector.connect(
    host=os.environ.get('OCR_DB_HOST', 'localhost'),
    user=os.environ.get('OCR_DB_USER', 'dayhanbiz_biblioteka_index'),
    password=os.environ.get('OCR_DB_PASSWORD', 'HN_+@(ngb25642r1'),
    database=os.environ.get('OCR_DB_NAME', 'dayhanbiz_biblioteka_index')
)
# Rows are returned as dicts keyed by column name.
cursor = db.cursor(dictionary=True)

def _fetch_next_book(skipped_ids):
    """Return one PDF book that has no OCR pages yet, or None.

    Args:
        skipped_ids: Book ids to exclude (books that already failed or
            produced no text during this run).
    """
    params = tuple(skipped_ids)
    exclusion = ""
    if params:
        # Only "%s" placeholders are interpolated into the SQL text; the ids
        # themselves are passed as bound parameters.
        placeholders = ", ".join(["%s"] * len(params))
        exclusion = f"AND id NOT IN ({placeholders})"

    cursor.execute(f"""
        SELECT id, full_path, title
        FROM books
        WHERE type = 'pdf'
          AND id NOT IN (SELECT book_id FROM pages WHERE is_pdf_page = 1)
          {exclusion}
        LIMIT 1
    """, params)
    book = cursor.fetchone()
    # End the read transaction. With autocommit off, a transactional engine
    # would otherwise keep serving the same snapshot and the polling loop
    # would never notice books added after the first query.
    db.commit()
    return book


def _ocr_page(page_img, book_id, page_no):
    """OCR a single page image with tesseract.

    Args:
        page_img: Page image object exposing ``save(path, format)``.
        book_id: Id of the book, used in the temp file names.
        page_no: 1-based page number.

    Returns:
        The stripped recognised text ("" if tesseract produced no output
        file), or None if tesseract timed out.
    """
    img_path = TEMP_DIR / f"page_{book_id}_{page_no}.png"
    txt_base = TEMP_DIR / f"ocr_{book_id}_{page_no}"
    # tesseract appends ".txt" to the output base name.
    txt_file = txt_base.with_suffix('.txt')

    try:
        page_img.save(img_path, 'PNG')

        cmd = ['tesseract', str(img_path), str(txt_base), '-l', 'rus+eng', '--psm', '3']
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=70)
        except subprocess.TimeoutExpired:
            log(f"   ⚠️ Таймаут на странице {page_no}")
            return None

        if result.returncode != 0:
            log(f"   ⚠️ tesseract вернул код {result.returncode} на странице {page_no}")

        if not txt_file.exists():
            return ""
        with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read().strip()
    finally:
        # Always remove temp files, including when an exception propagates.
        img_path.unlink(missing_ok=True)
        txt_file.unlink(missing_ok=True)


def _process_book(book_id, pdf_path):
    """OCR every page of a PDF and queue INSERTs for pages with enough text.

    Pages whose OCR timed out or yielded fewer than 100 characters are
    skipped. Nothing is committed here; the caller commits or rolls back.

    Returns:
        Number of page rows inserted.
    """
    # NOTE(review): all pages are rendered into memory at once; for very
    # large PDFs consider converting page ranges instead.
    pages = convert_from_path(pdf_path, dpi=DPI, thread_count=2)
    total = len(pages)
    log(f"   Страниц: {total}")

    saved = 0
    for i, page_img in enumerate(pages, 1):
        log(f"   Страница {i}/{total}...")

        page_text = _ocr_page(page_img, book_id, i)
        if page_text is None or len(page_text) < 100:
            continue

        snippet = page_text[:750]
        cursor.execute("""
            INSERT INTO pages (book_id, page_number, is_pdf_page, snippet, full_content, page_or_file)
            VALUES (%s, %s, 1, %s, %s, 'page')
        """, (book_id, i, snippet, page_text))
        saved += 1
    return saved


# Books that failed or yielded no text in this run. Without this, such a book
# never gets a `pages` row and would be selected again on every iteration.
skipped_book_ids = set()

while True:
    book = _fetch_next_book(skipped_book_ids)

    if not book:
        log("✅ Все PDF-файлы обработаны.")
        time.sleep(60)
        continue

    pdf_path = book['full_path']
    book_id = book['id']
    title = book['title']

    log(f"▶ Обрабатываем: {title}")

    try:
        saved_pages = _process_book(book_id, pdf_path)
        # Commit once per book so a mid-book failure cannot leave a partially
        # indexed book that the selection query treats as finished.
        db.commit()
    except Exception as e:
        log(f"   ❌ Ошибка: {str(e)[:150]}")
        try:
            db.rollback()
        except mysql.connector.Error as rollback_error:
            log(f"   ❌ Ошибка отката: {str(rollback_error)[:150]}")
        skipped_book_ids.add(book_id)
    else:
        if saved_pages:
            log(f"   ✅ Успешно сохранено {saved_pages} страниц")
        else:
            log("   ⚠️ Текст не распознан, файл пропущен до перезапуска")
            skipped_book_ids.add(book_id)

    time.sleep(10)

