#!/opt/alt/python311/bin/python3.11
# /home/dayhanbiz/public_html/biblioteka/admin/process_pdf_tesseract.py
import os
import subprocess
import time
import shutil
from pathlib import Path
import mysql.connector
import re
from PIL import Image, ImageEnhance, ImageFilter# ================== НАСТРОЙКИ ==================
# Rasterization resolutions passed to pdftoppm, tried in this order until one
# attempt gets through the whole document.
DPI_LEVELS = [220, 200, 180, 160, 150]
PAGES_PER_BATCH = 15       # pages rendered by a single pdftoppm call
CONVERSION_TIMEOUT = 250   # seconds allowed for one pdftoppm batch
TESSERACT_TIMEOUT = 230    # seconds allowed for one tesseract page run
MIN_TEXT_PER_PAGE = 25     # pages with fewer cleaned characters are not stored

# Location of the tesseract language data, exported for the tesseract
# subprocesses started later in this script.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract/tessdata/'

BASE_DIR = Path("/home/dayhanbiz/public_html/biblioteka/admin")
LOG_DIR = BASE_DIR / "logs"
LOG_FILE = LOG_DIR / "ocr_log.txt"
TEMP_DIR = Path("/home/dayhanbiz/ocr_temp")
FAILED_DIR = Path("/home/dayhanbiz/failed_pdfs")

# Working directories are created on start-up if they are missing.
LOG_DIR.mkdir(exist_ok=True)
TEMP_DIR.mkdir(exist_ok=True)
FAILED_DIR.mkdir(exist_ok=True)

LANGUAGES = "rus"

# Tokens are maximal runs of characters that are not whitespace/punctuation.
_TOKEN_RE = re.compile(r'[^\s\.,;:!?()«»]+')
# A word is indexable when it consists only of Cyrillic/Latin letters,
# hyphens and apostrophes.
_WORD_RE = re.compile(r"^[а-яёa-z\-']+$", re.IGNORECASE | re.UNICODE)
# pdftoppm names its output "<prefix>-<page number>.jpg".
_PAGE_FILE_RE = re.compile(r'-(\d+)\.jpg$')


def log(message):
    """Print a timestamped message and append it to LOG_FILE.

    Writing to the log file is best-effort: an I/O error there is ignored so
    that logging can never interrupt OCR processing.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{timestamp}] {message}"
    print(line)
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        pass


log("=== OCR v5.6 — Только Хранилище + word_index ===")

# NOTE(review): the database credentials are hard-coded in the source file;
# consider moving them to a protected config file or environment variables.
db = mysql.connector.connect(
    host='localhost',
    user='dayhanbiz_biblioteka_index',
    password='HN_+@(ngb25642r1',
    database='dayhanbiz_biblioteka_index',
    autocommit=False
)
cursor = db.cursor(dictionary=True)


def clean_word(word):
    """Normalize a token for the word index.

    Returns the lower-cased, stripped word, or None when it is shorter than
    3 or longer than 35 characters or contains anything other than
    Cyrillic/Latin letters, hyphens and apostrophes (digits included).
    """
    word = word.lower().strip()
    if not 3 <= len(word) <= 35:
        return None
    if not _WORD_RE.match(word):
        return None
    return word


def clean_russian_text(text):
    """Return the Cyrillic words of *text* (3+ letters) joined by spaces."""
    return ' '.join(w for w in re.findall(r'[А-Яа-яЁё]+', text) if len(w) >= 3)


def preprocess_image(img_path):
    """Binarize a page image to improve OCR quality.

    The image is converted to grayscale, contrast and brightness are raised,
    a median filter is applied and the result is thresholded at 140. The
    processed copy is written next to the original as "<stem>_proc.jpg".

    Returns the path of the processed image, or *img_path* unchanged when
    preprocessing fails for any reason (OCR then runs on the raw image).
    """
    try:
        with Image.open(img_path) as src:
            img = src.convert('L')
        img = ImageEnhance.Contrast(img).enhance(2.0)
        img = ImageEnhance.Brightness(img).enhance(1.2)
        img = img.filter(ImageFilter.MedianFilter())
        img = img.point(lambda x: 0 if x < 140 else 255, '1')
        processed_path = img_path.with_name(img_path.stem + "_proc.jpg")
        img.save(processed_path, quality=95)
        return processed_path
    except Exception as e:
        # Deliberate best-effort fallback, but no longer silent.
        log(f"        Ошибка предобработки {img_path}: {e}")
        return img_path


def _cleanup_temp(book_id):
    """Remove the temporary page images and OCR outputs of one book."""
    for pattern in (f"page_{book_id}_*", f"ocr_{book_id}_*"):
        for tmp_file in TEMP_DIR.glob(pattern):
            try:
                tmp_file.unlink(missing_ok=True)
            except OSError as e:
                log(f"    Не удалось удалить {tmp_file}: {e}")


def _fetch_next_book(skip_ids):
    """Return the next unprocessed PDF book row, or None when there is none.

    A book counts as unprocessed while it has no row in `pages` with
    is_pdf_page = 1. Books whose ids are in *skip_ids* are excluded.
    """
    sql = (
        "SELECT id, full_path, title FROM books "
        "WHERE type = 'pdf' AND full_path LIKE %s "
        "AND id NOT IN (SELECT DISTINCT book_id FROM pages WHERE is_pdf_page = 1)"
    )
    params = ['%/Хранилище/%']
    if skip_ids:
        placeholders = ", ".join(["%s"] * len(skip_ids))
        sql += f" AND id NOT IN ({placeholders})"
        params.extend(skip_ids)
    sql += " LIMIT 1"

    cursor.execute(sql, tuple(params))
    book = cursor.fetchone()
    # With autocommit off the SELECT opens a transaction. End it here so the
    # next poll gets a fresh view of the tables (presumably InnoDB with the
    # default REPEATABLE READ level, where repeated reads inside one
    # transaction keep returning the same snapshot).
    db.commit()
    return book


def _count_pages(pdf_path):
    """Return the page count reported by `pdfinfo`, or 0 if it reports none."""
    result = subprocess.run(['pdfinfo', pdf_path], capture_output=True, text=True, timeout=30)
    for line in result.stdout.splitlines():
        if line.startswith("Pages:"):
            return int(line.split(":", 1)[1].strip())
    return 0


def _index_words(book_id, page_num, raw_text):
    """Queue word_index rows for every indexable word of a page.

    Each row carries up to 160 characters of context taken around the actual
    position of that occurrence. Rows are inserted with one batched
    statement; the caller is responsible for committing.

    Returns the number of rows inserted.
    """
    rows = []
    for token in _TOKEN_RE.finditer(raw_text):
        word = clean_word(token.group())
        if word is None:
            continue
        pos = token.start()
        context = raw_text[max(0, pos - 60):pos + 100]
        # NOTE(review): the page_id column receives the page number, as in
        # the original code — confirm it is not meant to be the pages row id.
        rows.append((word, book_id, page_num, context))
    if rows:
        cursor.executemany(
            "INSERT INTO word_index (word, book_id, page_id, context) VALUES (%s, %s, %s, %s)",
            rows,
        )
    return len(rows)


def _ocr_page(book_id, jpg_file, page_num):
    """OCR one page image and store its text and word index.

    The page row and its word_index rows are committed together, so a failure
    while indexing rolls the page back as well.

    Returns True when the page was saved, False when it was skipped for lack
    of text or when tesseract or the database failed.
    """
    processed_img = preprocess_image(jpg_file)
    txt_base = TEMP_DIR / f"ocr_{book_id}_{page_num}"

    try:
        subprocess.run([
            'tesseract', str(processed_img), str(txt_base), '-l', LANGUAGES, '--psm', '6'
        ], capture_output=True, timeout=TESSERACT_TIMEOUT)

        txt_file = txt_base.with_suffix('.txt')
        raw_text = txt_file.read_text(encoding='utf-8', errors='ignore').strip() if txt_file.exists() else ""
        cleaned_text = clean_russian_text(raw_text)

        log(f"       Страница {page_num}: {len(cleaned_text)} символов")

        if len(cleaned_text) < MIN_TEXT_PER_PAGE:
            log("        Пропущено (мало текста)")
            return False

        snippet = cleaned_text[:750]
        cursor.execute("""
            INSERT INTO pages (book_id, page_or_file, page_number, is_pdf_page, snippet, full_content)
            VALUES (%s, %s, %s, 1, %s, %s)
        """, (book_id, f"page_{page_num}", page_num, snippet, cleaned_text))
        word_count = _index_words(book_id, page_num, raw_text)
        db.commit()

        log(f"       → Добавлено {word_count} слов в word_index")
        log("       ✓ УСПЕШНО СОХРАНЕНО В БАЗУ")
        return True

    except Exception as e:
        log(f"        Ошибка Tesseract/INSERT: {e}")
        db.rollback()
        return False


def _process_book(book_id, pdf_path):
    """Rasterize and OCR a whole PDF, falling back through DPI_LEVELS.

    Returns a tuple (success, saved_count): *success* is True when one DPI
    attempt got through every batch, *saved_count* is the number of page
    rows stored for the book.
    """
    try:
        total_pages = _count_pages(pdf_path)
    except Exception as e:
        log(f"    Ошибка pdfinfo: {e}")
        return False, 0

    log(f"   Страниц: {total_pages}")
    if total_pages <= 0:
        # Nothing to convert: report failure instead of a false success.
        return False, 0

    # Pages already stored by an earlier (partially failed) DPI attempt must
    # not be inserted a second time when a lower DPI is tried.
    saved_pages = set()

    for dpi in DPI_LEVELS:
        log(f"   Попытка DPI = {dpi}")
        # Drop leftovers of the previous attempt (including *_proc.jpg files,
        # which would otherwise match the page glob below).
        _cleanup_temp(book_id)
        try:
            for start_page in range(1, total_pages + 1, PAGES_PER_BATCH):
                end_page = min(start_page + PAGES_PER_BATCH - 1, total_pages)
                log(f"     Пакет {start_page}-{end_page}...")

                output_pattern = str(TEMP_DIR / f"page_{book_id}_{start_page}")
                subprocess.run([
                    'pdftoppm', '-jpeg', '-r', str(dpi), '-f', str(start_page), '-l', str(end_page), pdf_path, output_pattern
                ], check=True, timeout=CONVERSION_TIMEOUT)

                for jpg_file in sorted(TEMP_DIR.glob(f"page_{book_id}_{start_page}-*.jpg")):
                    match = _PAGE_FILE_RE.search(jpg_file.name)
                    if not match:
                        # Not a pdftoppm page image (e.g. a preprocessed copy).
                        continue
                    page_num = int(match.group(1))
                    if page_num in saved_pages:
                        continue

                    log(f"       Страница {page_num}...")
                    if _ocr_page(book_id, jpg_file, page_num):
                        saved_pages.add(page_num)

            return True, len(saved_pages)

        except Exception as e:
            log(f"    Ошибка DPI={dpi}: {e}")

    return False, len(saved_pages)


def main():
    """Poll the database forever and OCR every unprocessed PDF book."""
    # Books that produced no page rows stay "unprocessed" for the SQL query;
    # remember them so the same file is not picked again on every iteration.
    skipped_ids = set()

    while True:
        log(" === ПОИСК СЛЕДУЮЩЕГО ФАЙЛА ===")

        try:
            book = _fetch_next_book(skipped_ids)
        except Exception as e:
            log(f" Ошибка SQL: {e}")
            time.sleep(10)
            continue

        if not book:
            log(" Все файлы из Хранилища обработаны. Ожидание 60 секунд...")
            time.sleep(60)
            continue

        book_id = book['id']
        pdf_path = book['full_path']
        title = book['title']

        log(f" НАЧИНАЕМ: {title} (ID: {book_id})")

        success, saved_count = _process_book(book_id, pdf_path)

        if success:
            log(f" УСПЕШНО ЗАВЕРШЁН: {title}")
        else:
            log(f" Не удалось: {title}")
            try:
                shutil.copy2(pdf_path, FAILED_DIR / os.path.basename(pdf_path))
            except OSError as e:
                # A missing/unreadable PDF must not bring the daemon down.
                log(f" Не удалось скопировать в failed_pdfs: {e}")

        if saved_count == 0:
            skipped_ids.add(book_id)

        # Cleanup of temporary files.
        _cleanup_temp(book_id)

        log(" Пауза 3 секунды и возврат к поиску...")
        time.sleep(3)


if __name__ == "__main__":
    main()
