from pathlib import Path
import re
import logging
from typing import Dict

# Configure the root logger at INFO level as soon as this module is imported
# (basicConfig is a no-op if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by the class below.
logger = logging.getLogger(__name__)

class OldRussianDictionary:
    """Lookup table mapping modern word spellings to their old-orthography forms.

    The table is read from a two-column, tab-separated file when an instance
    is created and is applied to text by :meth:`correct_text`.
    """

    # One maximal run of word characters; for ``str`` patterns ``\w`` is
    # Unicode-aware, so Cyrillic words are matched as whole tokens.
    _WORD_RE = re.compile(r'\b\w+\b')

    def __init__(self):
        # Keys are lower-cased modern spellings, values are the replacement forms.
        self.modern_to_old: Dict[str, str] = {}
        self.load_dictionary()

    def load_dictionary(self) -> bool:
        """Read the replacement table from ``Словари/dictionary_clean.tsv``.

        The path is relative to the current working directory. The first line
        is treated as a header and skipped. Every following non-blank line is
        split on the first tab into ``modern`` and ``old``; the pair is stored
        (keyed by the lower-cased modern form) only when both parts are
        non-empty and differ from each other.

        Returns:
            True when the file was read to the end, False when the file is
            missing or reading failed. Entries stored before a failure are
            kept.
        """
        dict_path = Path("Словари/dictionary_clean.tsv")

        if not dict_path.exists():
            logger.error(f"Файл словаря не найден: {dict_path}")
            return False

        count = 0
        try:
            with open(dict_path, 'r', encoding='utf-8') as f:
                # Skip the header row; the default keeps an empty file from
                # raising StopIteration and being reported as a load error.
                next(f, None)

                for line in f:
                    line = line.strip()
                    if not line:
                        continue

                    # Split on the first tab only: modern form, then old form.
                    modern, sep, old = line.partition('\t')
                    if not sep:
                        continue

                    modern = modern.strip()
                    old = old.strip()
                    if modern and old and modern != old:
                        self.modern_to_old[modern.lower()] = old
                        count += 1

            logger.info(f"✅ Словарь успешно загружен. Записей для замены: {count}")
            return True

        except Exception as e:
            # Deliberately broad: the instance is usually built at import time,
            # so a damaged dictionary must not prevent the module from loading.
            logger.error(f"Ошибка загрузки словаря: {e}")
            return False

    def correct_text(self, text: str) -> str:
        """Apply the dictionary replacements to OCR output.

        Every whole word whose lower-cased form is a dictionary key is
        replaced by its old form. The text is rewritten in a single pass, so
        a word is never altered as a substring of a longer word and an
        inserted replacement is never replaced again.

        Args:
            text: Text to correct.

        Returns:
            The corrected text, or ``text`` unchanged when it is empty or the
            dictionary holds no entries.
        """
        if not text or not self.modern_to_old:
            return text

        return self._WORD_RE.sub(self._replace_match, text)

    def _replace_match(self, match: "re.Match") -> str:
        """Return the replacement for one matched word, preserving its case."""
        word = match.group(0)
        old_form = self.modern_to_old.get(word.lower())
        if old_form is None:
            return word

        # Fully upper-case words (longer than one letter) stay upper-case.
        if len(word) > 1 and word.isupper():
            return old_form.upper()
        # Capitalised words keep a capital first letter.
        if word[0].isupper():
            return old_form[:1].upper() + old_form[1:]
        return old_form


# Global shared instance. Constructing it calls load_dictionary(), so the
# dictionary file is read when this module is imported.
old_rus_dict = OldRussianDictionary()

if __name__ == "__main__":
    # When run as a script, report how many replacement entries are loaded.
    print(f"Словарь загружен. Записей для замены: {len(old_rus_dict.modern_to_old)}")
