#!/usr/bin/env python3
import os
import sys
import pickle
import subprocess
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# --- CONFIGURATION ---
# Root of the folder tree that is scanned for training documents; suggested
# destinations are printed as paths below this directory.
STORAGE_DIR = Path.home().joinpath("Documents")
# Pickled map of file path -> {'mtime', 'text'} reused between training runs.
CACHE_FILE = Path.home().joinpath(".file_router_cache.pkl")
# Pickled dict holding the fitted 'vectorizer' and 'classifier'.
MODEL_FILE = Path.home().joinpath(".file_router_model.pkl")
# Lower-cased suffixes of the files that may be used as training documents.
ALLOWED_EXTENSIONS = {".txt", ".md", ".org", ".pdf"}

def extract_text(file_path):
    """Return the text content of *file_path*, or "" when none can be read.

    Plain-text formats (.txt, .md, .org) are read directly as UTF-8 with
    undecodable bytes dropped. PDFs are converted by the external
    ``pdftotext`` command, which has to be available on PATH. Any other
    suffix yields an empty string.

    Args:
        file_path: ``pathlib.Path`` of the file to read.

    Returns:
        The extracted text; "" for unsupported suffixes, unreadable files,
        a failing conversion, or a missing ``pdftotext`` executable.
    """
    suffix = file_path.suffix.lower()

    if suffix in {'.txt', '.md', '.org'}:
        try:
            return file_path.read_text(encoding='utf-8', errors='ignore')
        except (OSError, ValueError):
            # Missing/unreadable file, a directory, or an invalid path.
            return ""

    if suffix == '.pdf':
        try:
            # Request UTF-8 output and decode it as UTF-8 (dropping stray
            # bytes, like the plain-text branch) so the result does not
            # depend on the locale and can never raise UnicodeDecodeError.
            result = subprocess.run(
                ['pdftotext', '-enc', 'UTF-8', str(file_path), '-'],
                capture_output=True,
                encoding='utf-8',
                errors='ignore',
                check=True,
            )
        except (subprocess.CalledProcessError, OSError, ValueError):
            # CalledProcessError: pdftotext rejected the file.
            # OSError: pdftotext is not installed or cannot be executed.
            return ""
        return result.stdout

    return ""

def load_cache():
    """Load the file path -> {'mtime', 'text'} cache map from CACHE_FILE.

    Returns:
        The cached dict, or an empty dict when the cache file is missing,
        unreadable, corrupt, or does not contain a dict.
    """
    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # acceptable because CACHE_FILE is a private file written by this tool;
    # never point it at data from an untrusted source.
    try:
        with open(CACHE_FILE, 'rb') as f:
            cache = pickle.load(f)
    except Exception:
        # A missing file raises OSError; damaged pickle data can raise many
        # different exception types. Either way, start with an empty cache.
        return {}

    # Callers index the result as a dict, so reject any other payload.
    return cache if isinstance(cache, dict) else {}

def save_cache(cache_data):
    """Persist the cache map to CACHE_FILE on a best-effort basis.

    The data is first written to a temporary sibling file and then renamed
    over CACHE_FILE, so a failed or interrupted write never destroys the
    previous cache. Failures are reported on stderr and otherwise ignored,
    because the cache is only an optimisation.

    Args:
        cache_data: Picklable mapping of file path -> {'mtime', 'text'}.
    """
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    try:
        with open(tmp_file, 'wb') as f:
            pickle.dump(cache_data, f)
        os.replace(tmp_file, CACHE_FILE)
    except Exception as exc:
        # Warn on stderr: stdout carries the CLASSIFY_TARGET result lines.
        print(f"Warning: could not save cache to {CACHE_FILE}: {exc}",
              file=sys.stderr)
        try:
            tmp_file.unlink()
        except OSError:
            # The temporary file was never created or is already gone.
            pass

def build_training_corpus(root_dir, old_cache):
    """Collect training texts and their folder labels from *root_dir*.

    Every non-hidden directory in the tree (including *root_dir* itself)
    that directly contains non-hidden files with an allowed extension
    contributes those files. Each file is labelled with its directory's path
    relative to *root_dir*. Extracted text is reused from *old_cache* while a
    file's modification time is unchanged, and the refreshed cache is saved
    whenever it differs from *old_cache*.

    Args:
        root_dir: Directory tree to scan.
        old_cache: Mapping of file path -> {'mtime', 'text'} from an earlier
            run; anything that is not a dict is treated as empty.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. Files that yield no
        non-whitespace text are left out.
    """
    known = old_cache if isinstance(old_cache, dict) else {}
    texts = []
    labels = []
    new_cache = {}

    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk never descends into hidden directories.
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        current_path = Path(root)
        folder_label = str(current_path.relative_to(root_dir))

        for name in files:
            if name.startswith('.'):
                continue
            if Path(name).suffix.lower() not in ALLOWED_EXTENSIONS:
                continue

            file = current_path / name
            file_str = str(file)
            try:
                mtime = file.stat().st_mtime
            except OSError:
                # The file vanished or is inaccessible; skip it.
                continue

            # Trust a cached entry only when it has the expected shape, so a
            # stale or damaged cache cannot raise KeyError/TypeError here.
            entry = known.get(file_str)
            if (isinstance(entry, dict)
                    and entry.get('mtime') == mtime
                    and isinstance(entry.get('text'), str)):
                text = entry['text']
            else:
                text = extract_text(file)

            # Files without text are deliberately not cached, so they are
            # retried on the next run (e.g. once pdftotext can read them).
            if text.strip():
                texts.append(text)
                labels.append(folder_label)
                new_cache[file_str] = {'mtime': mtime, 'text': text}

    # Rewrite the cache only when its content really changed; a text-less
    # file alone no longer forces a rewrite on every run.
    if new_cache != known:
        save_cache(new_cache)

    return texts, labels

def train_model():
    """Train the folder classifier on STORAGE_DIR and persist it to MODEL_FILE.

    Builds the training corpus (reusing cached text), fits a TF-IDF
    vectorizer limited to 5000 features with English stop words removed,
    trains a multinomial naive Bayes classifier on the folder labels, and
    pickles both as ``{'vectorizer': ..., 'classifier': ...}``.

    Exits the process with status 1 when fewer than two labelled folders
    provide text, or when no vocabulary can be built from the texts.
    """
    old_cache = load_cache()
    texts, labels = build_training_corpus(STORAGE_DIR, old_cache)

    if not texts or len(set(labels)) < 2:
        print("❌ Error: Need at least 2 distinct destination folders containing files to train.")
        sys.exit(1)

    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    try:
        X_train = vectorizer.fit_transform(texts)
    except ValueError as exc:
        # Raised by scikit-learn when the documents yield an empty
        # vocabulary (for example when they contain only stop words).
        print(f"❌ Error: Could not build a vocabulary from the training files: {exc}")
        sys.exit(1)
    classifier = MultinomialNB()
    classifier.fit(X_train, labels)

    # Write to a temporary sibling and rename it into place, so a crash in
    # the middle of the write cannot leave a truncated MODEL_FILE behind.
    tmp_file = MODEL_FILE.with_name(MODEL_FILE.name + ".tmp")
    try:
        with open(tmp_file, 'wb') as f:
            pickle.dump({'vectorizer': vectorizer, 'classifier': classifier}, f)
        os.replace(tmp_file, MODEL_FILE)
    except BaseException:
        try:
            tmp_file.unlink()
        except OSError:
            # The temporary file was never created or is already gone.
            pass
        raise

def _load_model():
    """Return the persisted model dict, or None if it cannot be used."""
    # NOTE(security): pickle.load can execute arbitrary code; MODEL_FILE must
    # only ever be a file written by train_model, never untrusted data.
    try:
        with open(MODEL_FILE, 'rb') as f:
            model = pickle.load(f)
    except Exception:
        # Damaged or incompatible pickle data can raise many exception types
        # (EOFError, AttributeError, ImportError, ...).
        return None
    if isinstance(model, dict) and 'vectorizer' in model and 'classifier' in model:
        return model
    return None


def main():
    """Command-line entry point: train the model and/or rank folders for a file.

    Usage:
        file_router.py --train     rebuild the model and exit
        file_router.py FILE        print one line per known folder

    The model is also (re)built automatically when MODEL_FILE does not exist
    or cannot be loaded. Result lines have the form
    ``CLASSIFY_TARGET: <folder> | CONFIDENCE: <pct>%`` and are ordered by
    descending probability. Only the top three folders with more than 1%
    probability show their real confidence; all others are reported as 0.0%.
    When no text can be extracted from FILE, every folder is listed
    alphabetically with 0.0%.
    """
    args = sys.argv[1:]
    train_only = bool(args) and args[0] == "--train"

    # Force rebuild model if explicit train flag is sent or if no compilation exists yet
    if train_only or not MODEL_FILE.exists():
        print("🧠 Training classifier model across filesystem nodes...")
        train_model()
        print("✨ Model compiled successfully!")
        if train_only:
            return

    if not args:
        print("Usage: file_router.py [file_path] or file_router.py --train")
        return

    target_file = Path(args[0])
    if not target_file.exists():
        # Diagnostics go to stderr so stdout only carries result lines.
        print(f"Error: file not found: {target_file}", file=sys.stderr)
        return

    # Load compiled ML weights. A truncated or incompatible model file would
    # otherwise crash every run until the user retrains by hand.
    model = _load_model()
    if model is None:
        print("Warning: stored model is unreadable; retraining.", file=sys.stderr)
        train_model()
        model = _load_model()
        if model is None:
            print(f"Error: could not load model from {MODEL_FILE}", file=sys.stderr)
            sys.exit(1)

    classes = model['classifier'].classes_
    text = extract_text(target_file)

    # Fallback: if the file has no readable text, emit all folders alphabetically
    if not text.strip():
        for clazz in sorted(classes):
            print(f"CLASSIFY_TARGET: {STORAGE_DIR / clazz} | CONFIDENCE: 0.0%")
        return

    X_file = model['vectorizer'].transform([text])
    probabilities = model['classifier'].predict_proba(X_file)[0]

    # Sort ALL known filesystem directories by likelihood descending
    all_indices = np.argsort(probabilities)[::-1]

    for rank, idx in enumerate(all_indices):
        confidence = probabilities[idx] * 100
        # Everything outside the top three, or at 1% or below, is shown as 0.0%.
        shown = confidence if rank < 3 and confidence > 1.0 else 0.0
        print(f"CLASSIFY_TARGET: {STORAGE_DIR / classes[idx]} | CONFIDENCE: {shown:.1f}%")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
