/**
 * Stop Words Service
 *
 * This service handles stop words and filler words for text filtering.
 * It loads stop words from local JSON files and provides methods to:
 * - Filter out stop words from text
 * - Extract key words (important words) from transcripts
 * - Support both German and English languages
 *
 * Matches the C# implementation for compatibility.
 */

const fs = require('fs');
const path = require('path');

/**
 * Matches every character that should act as a word separator: anything that
 * is not an ASCII word character (`\w`), whitespace, a Unicode letter, or a
 * combining mark. Relying on Unicode properties keeps words such as "señor"
 * or "søren" in one piece instead of depending on a hand-picked list of
 * accented letters.
 */
const WORD_SEPARATOR_CHARS = /[^\w\s\p{L}\p{M}]/gu;

/** Matches tokens that consist of ASCII digits only. */
const DIGITS_ONLY = /^\d+$/;

/**
 * Lower-case a string and bring it into NFC form so that composed and
 * decomposed spellings of the same letter (e.g. "ö" vs. "o" + U+0308)
 * compare equal.
 *
 * @param {string} value - Raw string
 * @returns {string} Lower-cased, NFC-normalized string
 */
function normalizeCase(value) {
  return value.toLowerCase().normalize('NFC');
}

/**
 * Split text into lower-cased word tokens.
 *
 * @param {string} text - Raw text
 * @returns {string[]} Non-empty tokens in their original order
 */
function splitIntoTokens(text) {
  return normalizeCase(text)
    .replace(WORD_SEPARATOR_CHARS, ' ')
    .split(/\s+/)
    .filter((token) => token.length > 0);
}

/**
 * Return the length of the first phrase that matches `tokens` at `start`.
 *
 * @param {string[]} tokens - Token stream
 * @param {number} start - Index at which the phrase must begin
 * @param {string[][]} phrases - Candidate phrases, each a list of tokens
 * @returns {number} Number of tokens in the matching phrase, or 0 if none match
 */
function phraseLengthAt(tokens, start, phrases) {
  for (const phrase of phrases) {
    if (phrase.every((part, offset) => tokens[start + offset] === part)) {
      return phrase.length;
    }
  }
  return 0;
}

/**
 * Remove every occurrence of the given multi-token phrases from a token stream.
 *
 * @param {string[]} tokens - Token stream
 * @param {string[][]} phrases - Phrases to remove, each a list of tokens
 * @returns {string[]} Tokens that are not part of any matched phrase
 */
function dropPhrases(tokens, phrases) {
  if (phrases.length === 0) {
    return tokens;
  }

  const kept = [];
  let index = 0;
  while (index < tokens.length) {
    const matchedLength = phraseLengthAt(tokens, index, phrases);
    if (matchedLength > 0) {
      index += matchedLength;
    } else {
      kept.push(tokens[index]);
      index += 1;
    }
  }
  return kept;
}

/**
 * Tokenize text and drop everything that is not a key word: filler phrases,
 * single characters (except English "i"), digit-only tokens, stop words and
 * single-word fillers. Duplicates are kept so callers can count them.
 *
 * @param {string} text - Raw text
 * @param {string} lang - Normalized language code ('de' or 'en')
 * @param {string[]} stopWords - Stop words for `lang`
 * @param {string[]} fillerWords - Filler words/phrases for `lang`
 * @returns {string[]} Key word tokens in order of appearance
 */
function collectKeyWordTokens(text, lang, stopWords, fillerWords) {
  const stopWordSet = new Set(
    stopWords.filter((word) => typeof word === 'string').map(normalizeCase),
  );

  // Fillers such as "you know" span several tokens and can never equal a
  // single token, so they are matched as token sequences instead.
  const singleFillers = new Set();
  const fillerPhrases = [];
  for (const filler of fillerWords) {
    if (typeof filler !== 'string') {
      continue;
    }
    const parts = splitIntoTokens(filler);
    if (parts.length === 1) {
      singleFillers.add(parts[0]);
    } else if (parts.length > 1) {
      fillerPhrases.push(parts);
    }
  }
  // Try longer phrases first so a short phrase cannot shadow a longer one.
  fillerPhrases.sort((a, b) => b.length - a.length);

  // Phrases are removed before the per-token filters so that their parts are
  // still adjacent when matching.
  return dropPhrases(splitIntoTokens(text), fillerPhrases).filter(
    (token) =>
      (token.length > 1 || (lang === 'en' && token === 'i')) &&
      !DIGITS_ONLY.test(token) &&
      !stopWordSet.has(token) &&
      !singleFillers.has(token),
  );
}

/**
 * Service that filters stop words and filler words out of text and derives
 * key words from it. Supports German ('de') and English ('en').
 */
class StopWordsService {
  /**
   * Initialize the word lists and immediately try to load the stop words
   * from disk (synchronously).
   */
  constructor() {
    this.stopWords = {
      de: [],
      en: []
    };

    // Filler words - matching frontend speechConfig.js
    this.fillerWords = {
      de: ['äh', 'ähm', 'also', 'halt', 'irgendwie', 'sozusagen', 'quasi', 'eigentlich'],
      en: ['um', 'uh', 'like', 'you know', 'actually', 'basically', 'literally', 'sort of']
    };

    this.loadStopWords();
  }

  /**
   * Load stop words from the local JSON files.
   *
   * Each language is loaded independently: a missing or malformed file for
   * one language is logged and leaves that language's list untouched, but
   * does not prevent the other language from loading. Never throws.
   */
  loadStopWords() {
    const sources = [
      { lang: 'de', label: 'German', file: 'stopwords-de.json' },
      { lang: 'en', label: 'English', file: 'stopwords-en.json' }
    ];

    for (const { lang, label, file } of sources) {
      try {
        const filePath = path.join(__dirname, '../data/stopwords', file);
        if (!fs.existsSync(filePath)) {
          console.warn(`⚠️ ${label} stop words file not found`);
          continue;
        }

        const parsed = JSON.parse(fs.readFileSync(filePath, 'utf8'));
        if (!Array.isArray(parsed)) {
          throw new TypeError(`${file} must contain a JSON array of strings`);
        }

        // Non-string entries would break the lower-casing done during lookup.
        this.stopWords[lang] = parsed.filter((word) => typeof word === 'string');
        console.log(`✅ Loaded ${this.stopWords[lang].length} ${label} stop words`);
      } catch (error) {
        console.error(`❌ Error loading ${label} stop words:`, error);
      }
    }
  }

  /**
   * Get stop words for a specific language
   *
   * @param {string} language - Language code ('de' or 'en')
   * @returns {string[]} Array of stop words (the internal array, not a copy)
   */
  getStopWords(language = 'en') {
    const lang = this.normalizeLanguage(language);
    return this.stopWords[lang] || [];
  }

  /**
   * Get filler words for a specific language
   *
   * @param {string} language - Language code ('de' or 'en')
   * @returns {string[]} Array of filler words (the internal array, not a copy)
   */
  getFillerWords(language = 'en') {
    const lang = this.normalizeLanguage(language);
    return this.fillerWords[lang] || [];
  }

  /**
   * Normalize language code to 'de' or 'en'
   *
   * @param {string} language - Language code (e.g., 'de-DE', 'en-US', 'de', 'en')
   * @returns {string} 'de' if the code starts with "de" (case-insensitive,
   *   surrounding whitespace ignored); 'en' for everything else, including
   *   missing or non-string input
   */
  normalizeLanguage(language) {
    if (typeof language !== 'string') {
      return 'en';
    }
    return language.trim().toLowerCase().startsWith('de') ? 'de' : 'en';
  }

  /**
   * Extract key words from text by removing stop words and filler words
   * (including multi-word filler phrases), single characters and numbers.
   *
   * @param {string} text - The text to extract key words from
   * @param {string} language - Language code ('de' or 'en')
   * @returns {string[]} Unique lower-cased key words in order of first
   *   appearance; empty array for empty or non-string input
   */
  extractKeyWords(text, language = 'en') {
    if (!text || typeof text !== 'string') {
      return [];
    }

    const lang = this.normalizeLanguage(language);
    const keyWords = collectKeyWordTokens(
      text,
      lang,
      this.getStopWords(lang),
      this.getFillerWords(lang),
    );

    // A Set iterates in insertion order, so first-appearance order is kept.
    const uniqueWords = [...new Set(keyWords)];

    // Ignore the empty strings that leading/trailing whitespace would produce.
    const totalWords = text.split(/\s+/).filter(Boolean).length;
    console.log(`📝 Extracted ${uniqueWords.length} key words from ${totalWords} total words (${lang})`);

    return uniqueWords;
  }

  /**
   * Count occurrences of each key word in text
   *
   * @param {string} text - The text to analyze
   * @param {string} language - Language code ('de' or 'en')
   * @returns {Object} Object with word counts { word: count }; empty object
   *   for empty or non-string input
   */
  countKeyWords(text, language = 'en') {
    if (!text || typeof text !== 'string') {
      return {};
    }

    const lang = this.normalizeLanguage(language);
    const keyWords = collectKeyWordTokens(
      text,
      lang,
      this.getStopWords(lang),
      this.getFillerWords(lang),
    );

    // Tally in a Map: with a plain object, words such as "constructor" would
    // read inherited Object.prototype members instead of starting at zero.
    const tally = new Map();
    for (const word of keyWords) {
      tally.set(word, (tally.get(word) || 0) + 1);
    }

    // Object.fromEntries defines own data properties, so even "__proto__"
    // ends up as a regular key in the result.
    return Object.fromEntries(tally);
  }

  /**
   * Get top N most frequent key words
   *
   * @param {string} text - The text to analyze
   * @param {string} language - Language code ('de' or 'en')
   * @param {number} topN - Number of top words to return; negative values
   *   are treated as 0
   * @returns {Array} Array of {word, count} objects sorted by descending
   *   count; words with equal counts keep their order of first appearance
   */
  getTopKeyWords(text, language = 'en', topN = 20) {
    const wordCounts = this.countKeyWords(text, language);

    return Object.entries(wordCounts)
      .map(([word, count]) => ({ word, count }))
      .sort((a, b) => b.count - a.count)
      // Clamp so a negative topN cannot turn into "all but the last |n|".
      .slice(0, Math.max(0, topN));
  }
}

// Build the service once at module load and hand that same instance to every
// consumer that requires this file.
const stopWordsService = new StopWordsService();

module.exports = stopWordsService;