/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool 
          with fuzzy matching, translation memory, keyword search, 
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2015 Thomas CORDONNIER
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.matching.external.lucene;

import java.io.Reader;
import java.io.StringReader;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Locale;
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.Collection;

import org.omegat.core.matching.external.IEntryCursor;
import org.omegat.util.OConsts;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * Lucene analyzer that splits text into lower-cased character n-grams of
 * fixed length and, when {@link #stopNgrams} is set, drops the n-grams
 * contained in that set.
 */
public class NgramAnalyzer extends Analyzer {

    /** Length (min and max) of the character n-grams produced by the tokenizer. */
    private static final int NGRAM_SIZE = 4;

    /** Name of the file written by {@link #calcstopNgrams} inside the destination directory. */
    private static final String STOP_FILE_NAME = "stop";

    /** N-grams removed from the token stream; <code>null</code> disables filtering. */
    public Set<String> stopNgrams = null;

    /**
     * Calculates the most frequent n-grams of the given entries, stores them
     * in {@link #stopNgrams} and writes them, one per line in UTF-8, to the
     * file <code>stop</code> inside <code>dest</code>.
     * <p>
     * An n-gram is kept when its total number of occurrences is greater than
     * half the number of entries read. Counting is done on the unfiltered
     * n-gram stream, so a stop list that is already set does not hide n-grams
     * from the new computation.
     * <p>
     * The method is best effort: a failure while reading entries stops the
     * counting but keeps the counts gathered so far, and a failure while
     * writing the file still leaves {@link #stopNgrams} set. Both kinds of
     * failure are reported on the standard error stream.
     *
     * @param entries cursor over the entries whose source text is analyzed
     * @param dest directory in which the <code>stop</code> file is created
     * @throws IOException declared for callers; failures are currently reported rather than thrown
     */
    public void calcstopNgrams(IEntryCursor entries, File dest) throws IOException {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        int size = 0;
        try {
            while (entries.next()) {
                size++;
                countNgrams(entries.getEntrySource(), counts);
            }
        } catch (Exception ex) {
            // Best effort: continue with the counts gathered so far, but do not hide the failure.
            ex.printStackTrace();
        }

        Set<String> frequent = new HashSet<String>();
        for (Map.Entry<String, Integer> me : counts.entrySet()) {
            if (me.getValue() > (size / 2)) {
                frequent.add(me.getKey());
            }
        }
        this.stopNgrams = frequent;

        File destFile = new File(dest, STOP_FILE_NAME);
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(destFile), OConsts.UTF8));
            for (String text : frequent) {
                bw.write(text);
                bw.newLine();
            }
        } catch (IOException ex) {
            // The in-memory stop list is already set; only persisting it failed.
            ex.printStackTrace();
        } finally {
            try {
                if (bw != null) {
                    bw.close();
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }

    /**
     * Loads the stop n-grams from a UTF-8 text file into {@link #stopNgrams}.
     * <p>
     * A line starting with <code>[idxName]</code> switches loading on, any
     * other line starting with <code>[</code> switches it off, and every
     * other line is added as a stop n-gram while loading is on. Loading is on
     * at the start of the file, so a file without any header is loaded
     * completely.
     * NOTE(review): the section semantics are inferred from the header checks
     * in this method - confirm against the code that writes sectioned files.
     * <p>
     * If the file does not exist, {@link #stopNgrams} is left unchanged. A
     * read error is reported on the standard error stream and leaves the
     * n-grams read so far in place.
     *
     * @param filename path of the stop n-gram file
     * @param idxName name of the index whose section must be loaded
     */
    public void loadstopNgrams(String filename, String idxName) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), OConsts.UTF8));

            stopNgrams = new HashSet<String>();
            String ownHeader = "[" + idxName + "]";
            // Must live outside the loop: the state has to survive from a
            // header line to the lines that follow it.
            boolean inIndex = true;
            String thisLine;
            while ((thisLine = br.readLine()) != null) {
                if (thisLine.startsWith(ownHeader)) {
                    inIndex = true;
                } else if (thisLine.startsWith("[")) {
                    inIndex = false; // start of another index
                } else if (inIndex) {
                    stopNgrams.add(thisLine);
                }
            }
        } catch (FileNotFoundException ex) {
            // No stop file: nothing to load, keep the current stop list.
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }

    /**
     * Builds the token stream: lower-cased n-grams, minus the stop n-grams
     * when {@link #stopNgrams} is not <code>null</code>.
     *
     * @param fieldName name of the analyzed field (not used)
     * @param reader source of the text to tokenize
     * @return the n-gram token stream
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = unfilteredStream(reader);
        if (stopNgrams != null) {
            stream = new StopFilter(Version.LUCENE_31, stream, stopNgrams);
        }
        return stream;
    }

    /** Builds the lower-cased n-gram stream without any stop filtering. */
    private TokenStream unfilteredStream(Reader reader) {
        return new LowerCaseFilter(new NGramTokenizer(reader, NGRAM_SIZE, NGRAM_SIZE));
    }

    /** Adds the occurrences of every n-gram of <code>text</code> to <code>counts</code>. */
    private void countNgrams(String text, Map<String, Integer> counts) throws IOException {
        TokenStream in = unfilteredStream(new StringReader(text));
        try {
            TermAttribute termAtt = (TermAttribute) in.addAttribute(TermAttribute.class);
            in.reset();
            while (in.incrementToken()) {
                String term = termAtt.term();
                Integer seen = counts.get(term);
                counts.put(term, seen == null ? 1 : seen + 1);
            }
            in.end();
        } finally {
            // Release the stream even when tokenizing fails.
            in.close();
        }
    }

}
