/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool 
          with fuzzy matching, translation memory, keyword search, 
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2015 Thomas CORDONNIER
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.matching.external.lucene;

import java.util.regex.*;
import java.util.*;
import java.io.*;

import org.omegat.core.matching.external.IBrowsableMemory;
import org.omegat.core.search.TextExpression;
import org.omegat.core.data.PrepareTMXEntry;
import org.omegat.util.Language;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.NGramPhraseQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.TermQuery;

public class LuceneReader implements IBrowsableMemory {

    //public static final DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'", Locale.ENGLISH);;
    
    private final NgramAnalyzer analyzer = new NgramAnalyzer();

    private IndexReader indexReader;
    private IndexSearcher searcher;
    private String name;

    public LuceneReader (java.util.Properties prop) throws IOException {
        this (new File(prop.getProperty("dir")));
    }
    
    public LuceneReader(File indexDirectory) throws IOException {
		System.err.println("Reading Lucene : " + indexDirectory);
        if (! indexDirectory.isDirectory()) indexDirectory = indexDirectory.getParentFile();
        name = indexDirectory.getPath();
        indexReader = IndexReader.open(FSDirectory.open(indexDirectory));
        searcher = new IndexSearcher(indexReader);
        File stopFile = new File (indexDirectory, "stop");
        if (stopFile.exists())  analyzer.loadstopNgrams(stopFile.getPath());
    }
    
    
    public String getProviderName() {
        return "Lucene";
    }
    
    public String getMemoryName() {
        return name;
    }
    
    private BooleanQuery buildQuery (String text, BooleanClause.Occur occur) throws IOException {
        final BooleanQuery query = new BooleanQuery();
        TokenStream queryTokenStream = analyzer.tokenStream("src", new StringReader(text));
        TermAttribute termAtt = queryTokenStream.addAttribute(TermAttribute.class);
        queryTokenStream.reset(); 
        Set<String> terms = new HashSet<String>();
        while (queryTokenStream.incrementToken()) terms.add (termAtt.term());
        for (String current: terms) {
            Term t = new Term("src", current); 
            query.add(new TermQuery(t), occur);
        }
        queryTokenStream.end();
        queryTokenStream.close();
        return query;
    }
	
	
    public Iterable<PrepareTMXEntry> findMatchingTranslations (
        Language sLang, Language tLang, String text, 
        int minScore, final int maxCount
    ) throws Exception {
        final BooleanQuery query = buildQuery(text, BooleanClause.Occur.SHOULD);
        int minShould = query.clauses().size() * minScore / 200;
        if (minShould > 1) query.setMinimumNumberShouldMatch (minShould); // minScore as %, divided by 2.              
		return executeQuery (query, maxCount);
	}

	private Iterable<PrepareTMXEntry> executeQuery (final BooleanQuery query, final int maxCount) throws IOException {
        final TopScoreDocCollector topCollector;
        topCollector = TopScoreDocCollector.create(maxCount, true); 
        searcher.search(query, topCollector);
        final List<ScoreDoc> aList = new ArrayList<ScoreDoc>(); 
        for (ScoreDoc sdoc: topCollector.topDocs().scoreDocs) aList.add(sdoc);
		return () -> new Iterator<PrepareTMXEntry>() {
                private Iterator<ScoreDoc> sdocs = aList.iterator();
                    
                public boolean hasNext() { return sdocs.hasNext(); }
                public PrepareTMXEntry next() {
                    try {
                        Document doc = searcher.doc (sdocs.next().doc);
                        return toEntry (doc);
                    } catch (Exception cie) {
                        throw new RuntimeException (cie);
                    }
                }
            };
    }
    
    @Override
    public Iterable<PrepareTMXEntry> getEntries() throws CorruptIndexException, IOException {		
        int max1 = indexReader.maxDoc() - 1;
		while ((max1 > 0) && (indexReader.isDeleted(max1))) max1--;
		final int max = max1;
		
		return () -> new Iterator<PrepareTMXEntry>() {
                private int pos = 0;
                    
                private Document nextDoc() throws CorruptIndexException, IOException {
                    pos++; if (pos > max) return null;
                    if (indexReader.isDeleted(pos)) return nextDoc();
                    return indexReader.document(pos);
                }
                    
                public PrepareTMXEntry next() {
                    try {
                        return toEntry (nextDoc());
                    } catch (Exception cie) {
                        throw new RuntimeException (cie);
                    }
                }
                
                public boolean hasNext() { return pos < max; }
            };
    }
    
	private static final Pattern LITTERAL = Pattern.compile("(?<!\\\\)[^\\\\\\(\\)\\{\\}\\[\\]\\+\\-\\*]{4}");
	
    @Override
    public Iterable<PrepareTMXEntry> search(int numberOfResults, TextExpression searchSource, TextExpression searchTarget, TextExpression searchNotes, boolean andSearch,
        TextExpression author, // NOTE: does not take DGT-specific "translator" fields, because they would be meaningless for external providers
        long dateAfter, long dateBefore) throws CorruptIndexException, IOException {
        
        // Only the source is indexed. Since expression uses OR between source, target and notes, we can use index only on source-only searches!
        if (searchSource == null) return getEntries();
        if ((!andSearch) && (searchTarget != null)) return getEntries();
        if ((!andSearch) && (searchNotes != null)) return getEntries();
       
	    if (searchSource instanceof TextExpression.RegexTextExpression) {
			String pattern = ((TextExpression.RegexTextExpression) searchSource).getPattern().pattern();
            // In a regex, only the litteral parts can be indexed
            final BooleanQuery query = new BooleanQuery();
            Matcher m = LITTERAL.matcher (pattern);
            while (m.find()) query.add(buildQuery(m.group(), BooleanClause.Occur.MUST), BooleanClause.Occur.MUST);
            if (query.iterator().hasNext()) return executeQuery (query, numberOfResults);
		}
	   
	    if (searchSource instanceof TextExpression.WordsTextExpression) {
            final BooleanQuery query = new BooleanQuery();
            for (TextExpression.RegexTextExpression word: ((TextExpression.WordsTextExpression) searchSource).split()) {
                // OR search (should) but each word is built using MUST
				String pattern = word.getPattern().pattern();
				// In a regex, only the litteral parts can be indexed
				Matcher m = LITTERAL.matcher (pattern);
				while (m.find()) query.add(buildQuery(m.group(), BooleanClause.Occur.MUST), BooleanClause.Occur.MUST);				
            }
            if (query.iterator().hasNext()) return executeQuery (query, numberOfResults);
        }

		// No lemma search (would need a distinct index)
		
		return getEntries();
    }
    
    
    private PrepareTMXEntry toEntry (Document doc) {
        PrepareTMXEntry entry = new PrepareTMXEntry();
        entry.source = doc.get("src");
        entry.translation = doc.get("tra");
        entry.creator = doc.get("author");
        entry.changer = doc.get("changer");
        //try { entry.creationDate = dateFormat.parse(doc.get("creationDate")); } catch (Exception e1) {}
        //try { entry.changeDate = dateFormat.parse(doc.get("changeDate")); } catch (Exception e1) {}
        return entry;
    }
}